# on issdm, it's /usr/local/mpich2/bin.
# Hook for extra -I options, etc.
-EXTRA_CFLAGS = -O3 -g #-I${HOME}/include -L${HOME}/lib
+EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib
+EXTRA_CFLAGS += -g
+EXTRA_CFLAGS += -pg
+EXTRA_CFLAGS += -O3
# base
CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS}
extractosdmaps: extractosdmaps.cc common.o osd.o mon.o ebofs.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-cmon: cmon.cc mon.o msg/SimpleMessenger.o common.o
+cmon: cmon.o mon.o msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-cosd: cosd.cc osd.o ebofs.o msg/SimpleMessenger.o common.o
+cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-cmds: cmds.cc mds.o osdc.o msg/SimpleMessenger.o common.o
+cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-csyn: csyn.cc client.o osdc.o msg/SimpleMessenger.o common.o
+csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
-cfuse: cfuse.cc client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o
+cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
# fake*
-fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o
+fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
-fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o
+fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o
${CC} ${CFLAGS} ${LIBS} $^ -o $@
- per-mds, shared standby queues
-sage mds
+mds bugs
+- open file rejournaling vs capped log...
+ - open files vs shutdown in general! need to export any caps on replicated metadata
+- export caps to auth on unlinked inodes
+- stray purge on shutdown
+
+- rename slave in-memory rollback on failure
-bugs to fix
-- fix server unlink .. needs to use slave_requests to clean up any failures during the resolve stage
- fix purge_stray bug
- try_remove_unlinked_dn thing
-- emetablob playback with bad linkage.. from sloppy unlink? hmm
+
- client session open from locker.. doesn't work properly with delays
+ -> journal the session open _with_ the import(start)
- proper handling of cache expire messages during rejoin phase?
- verify once-per-segment journal context is working...
+mds
- extend/clean up filepath to allow paths relative to an ino
- fix path_traverse
- fix reconnect/rejoin open file weirdness
+- get rid of replicate objects for replicate_to .. encode to bufferlists directly
+
- stray reintegration
-- stray purge on shutdown
- - need to export stray crap to another mds..
- verify stray is empty on shutdown
+- real chdir (directory "open")
+ - relative metadata ops
+
- consistency points/snapshots
- dentry versions vs dirfrags...
-
- detect and deal with client failure
- failure during reconnect vs clientmap. although probably the whole thing needs a larger overhaul...
- inode.max_size
- inode.allocated_size
-
-- real chdir (directory "open")
- - relative metadata ops
-
+
- osd needs a set_floor_and_read op for safe failover/STONITH-like semantics.
-- EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry)
-
-- could mark dir complete in EMetaBlob by counting how many dentries are dirtied in teh current log epoch in CDir...
+- could mark dir complete in EMetaBlob by counting how many dentries are dirtied in the current log epoch in CDir...
- fix rmdir empty exported dirfrag race
- export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race.
- need to move state from replicas to auth. simplelock doesn't currently support that.
- ScatterLock or something? hrm.
-- FIXME how to journal root and stray inode content?
+- FIXME how to journal/store root and stray inode content?
- in particular, i care about dirfragtree.. get it on rejoin?
- and dir sizes, if i add that... also on rejoin?
+- efficient stat for single writers
+- lstat vs stat?
+- add FILE_CAP_EXTEND capability bit
+- only share osdmap updates with clients holding capabilities
+- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?)
+
osdmon
- allow forcefeed for more complicated rule structures. (e.g. make force_stack a list< set<int> >)
-mds
-- distributed client management
-- chdir (directory opens!)
-- rewrite logstream
- - clean up
- - be smart about rados ack vs reread
- - log locking? root log object
- - trimming, rotation
-
-- efficient stat for single writers
-- lstat vs stat
-- add FILE_CAP_EXTEND capability bit
-- only share osdmap updates with clients holding capabilities
-- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?)
-- finish hard links!
- - reclaim danglers from inode file on discover...
- - fix rename wrt hard links
-- interactive hash/unhash interface
-- test hashed readdir
-- make logstream.flush align itself to stripes
-
-- carefully define/document frozen wrt dir_auth vs hashing
-
client
-> for correct result, need to _stop_ client writers while gathering metadata.
-SAGE:
-
-- string table?
-
-- hard links
- - fix MExportAck and others to use dir+dentry, not inode
- (otherwise this all breaks with hard links.. altho it probably needs reworking already!)
-
-- do real permission checks?
-
-
-
-
-
-
-ISSUES
-
-
-- discover
- - soft: authority selectively replicates, or sets a 'forward' flag in reply
- - hard: authority always replicates (eg. discover for export)
- - forward flag (see soft)
- - error flag (if file not found, etc.)
- - [what was i talking about?] make sure waiters are properly triggered, either upon dir_rep update, or (empty!) discover reply
-
-
-
-DOCUMENT
-- cache, distributed cache structure and invariants
-- export process
-- hash/unhash process
-
-
-TEST
-- hashing
- - test hash/unhash operation
- - hash+export: encode list of replicated dir inodes so they can be discovered before import is processed.
- - test nauthitems (wrt hashing?)
-
-
-IMPLEMENT
-
-- smarter balancing
- - popularity calculation and management is inconsistent/wrong.
- - does it work?
-
- dump active config in run output somewhere
*
* update MDS location cache for a single inode
*/
-void Client::update_inode_dist(Inode *in, InodeStat *st)
+void Client::update_dir_dist(Inode *in, DirStat *dst)
{
// auth
in->dir_auth = -1;
- if (st->dirfrag_auth.size() == 1) {
- in->dir_auth = st->dirfrag_auth.begin()->second;
+ if (dst->frag == frag_t()) {
+ in->dir_auth = dst->auth;
} else {
- for (map<frag_t,int>::iterator p = st->dirfrag_auth.begin();
- p != st->dirfrag_auth.end();
- ++p) {
- dout(20) << "got dirfrag map for " << in->inode.ino << " frag " << p->first << " to mds " << p->second << dendl;
- in->fragmap[p->first] = p->second;
- }
+ dout(20) << "got dirfrag map for " << in->inode.ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
+ in->fragmap[dst->frag] = dst->auth;
}
// replicated
- in->dir_replicated = false;
- if (!st->dirfrag_rep.empty())
- in->dir_replicated = true; // FIXME
+ in->dir_replicated = dst->is_rep; // FIXME that's just one frag!
// dist
/*
dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << dendl;
list<string>::const_iterator pdn = reply->get_trace_dn().begin();
+ list<DirStat*>::const_iterator pdir = reply->get_trace_dir().begin();
for (list<InodeStat*>::const_iterator pin = reply->get_trace_in().begin();
pin != reply->get_trace_in().end();
cur = root = new Inode((*pin)->inode, objectcacher);
dout(10) << "insert_trace new root is " << root << dendl;
inode_map[root->inode.ino] = root;
+ root->dir_auth = 0;
}
} else {
// not root.
Dir *dir = cur->open_dir();
+ assert(pdn != reply->get_trace_dn().end());
cur = this->insert_inode(dir, *pin, *pdn);
dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << " -> " << cur << dendl;
++pdn;
lru.lru_touch(cur->dn);
}
- // update dist info
- update_inode_dist(cur, *pin);
-
// set cache ttl
if (g_conf.client_cache_stat_ttl) {
cur->valid_until = now;
cur->valid_until += g_conf.client_cache_stat_ttl;
}
+
+ // update dir dist info
+ if (pdir == reply->get_trace_dir().end()) break;
+ update_dir_dist(cur, *pdir);
+ ++pdir;
}
return cur;
mount_cond.Signal(); // mount might be waiting for this.
}
+ if (m->get_epoch() < mdsmap->get_epoch()) {
+ dout(1) << "handle_mds_map epoch " << m->get_epoch() << " is older than our "
+ << mdsmap->get_epoch() << dendl;
+ delete m;
+ return;
+ }
+
dout(1) << "handle_mds_map epoch " << m->get_epoch() << dendl;
- epoch_t was = mdsmap->get_epoch();
mdsmap->decode(m->get_encoded());
- assert(mdsmap->get_epoch() >= was);
// send reconnect?
if (frommds >= 0 &&
if (!dirfragtree.empty()) {
__gnu_cxx::hash<string> H;
frag_t fg = dirfragtree[H(dname)];
- if (fragmap.count(fg) &&
- fragmap[fg] >= 0) {
- //cout << "picked frag ino " << inode.ino << " dname " << dname << " fg " << fg << " mds" << fragmap[fg] << std::endl;
- return fragmap[fg];
+ while (fg != frag_t()) {
+ if (fragmap.count(fg) &&
+ fragmap[fg] >= 0) {
+ //cout << "picked frag ino " << inode.ino << " dname " << dname << " fg " << fg << " mds" << fragmap[fg] << std::endl;
+ return fragmap[fg];
+ }
+ fg = frag_t(fg.value(), fg.bits()-1); // try more general...
}
}
return authority();
// metadata cache
Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn);
- void update_inode_dist(Inode *in, InodeStat *st);
+ void update_dir_dist(Inode *in, DirStat *st);
Inode* insert_trace(MClientReply *reply);
// ----------------------
run_until = utime_t(0,0);
dout(5) << "run" << dendl;
+ int seq = 0;
+
for (list<int>::iterator it = modes.begin();
it != modes.end();
it++) {
break;
case SYNCLIENT_MODE_MAKEDIRS:
{
- string sarg1 = get_sarg(0);
+ string sarg1 = get_sarg(seq++);
int iarg1 = iargs.front(); iargs.pop_front();
int iarg2 = iargs.front(); iargs.pop_front();
int iarg3 = iargs.front(); iargs.pop_front();
}
} else {
// shared
- if (whoami == 0) {
+ if (true || whoami == 0) {
for (int c=0; c<count; c++) {
sprintf(d,"dir.%d.run%d", 0, c);
client->mkdir(d, 0755);
}
} else {
- sleep(5);
+ sleep(2);
}
}
mds->mds_lock.Unlock();
// done
- delete mds;
+ //delete mds;
return 0;
}
} else {
if (fvals[i] > 0 && vals[i] == 0)
out << "\t" << fvals[i];
- else
+ else {
+ //cout << this << " p " << i << " and size is " << vals.size() << std::endl;
out << "\t" << vals[i];
+ }
}
}
out << std::endl;
if (i < 0) i = type->add_set(key);
maybe_resize(i+1);
+ //cout << this << " set " << i << " to " << v << std::endl;
long r = vals[i] = v;
logger_lock.Unlock();
return r;
if (i < 0) i = type->add_set(key);
maybe_resize(i+1);
+ //cout << this << " fset " << i << " to " << v << std::endl;
double r = fvals[i] = v;
logger_lock.Unlock();
return r;
vector< vector<double> > vals_to_avg;
void maybe_resize(unsigned s) {
- if (s >= vals.size()) {
- vals.resize(s);
- fvals.resize(s);
- vals_to_avg.resize(s);
+ while (s >= vals.size()) {
+ vals.push_back(0);
+ fvals.push_back(0.0);
+ vals_to_avg.push_back(vector<double>());
}
}
debug_mds: 1,
debug_mds_balancer: 1,
debug_mds_log: 1,
+ debug_mds_log_expire: 1,
debug_mds_migrator: 1,
debug_buffer: 0,
debug_timer: 0,
journaler_batch_max: 16384, // max bytes we'll delay flushing
// --- mds ---
- mds_cache_size: MDS_CACHE_SIZE,
+ mds_cache_size: 300000, //MDS_CACHE_SIZE,
mds_cache_mid: .7,
mds_decay_halflife: 5,
mds_beacon_grace: 15, //60*60.0,
mds_log: true,
- mds_log_max_len: MDS_CACHE_SIZE / 3,
- mds_log_max_trimming: 10000,
- mds_log_read_inc: 1<<20,
+ mds_log_max_events: -1, //MDS_CACHE_SIZE / 3,
+ mds_log_max_segments: 100,
+ mds_log_max_expiring: 20,
mds_log_pad_entry: 128,//256,//64,
- mds_log_flush_on_shutdown: true,
mds_log_eopen_size: 100, // # open inodes per log entry
mds_bal_sample_interval: 3.0, // every 5 seconds
mds_bal_merge_rd: 1000,
mds_bal_merge_wr: 1000,
mds_bal_interval: 10, // seconds
- mds_bal_fragment_interval: 5, // seconds
+ mds_bal_fragment_interval: 2, // seconds
mds_bal_idle_threshold: 0, //.1,
mds_bal_max: -1,
mds_bal_max_until: -1,
mds_bal_minchunk: .001, // never take anything smaller than this
mds_trim_on_rejoin: true,
- mds_commit_on_shutdown: true,
mds_shutdown_check: 0, //30,
mds_verify_export_dirauth: true,
g_conf.debug_mds_log = atoi(args[++i]);
else
g_debug_after_conf.debug_mds_log = atoi(args[++i]);
+ else if (strcmp(args[i], "--debug_mds_log_expire") == 0)
+ if (!g_conf.debug_after)
+ g_conf.debug_mds_log_expire = atoi(args[++i]);
+ else
+ g_debug_after_conf.debug_mds_log_expire = atoi(args[++i]);
else if (strcmp(args[i], "--debug_mds_migrator") == 0)
if (!g_conf.debug_after)
g_conf.debug_mds_migrator = atoi(args[++i]);
else if (strcmp(args[i], "--mds_log") == 0)
g_conf.mds_log = atoi(args[++i]);
- else if (strcmp(args[i], "--mds_log_max_len") == 0)
- g_conf.mds_log_max_len = atoi(args[++i]);
- else if (strcmp(args[i], "--mds_log_read_inc") == 0)
- g_conf.mds_log_read_inc = atoi(args[++i]);
- else if (strcmp(args[i], "--mds_log_max_trimming") == 0)
- g_conf.mds_log_max_trimming = atoi(args[++i]);
-
- else if (strcmp(args[i], "--mds_commit_on_shutdown") == 0)
- g_conf.mds_commit_on_shutdown = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_log_max_events") == 0)
+ g_conf.mds_log_max_events = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_log_max_segments") == 0)
+ g_conf.mds_log_max_segments = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_log_max_expiring") == 0)
+ g_conf.mds_log_max_expiring = atoi(args[++i]);
+
else if (strcmp(args[i], "--mds_shutdown_check") == 0)
g_conf.mds_shutdown_check = atoi(args[++i]);
- else if (strcmp(args[i], "--mds_log_flush_on_shutdown") == 0)
- g_conf.mds_log_flush_on_shutdown = atoi(args[++i]);
else if (strcmp(args[i], "--mds_decay_halflife") == 0)
g_conf.mds_decay_halflife = atoi(args[++i]);
int debug_mds;
int debug_mds_balancer;
int debug_mds_log;
+ int debug_mds_log_expire;
int debug_mds_migrator;
int debug_buffer;
int debug_timer;
float mds_beacon_grace;
bool mds_log;
- int mds_log_max_len;
- int mds_log_max_trimming;
- int mds_log_read_inc;
+ int mds_log_max_events;
+ int mds_log_max_segments;
+ int mds_log_max_expiring;
int mds_log_pad_entry;
- bool mds_log_flush_on_shutdown;
int mds_log_eopen_size;
float mds_bal_sample_interval;
float mds_bal_minchunk;
bool mds_trim_on_rejoin;
- bool mds_commit_on_shutdown;
int mds_shutdown_check;
bool mds_verify_export_dirauth; // debug flag
// allowed? (not already submitted to kernel?)
if (block_lock.intersects(bio->start, bio->length)) {
- // dout(20) << "dequeue_io " << bio->start << "~" << bio->length
- // << " intersects block_lock " << block_lock << dendl;
+ dout(20) << "dequeue_io " << bio->start << "~" << bio->length
+ << " intersects block_lock " << block_lock << dendl;
break; // stop, or go with what we've got so far
}
// add to biols
int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist?
- if (num_iovs + nv >= IOV_MAX) break; // to many //g_conf.bdev_iov_max) break; // too many!
+ if (num_bio &&
+ num_iovs + nv >= IOV_MAX) break; // to many //g_conf.bdev_iov_max) break; // too many!
num_iovs += nv;
start = MIN(start, bio->start);
length += bio->length;
if (el_dir_forward) {
- //dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << dendl;
+ dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << dendl;
biols.push_back(bio); // add at back
} else {
- // dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << dendl;
+ dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << dendl;
biols.push_front(bio); // add at front
}
num_bio++;
left -= iov[n].iov_len;
n++;
- if (left == 0) break;
+ if (left == 0 ||
+ n == IOV_MAX) break;
}
int r = ::writev(fd, iov, n);
// put in a buffer
bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+ bp.zero();
memcpy(bp.c_str(), (const char*)&sb, sizeof(sb));
}
* C_Contexts - set of Contexts
*/
class C_Contexts : public Context {
- std::list<Context*> clist;
-
public:
+ std::list<Context*> contexts;
+
void add(Context* c) {
- clist.push_back(c);
+ contexts.push_back(c);
}
void take(std::list<Context*>& ls) {
- clist.splice(clist.end(), ls);
+ contexts.splice(contexts.end(), ls);
}
void finish(int r) {
- finish_contexts(clist, r);
+ finish_contexts(contexts, r);
}
};
unsigned get_off() { return off; }
+ bool end() {
+ return p == ls.end();
+ }
+
void advance(unsigned o) {
//cout << this << " advance " << o << " from " << off << " (p_off " << p_off << " in " << p->length() << ")" << std::endl;
p_off += o;
unsigned gap = append_buffer.unused_tail_length();
if (gap > 0) {
if (gap > len) gap = len;
+ //cout << "append first char is " << data[0] << ", last char is " << data[len-1] << std::endl;
append_buffer.append(data, gap);
append(append_buffer, append_buffer.end() - gap, gap); // add segment to the list
len -= gap;
// ----------------------------------------------------------
-// new encoders
+// encoders
+
+// DEPRECATED, please use _(en|de)code_(simple|complex)
// raw
template<class T>
#include <string>
#include <ext/hash_map>
+
+// ==================================================================
+// simple
+
+
+// raw
+template<class T>
+inline void _encode_raw(const T& t, bufferlist& bl)
+{
+ bl.append((char*)&t, sizeof(t));
+}
+template<class T>
+inline void _decode_raw(T& t, bufferlist::iterator &p)
+{
+ p.copy(sizeof(t), (char*)&t);
+}
+
+#include <set>
+#include <map>
+#include <deque>
+#include <vector>
+#include <string>
+#include <ext/hash_map>
+
+// list
+template<class T>
+inline void _encode_simple(const std::list<T>& ls, bufferlist& bl)
+{
+ // should i pre- or post- count?
+ if (!ls.empty()) {
+ unsigned pos = bl.length();
+ uint32_t n = 0;
+ _encode_raw(n, bl);
+ for (typename std::list<T>::const_iterator p = ls.begin(); p != ls.end(); ++p) {
+ n++;
+ _encode_simple(*p, bl);
+ }
+ bl.copy_in(pos, sizeof(n), (char*)&n);
+ } else {
+ uint32_t n = ls.size(); // FIXME: this is slow on a list.
+ _encode_raw(n, bl);
+ for (typename std::list<T>::const_iterator p = ls.begin(); p != ls.end(); ++p)
+ _encode_simple(*p, bl);
+ }
+}
+template<class T>
+inline void _decode_simple(std::list<T>& ls, bufferlist::iterator& p)
+{
+ uint32_t n;
+ _decode_raw(n, p);
+ ls.clear();
+ while (n--) {
+ T v;
+ _decode_simple(v, p);
+ ls.push_back(v);
+ }
+}
+
+// deque
+template<class T>
+inline void _encode_simple(const std::deque<T>& ls, bufferlist& bl)
+{
+ uint32_t n = ls.size();
+ _encode_raw(n, bl);
+ for (typename std::deque<T>::const_iterator p = ls.begin(); p != ls.end(); ++p)
+ _encode_simple(*p, bl);
+}
+template<class T>
+inline void _decode_simple(std::deque<T>& ls, bufferlist::iterator& p)
+{
+ uint32_t n;
+ _decode_raw(n, p);
+ ls.clear();
+ while (n--) {
+ T v;
+ _decode_simple(v, p);
+ ls.push_back(v);
+ }
+}
+
+// set
+template<class T>
+inline void _encode_simple(const std::set<T>& s, bufferlist& bl)
+{
+ uint32_t n = s.size();
+ _encode_raw(n, bl);
+ for (typename std::set<T>::const_iterator p = s.begin(); p != s.end(); ++p)
+ _encode_simple(*p, bl);
+}
+template<class T>
+inline void _decode_simple(std::set<T>& s, bufferlist::iterator& p)
+{
+ uint32_t n;
+ _decode_raw(n, p);
+ s.clear();
+ while (n--) {
+ T v;
+ _decode_simple(v, p);
+ s.insert(v);
+ }
+}
+
+// vector
+template<class T>
+inline void _encode_simple(const std::vector<T>& v, bufferlist& bl)
+{
+ uint32_t n = v.size();
+ _encode_raw(n, bl);
+ for (typename std::vector<T>::const_iterator p = v.begin(); p != v.end(); ++p)
+ _encode_simple(*p, bl);
+}
+template<class T>
+inline void _decode_simple(std::vector<T>& v, bufferlist::iterator& p)
+{
+ uint32_t n;
+ _decode_raw(n, p);
+ v.resize(n);
+ for (uint32_t i=0; i<n; i++)
+ _decode_simple(v[i], p);
+}
+
+// map
+template<class T, class U>
+inline void _encode_simple(const std::map<T,U>& m, bufferlist& bl)
+{
+ uint32_t n = m.size();
+ _encode_raw(n, bl);
+ for (typename std::map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ _encode_simple(p->first, bl);
+ _encode_simple(p->second, bl);
+ }
+}
+template<class T, class U>
+inline void _decode_simple(std::map<T,U>& m, bufferlist::iterator& p)
+{
+ uint32_t n;
+ _decode_raw(n, p);
+ m.clear();
+ while (n--) {
+ T k;
+ _decode_simple(k, p);
+ _decode_simple(m[k], p);
+ }
+}
+
+// hash_map
+template<class T, class U>
+inline void _encode_simple(const __gnu_cxx::hash_map<T,U>& m, bufferlist& bl)
+{
+ uint32_t n = m.size();
+ _encode_raw(n, bl);
+ for (typename __gnu_cxx::hash_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ _encode_simple(p->first, bl);
+ _encode_simple(p->second, bl);
+ }
+}
+template<class T, class U>
+inline void _decode_simple(__gnu_cxx::hash_map<T,U>& m, bufferlist::iterator& p)
+{
+ uint32_t n;
+ _decode_raw(n, p);
+ m.clear();
+ while (n--) {
+ T k;
+ _decode_simple(k, p);
+ _decode_simple(m[k], p);
+ }
+}
+
+// string
+inline void _encode_simple(const std::string& s, bufferlist& bl)
+{
+ uint32_t len = s.length();
+ _encode_raw(len, bl);
+ bl.append(s.data(), len);
+}
+inline void _decode_simple(std::string& s, bufferlist::iterator& p)
+{
+ uint32_t len;
+ _decode_raw(len, p);
+ s.clear();
+ p.copy(len, s);
+}
+
+// const char* (encode only, string compatible)
+inline void _encode_simple(const char *s, bufferlist& bl)
+{
+ uint32_t len = strlen(s);
+ _encode_raw(len, bl);
+ bl.append(s, len);
+}
+
+// bufferptr (encapsulated)
+inline void _encode_simple(const buffer::ptr& bp, bufferlist& bl)
+{
+ uint32_t len = bp.length();
+ _encode_raw(len, bl);
+ bl.append(bp);
+}
+inline void _decode_simple(buffer::ptr& bp, bufferlist::iterator& p)
+{
+ uint32_t len;
+ _decode_raw(len, p);
+
+ bufferlist s;
+ p.copy(len, s);
+
+ if (s.buffers().size() == 1)
+ bp = s.buffers().front();
+ else
+ bp = buffer::copy(s.c_str(), s.length());
+}
+
+// bufferlist (encapsulated)
+inline void _encode_simple(const bufferlist& s, bufferlist& bl)
+{
+ uint32_t len = s.length();
+ _encode_raw(len, bl);
+ bl.append(s);
+}
+inline void _encode_simple_destructively(bufferlist& s, bufferlist& bl)
+{
+ uint32_t len = s.length();
+ _encode_raw(len, bl);
+ bl.claim_append(s);
+}
+inline void _decode_simple(bufferlist& s, bufferlist::iterator& p)
+{
+ uint32_t len;
+ _decode_raw(len, p);
+ s.clear();
+ p.copy(len, s);
+}
+
+// base
+template<class T>
+inline void _encode_simple(const T& t, bufferlist& bl)
+{
+ _encode_raw(t, bl);
+}
+template<class T>
+inline void _decode_simple(T& t, bufferlist::iterator& p)
+{
+ _decode_raw(t, p);
+}
+
+
+
+
+// ==================================================================
+// complex
+
// list
template<class T>
inline void _encode_complex(const std::list<T>& ls, bufferlist& bl)
{
uint32_t n = ls.size();
- _encoderaw(n, bl);
+ _encode_raw(n, bl);
for (typename std::list<T>::const_iterator p = ls.begin(); p != ls.end(); ++p)
_encode_complex(*p, bl);
}
template<class T>
-inline void _decode_complex(std::list<T>& ls, bufferlist& bl, int& off)
+inline void _decode_complex(std::list<T>& ls, bufferlist::iterator& p)
{
uint32_t n;
- _decoderaw(n, bl, off);
+ _decode_raw(n, p);
ls.clear();
while (n--) {
T v;
- _decode_complex(v, bl, off);
+ _decode_complex(v, p);
ls.push_back(v);
}
}
inline void _encode_complex(const std::deque<T>& ls, bufferlist& bl)
{
uint32_t n = ls.size();
- _encoderaw(n, bl);
+ _encode_raw(n, bl);
for (typename std::deque<T>::const_iterator p = ls.begin(); p != ls.end(); ++p)
_encode_complex(*p, bl);
}
template<class T>
-inline void _decode_complex(std::deque<T>& ls, bufferlist& bl, int& off)
+inline void _decode_complex(std::deque<T>& ls, bufferlist::iterator& p)
{
uint32_t n;
- _decoderaw(n, bl, off);
+ _decode_raw(n, p);
ls.clear();
while (n--) {
T v;
- _decode_complex(v, bl, off);
+ _decode_complex(v, p);
ls.push_back(v);
}
}
inline void _encode_complex(const std::set<T>& s, bufferlist& bl)
{
uint32_t n = s.size();
- _encoderaw(n, bl);
+ _encode_raw(n, bl);
for (typename std::set<T>::const_iterator p = s.begin(); p != s.end(); ++p)
_encode_complex(*p, bl);
}
template<class T>
-inline void _decode_complex(std::set<T>& s, bufferlist& bl, int& off)
+inline void _decode_complex(std::set<T>& s, bufferlist::iterator& p)
{
uint32_t n;
- _decoderaw(n, bl, off);
+ _decode_raw(n, p);
s.clear();
while (n--) {
T v;
- _decode_complex(v, bl, off);
+ _decode_complex(v, p);
s.insert(v);
}
}
inline void _encode_complex(const std::vector<T>& v, bufferlist& bl)
{
uint32_t n = v.size();
- _encoderaw(n, bl);
+ _encode_raw(n, bl);
for (typename std::vector<T>::const_iterator p = v.begin(); p != v.end(); ++p)
_encode_complex(*p, bl);
}
template<class T>
-inline void _decode_complex(std::vector<T>& v, bufferlist& bl, int& off)
+inline void _decode_complex(std::vector<T>& v, bufferlist::iterator& p)
{
uint32_t n;
- _decoderaw(n, bl, off);
+ _decode_raw(n, p);
v.resize(n);
for (uint32_t i=0; i<n; i++)
- _decode_complex(v[i], bl, off);
+ _decode_complex(v[i], p);
}
// map
inline void _encode_complex(const std::map<T,U>& m, bufferlist& bl)
{
uint32_t n = m.size();
- _encoderaw(n, bl);
+ _encode_raw(n, bl);
for (typename std::map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
- _encode(p->first, bl);
+ _encode_simple(p->first, bl);
_encode_complex(p->second, bl);
}
}
template<class T, class U>
-inline void _decode_complex(std::map<T,U>& m, bufferlist& bl, int& off)
+inline void _decode_complex(std::map<T,U>& m, bufferlist::iterator& p)
{
uint32_t n;
- _decoderaw(n, bl, off);
+ _decode_raw(n, p);
m.clear();
while (n--) {
T k;
- _decode(k, bl, off);
- _decode_complex(m[k], bl, off);
+ _decode_simple(k, p);
+ _decode_complex(m[k], p);
}
}
inline void _encode_complex(const __gnu_cxx::hash_map<T,U>& m, bufferlist& bl)
{
uint32_t n = m.size();
- _encoderaw(n, bl);
+ _encode_raw(n, bl);
for (typename __gnu_cxx::hash_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
- _encode(p->first, bl);
+ _encode_simple(p->first, bl);
_encode_complex(p->second, bl);
}
}
template<class T, class U>
-inline void _decode_complex(__gnu_cxx::hash_map<T,U>& m, bufferlist& bl, int& off)
+inline void _decode_complex(__gnu_cxx::hash_map<T,U>& m, bufferlist::iterator& p)
{
uint32_t n;
- _decoderaw(n, bl, off);
+ _decode_raw(n, p);
m.clear();
while (n--) {
T k;
- _decode(k, bl, off);
- _decode_complex(m[k], bl, off);
+ _decode_simple(k, p);
+ _decode_complex(m[k], p);
}
}
t._encode(bl);
}
template<class T>
-inline void _decode_complex(T& t, bufferlist& bl, int& off)
+inline void _decode_complex(T& t, bufferlist::iterator& p)
{
- t._decode(bl, off);
+ t._decode(p);
}
#endif
#include <list>
#include <iostream>
#include "buffer.h"
+#include "encodable.h"
/*
*
// pairs <f, b>:
// frag_t f is split by b bits.
// if child frag_t does not appear, it is not split.
- std::map<frag_t,__int32_t> _splits;
+ std::map<frag_t,int32_t> _splits;
public:
// -------------
}
// encoding
- void _encode(bufferlist& bl) {
+ void _encode(bufferlist& bl) const {
::_encode(_splits, bl);
}
void _decode(bufferlist& bl, int& off) {
::_decode(_splits, bl, off);
}
+ void _decode(bufferlist::iterator& p) {
+ ::_decode_simple(_splits, p);
+ }
void print(std::ostream& out) {
out << "fragtree_t(";
return true;
}
+ void lru_touch_entire_pintail() {
+ // promote entire pintail to the top lru
+ while (lru_pintail.get_length() > 0) {
+ LRUObject *o = lru_pintail.get_head();
+ lru_pintail.remove(o);
+ lru_top.insert_tail(o);
+ }
+ }
+
// expire -- expire a single item
LRUObject *lru_get_next_expire() {
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __BUFFER_H
-#define __BUFFER_H
-
-#include <cassert>
-#include <string.h>
-
-#include <iostream>
-using namespace std;
-
-// bit masks
-#define BUFFER_MODE_NOCOPY 0
-#define BUFFER_MODE_COPY 1 // copy on create, my buffer
-
-#define BUFFER_MODE_NOFREE 0
-#define BUFFER_MODE_FREE 2
-
-#define BUFFER_MODE_CUSTOMFREE 4
-
-#define BUFFER_MODE_DEFAULT 3//(BUFFER_MODE_COPY|BUFFER_MODE_FREE)
-
-
-// debug crap
-#include "config.h"
-#define bdbout(x) if (x <= g_conf.debug_buffer) cout
-
-#include "common/Mutex.h"
-
-// HACK: in config.cc
-/*
- * WARNING: bufferlock placements are tricky for efficiency. note that only bufferptr and
- * buffer ever use buffer._ref, and only bufferptr should call ~buffer().
- *
- * So, I only need to protect:
- * - buffer()'s modification of buffer_total_alloc
- * - ~bufferptr() check of buffer._ref, and ~buffer's mod of buffer_total_alloc
- *
- * I don't protect
- * - buffer._get() .. increment is atomic on any sane architecture
- * - buffer._put() .. only called by ~bufferptr.
- * - ~buffer .. only called by ~bufferptr *** I HOPE!!
- */
-extern Mutex bufferlock;
-extern long buffer_total_alloc;
-
-
-typedef void (buffer_free_func_t)(void*,char*,unsigned);
-
-
-/*
- * buffer - the underlying buffer container. with a reference count.
- *
- * the buffer never shrinks.
- *
- * some invariants:
- * _len never shrinks
- * _len <= _alloc_len
- */
-class buffer {
- protected:
- //wtf
- //static Mutex bufferlock;
- //static long buffer_total_alloc;// = 0;
-
- private:
- // raw buffer alloc
- char *_dataptr;
- bool _myptr;
- unsigned _len;
- unsigned _alloc_len;
-
- // ref counts
- unsigned _ref;
- int _get() {
- bdbout(1) << "buffer.get " << *this << " get " << _ref+1 << endl;
- return ++_ref;
- }
- int _put() {
- bdbout(1) << "buffer.put " << *this << " put " << _ref-1 << endl;
- assert(_ref > 0);
- return --_ref;
- }
-
- // custom (de!)allocator
- buffer_free_func_t *free_func;
- void *free_func_arg;
-
- friend class bufferptr;
-
- public:
- // constructors
- buffer() : _dataptr(0), _myptr(true), _len(0), _alloc_len(0), _ref(0), free_func(0), free_func_arg(0) {
- bdbout(1) << "buffer.cons " << *this << endl;
- }
- buffer(unsigned a) : _dataptr(0), _myptr(true), _len(a), _alloc_len(a), _ref(0), free_func(0), free_func_arg(0) {
- bdbout(1) << "buffer.cons " << *this << endl;
- _dataptr = new char[a];
- bufferlock.Lock();
- buffer_total_alloc += _alloc_len;
- bufferlock.Unlock();
- bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl;
- }
- ~buffer() {
- bdbout(1) << "buffer.des " << *this << " " << (void*)free_func << endl;
- if (free_func) {
- bdbout(1) << "buffer.custom_free_func " << free_func_arg << " " << (void*)_dataptr << endl;
- free_func( free_func_arg, _dataptr, _alloc_len );
- }
- else if (_dataptr && _myptr) {
- bdbout(1) << "buffer.free " << (void*)_dataptr << endl;
- delete[] _dataptr;
- buffer_total_alloc -= _alloc_len;
- }
- }
-
- buffer(const char *p, int l, int mode=BUFFER_MODE_DEFAULT, int alloc_len=0,
- buffer_free_func_t free_func=0, void* free_func_arg=0) :
- _dataptr(0),
- _myptr(false),
- _len(l),
- _ref(0),
- free_func(0), free_func_arg(0) {
-
- if (alloc_len)
- _alloc_len = alloc_len;
- else
- _alloc_len = l;
-
- _myptr = mode & BUFFER_MODE_FREE ? true:false;
- bdbout(1) << "buffer.cons " << *this << " mode = " << mode << ", myptr=" << _myptr << endl;
- if (mode & BUFFER_MODE_COPY) {
- _dataptr = new char[_alloc_len];
- bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl;
- bufferlock.Lock();
- buffer_total_alloc += _alloc_len;
- bufferlock.Unlock();
- memcpy(_dataptr, p, l);
- bdbout(1) << "buffer.copy " << *this << endl;
- } else {
- _dataptr = (char*)p; // ugly
- bdbout(1) << "buffer.claim " << *this << " myptr=" << _myptr << endl;
- }
-
- if (mode & BUFFER_MODE_CUSTOMFREE && free_func) {
- this->free_func = free_func;
- this->free_func_arg = free_func_arg;
- }
- }
-
- // operators
- buffer& operator=(buffer& other) {
- assert(0); // not implemented, no reasonable assignment semantics.
- return *this;
- }
-
- char *c_str() {
- return _dataptr;
- }
-
- bool has_free_func() { return free_func != 0; }
-
- // accessor
- unsigned alloc_length() {
- return _alloc_len;
- }
- void set_length(unsigned l) {
- assert(l <= _alloc_len);
- _len = l;
- }
- unsigned length() { return _len; }
- unsigned unused_tail_length() { return _alloc_len - _len; }
-
- friend ostream& operator<<(ostream& out, buffer& b);
-};
-
-inline ostream& operator<<(ostream& out, buffer& b) {
- return out << "buffer(this=" << &b << " len=" << b._len << ", alloc=" << b._alloc_len << ", data=" << (void*)b._dataptr << " ref=" << b._ref << ")";
-}
-
-
-/*
- * smart pointer class for buffer
- *
- * we reference count the actual buffer.
- * we also let you refer to a subset of a buffer.
- * we implement the high-level buffer accessor methods.
- *
- * some invariants:
- * _off < _buffer->_len
- * _off + _len <= _buffer->_len
- */
-class bufferptr {
- private:
- buffer *_buffer;
- unsigned _len, _off;
-
- public:
- // empty cons
- bufferptr() :
- _buffer(0),
- _len(0),
- _off(0) { }
- // main cons - the entire buffer
- bufferptr(buffer *b) :
- _buffer(b),
- _len(b->_len),
- _off(0) {
- assert(_buffer->_ref == 0);
- _buffer->_get(); // this is always the first one.
- }
- // subset cons - a subset of another bufferptr (subset)
- bufferptr(const bufferptr& bp, unsigned len, unsigned off) {
- bufferlock.Lock();
- _buffer = bp._buffer;
- _len = len;
- _off = bp._off + off;
- _buffer->_get();
- assert(_off < _buffer->_len); // sanity checks
- assert(_off + _len <= _buffer->_len);
- bufferlock.Unlock();
- }
-
- // copy cons
- bufferptr(const bufferptr &other) {
- bufferlock.Lock();
- _buffer = other._buffer;
- _len = other._len;
- _off = other._off;
- if (_buffer) _buffer->_get();
- bufferlock.Unlock();
- }
-
- // assignment operator
- bufferptr& operator=(const bufferptr& other) {
- //assert(0);
- // discard old
- discard_buffer();
-
- // point to other
- bufferlock.Lock();
- _buffer = other._buffer;
- _len = other._len;
- _off = other._off;
- if (_buffer) _buffer->_get();
- bufferlock.Unlock();
- return *this;
- }
-
- ~bufferptr() {
- discard_buffer();
- }
-
- void discard_buffer() {
- if (_buffer) {
- bufferlock.Lock();
- if (_buffer->_put() == 0)
- delete _buffer;
- _buffer = 0;
- bufferlock.Unlock();
- }
- }
-
-
- // dereference to get the actual buffer
- buffer& operator*() {
- return *_buffer;
- }
-
-
- bool at_buffer_head() const {
- return _off == 0;
- }
- bool at_buffer_tail() const {
- return _off + _len == _buffer->_len;
- }
-
- // accessors for my subset
- char *c_str() {
- return _buffer->c_str() + _off;
- }
- unsigned length() const {
- return _len;
- }
- unsigned offset() const {
- return _off;
- }
- unsigned unused_tail_length() {
- if (!at_buffer_tail()) return 0;
- return _buffer->unused_tail_length();
- }
-
-
-
- // modifiers
- void set_offset(unsigned off) {
- assert(off <= _buffer->_alloc_len);
- _off = off;
- }
- void set_length(unsigned len) {
- assert(len >= 0 && _off + len <= _buffer->_alloc_len);
- if (_buffer->_len < _off + len)
- _buffer->_len = _off + len; // set new buffer len (_IF_ i'm expanding it)
- _len = len; // my len too
- }
- void zero() {
- //bzero((void*)c_str(), _len);
- memset((void*)c_str(), 0, _len);
- }
-
-
- // crope lookalikes
- void append(const char *p, unsigned len) {
- assert(len + _len + _off <= _buffer->_alloc_len); // FIXME later for auto-expansion?
-
- // copy
- memcpy(c_str() + _len, p, len);
- _buffer->_len += len;
- _len += len;
- }
- void copy_out(unsigned off, unsigned len, char *dest) {
- assert(off >= 0 && off <= _len);
- assert(len >= 0 && off + len <= _len);
- memcpy(dest, c_str() + off, len);
- }
- void copy_in(unsigned off, unsigned len, const char *src) {
- assert(off >= 0 && off <= _len);
- assert(len >= 0 && off + len <= _len);
- memcpy(c_str() + off, src, len);
- }
-
- friend ostream& operator<<(ostream& out, bufferptr& bp);
-};
-
-
-inline ostream& operator<<(ostream& out, bufferptr& bp) {
- return out << "bufferptr(len=" << bp._len << " off=" << bp._off
- << " cstr=" << (void*)bp.c_str()
- << " buf=" << *bp._buffer
- << ")";
-}
-
-
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __BUFFERLIST_H
-#define __BUFFERLIST_H
-
-#include "buffer.h"
-
-#include <list>
-#include <map>
-#include <set>
-#include <vector>
-using namespace std;
-
-#include <ext/rope>
-using namespace __gnu_cxx;
-
-
-// debug crap
-#include "config.h"
-#define bdbout(x) if (x <= g_conf.debug_buffer) cout
-
-
-
-class bufferlist {
- private:
- /* local state limited to _buffers, and _len.
- * we maintain _len ourselves, so we must be careful when fiddling with buffers!
- */
- list<bufferptr> _buffers;
- unsigned _len;
-
- public:
- // cons/des
- bufferlist() : _len(0) {
- bdbout(1) << "bufferlist.cons " << this << endl;
- }
- bufferlist(const bufferlist& bl) : _len(0) {
- //assert(0); // o(n) and stupid
- bdbout(1) << "bufferlist.cons " << this << endl;
- _buffers = bl._buffers;
- _len = bl._len;
- }
- ~bufferlist() {
- bdbout(1) << "bufferlist.des " << this << endl;
- }
-
- bufferlist& operator=(bufferlist& bl) {
- //assert(0); // actually, this should be fine, just slow (O(n)) and stupid.
- bdbout(1) << "bufferlist.= " << this << endl;
- _buffers = bl._buffers;
- _len = bl._len;
- return *this;
- }
-
-
- // accessors
- list<bufferptr>& buffers() {
- return _buffers;
- }
- //list<buffer*>::iterator begin() { return _buffers.begin(); }
- //list<buffer*>::iterator end() { return _buffers.end(); }
-
- unsigned length() const {
-#if 0
- { // DEBUG: verify _len
- int len = 0;
- for (list<bufferptr>::iterator it = _buffers.begin();
- it != _buffers.end();
- it++) {
- len += (*it).length();
- }
- assert(len == _len);
- }
-#endif
- return _len;
- }
-
- void _rope(crope& r) {
- for (list<bufferptr>::iterator it = _buffers.begin();
- it != _buffers.end();
- it++)
- r.append((*it).c_str(), (*it).length());
- }
-
- // modifiers
- void clear() {
- _buffers.clear();
- _len = 0;
- }
- void push_front(bufferptr& bp) {
- _buffers.push_front(bp);
- _len += bp.length();
- }
- void push_front(buffer *b) {
- bufferptr bp(b);
- _buffers.push_front(bp);
- _len += bp.length();
- }
- void push_back(bufferptr& bp) {
- _buffers.push_back(bp);
- _len += bp.length();
- }
- void push_back(buffer *b) {
- bufferptr bp(b);
-
- _buffers.push_back(bp);
- _len += bp.length();
-
- }
- void zero() {
- for (list<bufferptr>::iterator it = _buffers.begin();
- it != _buffers.end();
- it++)
- it->zero();
- }
-
- // sort-of-like-assignment-op
- void claim(bufferlist& bl) {
- // free my buffers
- clear();
- claim_append(bl);
- }
- void claim_append(bufferlist& bl) {
- // steal the other guy's buffers
- _len += bl._len;
- _buffers.splice( _buffers.end(), bl._buffers );
- bl._len = 0;
- }
-
-
-
-
- // crope lookalikes
- void copy(unsigned off, unsigned len, char *dest) {
- assert(off >= 0);
- assert(off + len <= length());
- /*assert(off < length());
- if (off + len > length())
- len = length() - off;
- */
- // advance to off
- list<bufferptr>::iterator curbuf = _buffers.begin();
-
- // skip off
- while (off > 0) {
- assert(curbuf != _buffers.end());
- if (off >= (*curbuf).length()) {
- // skip this buffer
- off -= (*curbuf).length();
- curbuf++;
- } else {
- // somewhere in this buffer!
- break;
- }
- }
-
- // copy
- while (len > 0) {
- // is the rest ALL in this buffer?
- if (off + len <= (*curbuf).length()) {
- (*curbuf).copy_out(off, len, dest); // yup, last bit!
- break;
- }
-
- // get as much as we can from this buffer.
- unsigned howmuch = (*curbuf).length() - off;
- (*curbuf).copy_out(off, howmuch, dest);
-
- dest += howmuch;
- len -= howmuch;
- off = 0;
- curbuf++;
- assert(curbuf != _buffers.end());
- }
- }
-
- void copy_in(unsigned off, unsigned len, const char *src) {
- assert(off >= 0);
- assert(off + len <= length());
-
- // advance to off
- list<bufferptr>::iterator curbuf = _buffers.begin();
-
- // skip off
- while (off > 0) {
- assert(curbuf != _buffers.end());
- if (off >= (*curbuf).length()) {
- // skip this buffer
- off -= (*curbuf).length();
- curbuf++;
- } else {
- // somewhere in this buffer!
- break;
- }
- }
-
- // copy
- while (len > 0) {
- // is the rest ALL in this buffer?
- if (off + len <= (*curbuf).length()) {
- (*curbuf).copy_in(off, len, src); // yup, last bit!
- break;
- }
-
- // get as much as we can from this buffer.
- unsigned howmuch = (*curbuf).length() - off;
- (*curbuf).copy_in(off, howmuch, src);
-
- src += howmuch;
- len -= howmuch;
- off = 0;
- curbuf++;
- assert(curbuf != _buffers.end());
- }
- }
- void copy_in(unsigned off, unsigned len, bufferlist& bl) {
- unsigned left = len;
- for (list<bufferptr>::iterator i = bl._buffers.begin();
- i != bl._buffers.end();
- i++) {
- unsigned l = (*i).length();
- if (left < l) l = left;
- copy_in(off, l, (*i).c_str());
- left -= l;
- if (left == 0) break;
- off += l;
- }
- }
-
-
- void append(const char *data, unsigned len) {
- if (len == 0) return;
-
- unsigned alen = 0;
-
- // copy into the tail buffer?
- if (!_buffers.empty()) {
- unsigned avail = _buffers.back().unused_tail_length();
- if (avail > 0) {
- //cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf" << endl;
- if (avail > len)
- avail = len;
- unsigned blen = _buffers.back().length();
- memcpy(_buffers.back().c_str() + blen, data, avail);
- blen += avail;
- _buffers.back().set_length(blen);
- _len += avail;
- data += avail;
- len -= avail;
- }
- alen = _buffers.back().length();
- }
- if (len == 0) return;
-
- // just add another buffer.
- // alloc a bit extra, in case we do a bunch of appends. FIXME be smarter!
- if (alen < 1024) alen = 1024;
- push_back(new buffer(data, len, BUFFER_MODE_DEFAULT, len+alen));
- }
- void append(bufferptr& bp) {
- push_back(bp);
- }
- void append(bufferptr& bp, unsigned len, unsigned off) {
- bufferptr tempbp(bp, len, off);
- push_back(tempbp);
- }
- void append(const bufferlist& bl) {
- bufferlist temp = bl; // copy list
- claim_append(temp); // and append
- }
-
-
- /*
- * return a contiguous ptr to whole bufferlist contents.
- */
- char *c_str() {
- if (_buffers.size() == 1) {
- return _buffers.front().c_str(); // good, we're already contiguous.
- }
- else if (_buffers.size() == 0) {
- return 0; // no buffers
- }
- else {
- // make one new contiguous buffer.
- bufferptr newbuf = new buffer(length());
- unsigned off = 0;
-
- for (list<bufferptr>::iterator it = _buffers.begin();
- it != _buffers.end();
- it++) {
- //assert((*(*it)).has_free_func() == false); // not allowed if there's a funky free_func.. -sage ...for debugging at least!
- memcpy(newbuf.c_str() + off,
- (*it).c_str(), (*it).length());
- off += (*it).length();
- }
- assert(off == newbuf.length());
-
- _buffers.clear();
- _buffers.push_back( newbuf );
-
- // now it'll work.
- return c_str();
- }
- }
-
-
- void substr_of(bufferlist& other, unsigned off, unsigned len) {
- assert(off + len <= other.length());
- clear();
-
- // skip off
- list<bufferptr>::iterator curbuf = other._buffers.begin();
- while (off > 0) {
- assert(curbuf != _buffers.end());
- if (off >= (*curbuf).length()) {
- // skip this buffer
- //cout << "skipping over " << *curbuf << endl;
- off -= (*curbuf).length();
- curbuf++;
- } else {
- // somewhere in this buffer!
- //cout << "somewhere in " << *curbuf << endl;
- break;
- }
- }
-
- while (len > 0) {
- // partial?
- if (off + len < (*curbuf).length()) {
- //cout << "copying partial of " << *curbuf << endl;
- _buffers.push_back( bufferptr( *curbuf, len, off ) );
- _len += len;
- break;
- }
-
- // through end
- //cout << "copying end (all?) of " << *curbuf << endl;
- unsigned howmuch = (*curbuf).length() - off;
- _buffers.push_back( bufferptr( *curbuf, howmuch, off ) );
- _len += howmuch;
- len -= howmuch;
- off = 0;
- curbuf++;
- }
- }
-
- // funky modifer
- void splice(unsigned off, unsigned len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) { // fixme?
- assert(off < length());
- assert(len > 0);
- //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << endl;
-
- // skip off
- list<bufferptr>::iterator curbuf = _buffers.begin();
- while (off > 0) {
- assert(curbuf != _buffers.end());
- if (off >= (*curbuf).length()) {
- // skip this buffer
- //cout << "off = " << off << " skipping over " << *curbuf << endl;
- off -= (*curbuf).length();
- curbuf++;
- } else {
- // somewhere in this buffer!
- //cout << "off = " << off << " somewhere in " << *curbuf << endl;
- break;
- }
- }
- assert(off >= 0);
-
- if (off) {
- // add a reference to the front bit
- // insert it before curbuf (which we'll hose)
- //cout << "keeping front " << off << " of " << *curbuf << endl;
- _buffers.insert( curbuf, bufferptr( *curbuf, off, 0 ) );
- _len += off;
- }
-
- while (len > 0) {
- // partial?
- if (off + len < (*curbuf).length()) {
- //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl;
- if (claim_by)
- claim_by->append( *curbuf, len, off );
- (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big
- (*curbuf).set_length( (*curbuf).length() - (len+off) );
- _len -= off+len;
- //cout << " now " << *curbuf << endl;
- break;
- }
-
- // hose though the end
- unsigned howmuch = (*curbuf).length() - off;
- //cout << "discarding " << howmuch << " of " << *curbuf << endl;
- if (claim_by)
- claim_by->append( *curbuf, howmuch, off );
- _len -= (*curbuf).length();
- _buffers.erase( curbuf++ );
- len -= howmuch;
- off = 0;
- }
-
- // splice in *replace (implement me later?)
- }
-
- friend ostream& operator<<(ostream& out, bufferlist& bl);
-
-};
-
-inline ostream& operator<<(ostream& out, bufferlist& bl) {
- out << "bufferlist(len=" << bl.length() << endl;
- for (list<bufferptr>::iterator it = bl._buffers.begin();
- it != bl._buffers.end();
- it++)
- out << "\t" << *it << endl;
- out << ")" << endl;
- return out;
-}
-
-
-
-// encoder/decode helpers
-
-// string
-inline void _encode(const string& s, bufferlist& bl)
-{
- bl.append(s.c_str(), s.length()+1);
-}
-inline void _decode(string& s, bufferlist& bl, int& off)
-{
- s = bl.c_str() + off;
- off += s.length() + 1;
-}
-
-// bufferptr (encapsulated)
-inline void _encode(bufferptr& bp, bufferlist& bl)
-{
- size_t len = bp.length();
- bl.append((char*)&len, sizeof(len));
- bl.append(bp);
-}
-inline void _decode(bufferptr& bp, bufferlist& bl, int& off)
-{
- size_t len;
- bl.copy(off, sizeof(len), (char*)&len);
- off += sizeof(len);
- bufferlist s;
- s.substr_of(bl, off, len);
- off += len;
-
- if (s.buffers().size() == 1)
- bp = s.buffers().front();
- else
- bp = new buffer(s.c_str(), s.length());
-}
-
-// bufferlist (encapsulated)
-inline void _encode(const bufferlist& s, bufferlist& bl)
-{
- size_t len = s.length();
- bl.append((char*)&len, sizeof(len));
- bl.append(s);
-}
-inline void _decode(bufferlist& s, bufferlist& bl, int& off)
-{
- size_t len;
- bl.copy(off, sizeof(len), (char*)&len);
- off += sizeof(len);
- s.substr_of(bl, off, len);
- off += len;
-}
-
-
-// set<T>
-template<class T>
-inline void _encode(set<T>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (typename set<T>::iterator it = s.begin();
- it != s.end();
- it++) {
- T v = *it;
- bl.append((char*)&v, sizeof(v));
- n--;
- }
- assert(n==0);
-}
-template<class T>
-inline void _decode(set<T>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- for (int i=0; i<n; i++) {
- T v;
- bl.copy(off, sizeof(v), (char*)&v);
- off += sizeof(v);
- s.insert(v);
- }
- assert(s.size() == (unsigned)n);
-}
-
-// vector<T>
-template<class T>
-inline void _encode(vector<T>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (typename vector<T>::iterator it = s.begin();
- it != s.end();
- it++) {
- T v = *it;
- bl.append((char*)&v, sizeof(v));
- n--;
- }
- assert(n==0);
-}
-template<class T>
-inline void _decode(vector<T>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- s = vector<T>(n);
- for (int i=0; i<n; i++) {
- T v;
- bl.copy(off, sizeof(v), (char*)&v);
- off += sizeof(v);
- s[i] = v;
- }
- assert(s.size() == (unsigned)n);
-}
-
-// list<T>
-template<class T>
-inline void _encode(const list<T>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (typename list<T>::const_iterator it = s.begin();
- it != s.end();
- it++) {
- T v = *it;
- bl.append((char*)&v, sizeof(v));
- n--;
- }
- assert(n==0);
-}
-template<class T>
-inline void _decode(list<T>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- for (int i=0; i<n; i++) {
- T v;
- bl.copy(off, sizeof(v), (char*)&v);
- off += sizeof(v);
- s.push_back(v);
- }
- assert(s.size() == (unsigned)n);
-}
-
-// map<string,bufferptr>
-inline void _encode(map<string, bufferptr>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (map<string, bufferptr>::iterator it = s.begin();
- it != s.end();
- it++) {
- _encode(it->first, bl);
- _encode(it->second, bl);
- n--;
- }
- assert(n==0);
-}
-inline void _decode(map<string,bufferptr>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- for (int i=0; i<n; i++) {
- string k;
- _decode(k, bl, off);
- _decode(s[k], bl, off);
- }
- assert(s.size() == (unsigned)n);
-}
-
-
-// map<T,bufferlist>
-template<class T>
-inline void _encode(const map<T, bufferlist>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (typename map<T, bufferlist>::const_iterator it = s.begin();
- it != s.end();
- it++) {
- T k = it->first;
- bl.append((char*)&k, sizeof(k));
- _encode(it->second, bl);
- n--;
- }
- assert(n==0);
-}
-template<class T>
-inline void _decode(map<T,bufferlist>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- for (int i=0; i<n; i++) {
- T k;
- bl.copy(off, sizeof(k), (char*)&k);
- off += sizeof(k);
- bufferlist b;
- _decode(b, bl, off);
- s[k] = b;
- }
- assert(s.size() == (unsigned)n);
-}
-
-// map<T,U>
-template<class T, class U>
-inline void _encode(const map<T, U>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (typename map<T, U>::const_iterator it = s.begin();
- it != s.end();
- it++) {
- T k = it->first;
- U v = it->second;
- bl.append((char*)&k, sizeof(k));
- bl.append((char*)&v, sizeof(v));
- n--;
- }
- assert(n==0);
-}
-template<class T, class U>
-inline void _decode(map<T,U>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- for (int i=0; i<n; i++) {
- T k;
- U v;
- bl.copy(off, sizeof(k), (char*)&k);
- off += sizeof(k);
- bl.copy(off, sizeof(v), (char*)&v);
- off += sizeof(v);
- s[k] = v;
- }
- assert(s.size() == (unsigned)n);
-}
-
-
-
-
-#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __TRIPLE_H
+#define __TRIPLE_H
+
+template<class A, class B, class C>
+class triple {
+ public:
+ A first;
+ B second;
+ C third;
+ triple() {}
+ triple(A f, B s, C t) : first(f), second(s), third(t) {}
+};
+
+#endif
// base (immutable)
inodeno_t ino;
FileLayout layout; // ?immutable?
- dev_t rdev; // if special file
+ uint32_t rdev; // if special file
// affected by any inode change...
utime_t ctime; // inode change time
// perm (namespace permissions)
- mode_t mode;
+ uint32_t mode;
uid_t uid;
gid_t gid;
bool anchored; // auth only?
// file (data access)
- off_t size, max_size, allocated_size;
+ int64_t size, max_size, allocated_size;
utime_t mtime; // file data modify time.
utime_t atime; // file data access time.
xlist *_head;
item(T i) : _item(i), _prev(0), _next(0), _head(0) {}
+ ~item() {
+ remove_myself();
+ }
xlist* get_xlist() { return _head; }
void remove_myself() {
- if (_head) {
+ if (_head)
_head->remove(this);
- }
+ assert(_head == 0);
}
};
public:
xlist() : _front(0), _back(0), _size(0) {}
+ ~xlist() {
+ assert(_size == 0);
+ assert(_front == 0);
+ assert(_back == 0);
+ }
int size() { return _size; }
bool empty() {
T front() { return (T)_front->_item; }
T back() { return (T)_back->_item; }
+ void pop_front() {
+ assert(!empty());
+ remove(_front);
+ }
+ void pop_back() {
+ assert(!empty());
+ remove(_back);
+ }
+
class iterator {
private:
item *cur;
#include "MDS.h"
#include "MDLog.h"
+#include "LogSegment.h"
#include "events/EAnchorClient.h"
#include "messages/MAnchor.h"
*pending_create_prepare[ino].patid = atid;
pending_create_prepare.erase(ino);
- pending_commit.insert(atid);
-
if (onfinish) {
onfinish->finish(0);
delete onfinish;
*pending_destroy_prepare[ino].patid = atid;
pending_destroy_prepare.erase(ino);
- pending_commit.insert(atid);
-
if (onfinish) {
onfinish->finish(0);
delete onfinish;
*pending_update_prepare[ino].patid = atid;
pending_update_prepare.erase(ino);
- pending_commit.insert(atid);
-
if (onfinish) {
onfinish->finish(0);
delete onfinish;
// remove from committing list
assert(pending_commit.count(atid));
- pending_commit.erase(atid);
-
+ assert(pending_commit[atid]->pending_commit_atids.count(atid));
+
// log ACK.
- mds->mdlog->submit_entry(new EAnchorClient(ANCHOR_OP_ACK, atid));
-
- // kick any waiters
- if (ack_waiters.count(atid)) {
- dout(15) << "kicking waiters on atid " << atid << dendl;
- mds->queue_waiters(ack_waiters[atid]);
- ack_waiters.erase(atid);
- }
+ mds->mdlog->submit_entry(new EAnchorClient(ANCHOR_OP_ACK, atid),
+ new C_LoggedAck(this, atid));
}
break;
}
+void AnchorClient::_logged_ack(version_t atid)
+{
+ dout(10) << "_logged_ack" << dendl;
+
+ assert(pending_commit.count(atid));
+ assert(pending_commit[atid]->pending_commit_atids.count(atid));
+
+ pending_commit[atid]->pending_commit_atids.erase(atid);
+ pending_commit.erase(atid);
+
+ // kick any waiters (LogSegment trim)
+ if (ack_waiters.count(atid)) {
+ dout(15) << "kicking ack waiters on atid " << atid << dendl;
+ mds->queue_waiters(ack_waiters[atid]);
+ ack_waiters.erase(atid);
+ }
+}
+
/*
* public async interface
// COMMIT
-void AnchorClient::commit(version_t atid)
+void AnchorClient::commit(version_t atid, LogSegment *ls)
{
dout(10) << "commit " << atid << dendl;
- assert(pending_commit.count(atid));
- pending_commit.insert(atid);
+ assert(pending_commit.count(atid) == 0);
+ pending_commit[atid] = ls;
+ ls->pending_commit_atids.insert(atid);
// send message
MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid);
void AnchorClient::resend_commits()
{
- for (set<version_t>::iterator p = pending_commit.begin();
+ for (map<version_t,LogSegment*>::iterator p = pending_commit.begin();
p != pending_commit.end();
++p) {
- dout(10) << "resending commit on " << *p << dendl;
- MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, *p);
+ dout(10) << "resending commit on " << p->first << dendl;
+ MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, p->first);
mds->send_message_mds(req,
mds->mdsmap->get_anchortable(),
MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT);
class Context;
class MDS;
+class LogSegment;
class AnchorClient : public Dispatcher {
MDS *mds;
hash_map<inodeno_t, _pending_prepare> pending_update_prepare;
// pending commits
- set<version_t> pending_commit;
+ map<version_t, LogSegment*> pending_commit;
map<version_t, list<Context*> > ack_waiters;
void handle_anchor_reply(class MAnchor *m);
+ class C_LoggedAck : public Context {
+ AnchorClient *ac;
+ version_t atid;
+ public:
+ C_LoggedAck(AnchorClient *a, version_t t) : ac(a), atid(t) {}
+ void finish(int r) {
+ ac->_logged_ack(atid);
+ }
+ };
+ void _logged_ack(version_t atid);
+
public:
AnchorClient(MDS *m) : mds(m) {}
void prepare_destroy(inodeno_t ino, version_t *atid, Context *onfinish);
void prepare_update(inodeno_t ino, vector<Anchor>& trace, version_t *atid, Context *onfinish);
- void commit(version_t atid);
+ void commit(version_t atid, LogSegment *ls);
// for recovery (by other nodes)
void handle_mds_recovery(int mds); // called when someone else recovers
void resend_prepares(hash_map<inodeno_t, _pending_prepare>& prepares, int op);
// for recovery (by me)
- void got_journaled_agree(version_t atid) {
- pending_commit.insert(atid);
+ void got_journaled_agree(version_t atid, LogSegment *ls) {
+ pending_commit[atid] = ls;
}
void got_journaled_ack(version_t atid) {
pending_commit.erase(atid);
#include "MDS.h"
#include "MDCache.h"
+#include "LogSegment.h"
#include "messages/MLock.h"
void CDentry::add_waiter(int tag, Context *c)
{
// wait on the directory?
- if (tag & (WAIT_AUTHPINNABLE|WAIT_SINGLEAUTH)) {
+ if (tag & (WAIT_UNFREEZE|WAIT_SINGLEAUTH)) {
dir->add_waiter(tag, c);
return;
}
}
-void CDentry::_mark_dirty()
+void CDentry::_mark_dirty(LogSegment *ls)
{
// state+pin
if (!state_test(STATE_DIRTY)) {
state_set(STATE_DIRTY);
dir->inc_num_dirty();
get(PIN_DIRTY);
+ assert(ls);
}
+ if (ls)
+ ls->dirty_dentries.push_back(&xlist_dirty);
}
-void CDentry::mark_dirty(version_t pv)
+void CDentry::mark_dirty(version_t pv, LogSegment *ls)
{
dout(10) << " mark_dirty " << *this << dendl;
// i now live in this new dir version
assert(pv <= projected_version);
version = pv;
- _mark_dirty();
+ _mark_dirty(ls);
// mark dir too
- dir->mark_dirty(pv);
+ dir->mark_dirty(pv, ls);
}
dir->dec_num_dirty();
put(PIN_DIRTY);
+ xlist_dirty.remove_myself();
+
if (state_test(STATE_NEW))
state_clear(STATE_NEW);
}
if (auth_pins == 0)
get(PIN_AUTHPIN);
auth_pins++;
+
+ dout(10) << "auth_pin on " << *this
+ << " now " << auth_pins << "+" << nested_auth_pins
+ << dendl;
+
dir->adjust_nested_auth_pins(1);
}
auth_pins--;
if (auth_pins == 0)
put(PIN_AUTHPIN);
+
+ dout(10) << "auth_unpin on " << *this
+ << " now " << auth_pins << "+" << nested_auth_pins
+ << dendl;
+ assert(auth_pins >= 0);
+
dir->adjust_nested_auth_pins(-1);
}
void CDentry::adjust_nested_auth_pins(int by)
{
nested_auth_pins += by;
+
+ dout(15) << "adjust_nested_auth_pins by " << by
+ << " now " << auth_pins << "+" << nested_auth_pins
+ << dendl;
+ assert(nested_auth_pins >= 0);
+
dir->adjust_nested_auth_pins(by);
}
#include "include/types.h"
#include "include/buffer.h"
#include "include/lru.h"
+#include "include/xlist.h"
#include "mdstypes.h"
#include "SimpleLock.h"
class Anchor;
class CDentry;
+class LogSegment;
+
// define an ordering
bool operator<(const CDentry& l, const CDentry& r);
version_t version; // dir version when last touched.
version_t projected_version; // what it will be when i unlock/commit.
+ xlist<CDentry*>::item xlist_dirty;
+
off_t dir_offset;
int auth_pins, nested_auth_pins;
remote_ino(0), remote_d_type(0),
inode(0), dir(0),
version(0), projected_version(0),
+ xlist_dirty(this),
dir_offset(0),
auth_pins(0), nested_auth_pins(0),
lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { }
remote_ino(0), remote_d_type(0),
inode(in), dir(0),
version(0), projected_version(0),
+ xlist_dirty(this),
dir_offset(0),
auth_pins(0), nested_auth_pins(0),
lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { }
remote_ino(ino), remote_d_type(dt),
inode(in), dir(0),
version(0), projected_version(0),
+ xlist_dirty(this),
dir_offset(0),
auth_pins(0), nested_auth_pins(0),
lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { }
pair<int,int> authority();
version_t pre_dirty(version_t min=0);
- void _mark_dirty();
- void mark_dirty(version_t projected_dirv);
+ void _mark_dirty(LogSegment *ls);
+ void mark_dirty(version_t projected_dirv, LogSegment *ls);
void mark_clean();
void mark_new();
// -- exporting
// note: this assumes the dentry already exists.
// i.e., the name is already extracted... so we just need the other state.
- void encode_export_state(bufferlist& bl) {
- bl.append((char*)&state, sizeof(state));
- bl.append((char*)&version, sizeof(version));
- bl.append((char*)&projected_version, sizeof(projected_version));
+ void encode_export(bufferlist& bl) {
+ ::_encode_simple(state, bl);
+ ::_encode_simple(version, bl);
+ ::_encode_simple(projected_version, bl);
lock._encode(bl);
- ::_encode(replica_map, bl);
-
+ ::_encode_simple(replica_map, bl);
+ get(PIN_TEMPEXPORTING);
+ }
+ void finish_export() {
// twiddle
clear_replica_map();
replica_nonce = EXPORT_NONCE;
state_clear(CDentry::STATE_AUTH);
if (is_dirty())
mark_clean();
+ put(PIN_TEMPEXPORTING);
+ }
+ void abort_export() {
+ put(PIN_TEMPEXPORTING);
}
- void decode_import_state(bufferlist& bl, int& off, int from, int to) {
+ void decode_import(bufferlist::iterator& blp, LogSegment *ls) {
int nstate;
- bl.copy(off, sizeof(nstate), (char*)&nstate);
- off += sizeof(nstate);
- bl.copy(off, sizeof(version), (char*)&version);
- off += sizeof(version);
- bl.copy(off, sizeof(projected_version), (char*)&projected_version);
- off += sizeof(projected_version);
- lock._decode(bl, off);
- ::_decode(replica_map, bl, off);
+ ::_decode_simple(nstate, blp);
+ ::_decode_simple(version, blp);
+ ::_decode_simple(projected_version, blp);
+ lock._decode(blp);
+ ::_decode_simple(replica_map, blp);
// twiddle
state = 0;
state_set(CDentry::STATE_AUTH);
if (nstate & STATE_DIRTY)
- _mark_dirty();
+ _mark_dirty(ls);
if (!replica_map.empty())
get(PIN_REPLICATED);
- add_replica(from, EXPORT_NONCE);
- if (is_replica(to))
- remove_replica(to);
}
// -- locking --
#include "MDS.h"
#include "MDCache.h"
#include "MDSMap.h"
+#include "LogSegment.h"
#include "include/Context.h"
#include "common/Clock.h"
// -------------------------------------------------------------------
// CDir
-CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth)
+CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) :
+ xlist_dirty(this)
{
inode = in;
frag = fg;
if (is_auth())
clear_replica_map();
if (is_dirty()) mark_clean();
- if (state_test(STATE_EXPORT)) put(PIN_EXPORT);
if (state_test(STATE_IMPORTBOUND)) put(PIN_IMPORTBOUND);
if (state_test(STATE_EXPORTBOUND)) put(PIN_EXPORTBOUND);
{
if (!replica_map.empty()) get(PIN_REPLICATED);
if (state_test(STATE_DIRTY)) get(PIN_DIRTY);
- if (state_test(STATE_EXPORT)) get(PIN_EXPORT);
if (state_test(STATE_EXPORTBOUND)) get(PIN_EXPORTBOUND);
if (state_test(STATE_IMPORTBOUND)) get(PIN_IMPORTBOUND);
}
put(PIN_DNWAITER);
}
+// Queue a Context to be finished when waiters for this ino are taken
+// (see take_ino_waiting).  The first queued waiter pins the dir with
+// PIN_INOWAITER; the pin is dropped when the map drains.
+void CDir::add_ino_waiter(inodeno_t ino, Context *c)
+{
+  if (waiting_on_ino.empty())
+    get(PIN_INOWAITER);
+  waiting_on_ino[ino].push_back(c);
+  dout(10) << "add_ino_waiter ino " << ino << " " << c << " on " << *this << dendl;
+}
+
+// Splice any waiters registered for 'ino' onto 'ls' (caller finishes
+// or queues them).  No-op if none are registered; drops PIN_INOWAITER
+// once the last ino waiter list is removed.
+void CDir::take_ino_waiting(inodeno_t ino, list<Context*>& ls)
+{
+  if (waiting_on_ino.empty()) return;
+  if (waiting_on_ino.count(ino) == 0) return;
+  dout(10) << "take_ino_waiting ino " << ino
+	   << " x " << waiting_on_ino[ino].size()
+	   << " on " << *this << dendl;
+  ls.splice(ls.end(), waiting_on_ino[ino]);
+  waiting_on_ino.erase(ino);
+  if (waiting_on_ino.empty())
+    put(PIN_INOWAITER);
+}
+
+// Move ALL dentry- and ino-scoped waiters into 'ls' and clear both
+// maps.
+// NOTE(review): this clears waiting_on_ino without put(PIN_INOWAITER)
+// (cf. take_ino_waiting, which drops the pin) — verify callers
+// compensate, otherwise the dir pin leaks.
+void CDir::take_sub_waiting(list<Context*>& ls)
+{
+  dout(10) << "take_sub_waiting" << dendl;
+  for (hash_map<string, list<Context*> >::iterator p = waiting_on_dentry.begin(); 
+       p != waiting_on_dentry.end();
+       ++p) 
+    ls.splice(ls.end(), p->second);
+  waiting_on_dentry.clear();
+  for (hash_map<inodeno_t, list<Context*> >::iterator p = waiting_on_ino.begin(); 
+       p != waiting_on_ino.end();
+       ++p) 
+    ls.splice(ls.end(), p->second);
+  waiting_on_ino.clear();
+}
+
+
void CDir::add_waiter(int tag, Context *c)
{
list<Context*> finished;
take_waiting(mask, finished);
- //finish_contexts(finished, result);
- cache->mds->queue_waiters(finished);
+ if (result < 0)
+ finish_contexts(finished, result);
+ else
+ cache->mds->queue_waiters(finished);
}
return projected_version;
}
-void CDir::_mark_dirty()
+void CDir::_mark_dirty(LogSegment *ls)
{
if (!state_test(STATE_DIRTY)) {
state_set(STATE_DIRTY);
dout(10) << "mark_dirty (was clean) " << *this << " version " << version << dendl;
get(PIN_DIRTY);
+ assert(ls);
} else {
dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << dendl;
}
+ if (ls)
+ ls->dirty_dirfrags.push_back(&xlist_dirty);
}
-void CDir::mark_dirty(version_t pv)
+void CDir::mark_dirty(version_t pv, LogSegment *ls)
{
assert(version < pv);
version = pv;
- _mark_dirty();
+ _mark_dirty(ls);
}
void CDir::mark_clean()
if (state_test(STATE_DIRTY)) {
state_clear(STATE_DIRTY);
put(PIN_DIRTY);
+
+ xlist_dirty.remove_myself();
}
}
if (!can_auth_pin() && !ignore_authpinnability) {
dout(7) << "fetch waiting for authpinnable" << dendl;
- add_waiter(WAIT_AUTHPINNABLE, c);
+ add_waiter(WAIT_UNFREEZE, c);
return;
}
+// IMPORT/EXPORT
+
+// Serialize this dirfrag's exportable state (versions, state bits,
+// replication policy, popularity vectors, replica map) for migration
+// to another MDS.  Takes PIN_TEMPEXPORTING, released by
+// finish_export() or abort_export().
+void CDir::encode_export(bufferlist& bl)
+{
+  ::_encode_simple(version, bl);
+  ::_encode_simple(committed_version, bl);
+  ::_encode_simple(committed_version_equivalent, bl);
+
+  ::_encode_simple(state, bl);
+  ::_encode_simple(dir_rep, bl);
+
+  ::_encode_simple(pop_me, bl);
+  ::_encode_simple(pop_auth_subtree, bl);
+
+  ::_encode_simple(dir_rep_by, bl);  
+  ::_encode_simple(replica_map, bl);
+
+  get(PIN_TEMPEXPORTING);
+}
+
+// Export succeeded: subtract the exported subtree's load from the
+// nested total, zero local popularity (we are no longer auth), and
+// drop the temporary export pin taken in encode_export().
+void CDir::finish_export(utime_t now)
+{
+  pop_auth_subtree_nested -= pop_auth_subtree;
+  pop_me.zero(now);
+  pop_auth_subtree.zero(now);
+  put(PIN_TEMPEXPORTING);
+}
+
+// Counterpart of encode_export(): rebuild this dirfrag's state from an
+// import.  committing/projected versions are pinned to the imported
+// committed/current values; local state keeps only the
+// MASK_STATE_IMPORT_KEPT bits and gains the exporter's
+// MASK_STATE_EXPORTED bits.  Re-takes DIRTY/REPLICATED pins as needed
+// and folds the imported subtree load into the nested total.
+void CDir::decode_import(bufferlist::iterator& blp)
+{
+  ::_decode_simple(version, blp);
+  ::_decode_simple(committed_version, blp);
+  ::_decode_simple(committed_version_equivalent, blp);
+  committing_version = committed_version;
+  projected_version = version;
+
+  unsigned s;
+  ::_decode_simple(s, blp);
+  state &= MASK_STATE_IMPORT_KEPT;
+  state |= (s & MASK_STATE_EXPORTED);
+  if (is_dirty()) get(PIN_DIRTY);
+
+  ::_decode_simple(dir_rep, blp);
+
+  ::_decode_simple(pop_me, blp);
+  ::_decode_simple(pop_auth_subtree, blp);
+  pop_auth_subtree_nested += pop_auth_subtree;
+
+  ::_decode_simple(dir_rep_by, blp);  
+  ::_decode_simple(replica_map, blp);
+  if (!replica_map.empty()) get(PIN_REPLICATED);
+
+  replica_nonce = 0;  // no longer defined
+}
+
+
+
/********************************
* AUTHORITY
get(PIN_AUTHPIN);
auth_pins++;
- dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl;
+ dout(10) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl;
// nest pins?
if (is_subtree_root()) return; // no.
if (auth_pins == 0)
put(PIN_AUTHPIN);
- dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl;
+ dout(10) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl;
assert(auth_pins >= 0);
- // pending freeze?
- if (state_test(STATE_FREEZINGTREE|STATE_FREEZINGDIR) &&
- auth_pins == 1 &&
- nested_auth_pins == 0)
- finish_waiting(WAIT_FREEZEABLE);
+ maybe_finish_freeze(); // pending freeze?
// nest?
if (is_subtree_root()) return; // no.
{
nested_auth_pins += inc;
- dout(10) << "adjust_nested_auth_pins " << inc << " on " << *this
+ dout(15) << "adjust_nested_auth_pins " << inc << " on " << *this
<< " count now " << auth_pins << " + " << nested_auth_pins << dendl;
assert(nested_auth_pins >= 0);
-
- // pending freeze?
- if (state_test(STATE_FREEZINGTREE|STATE_FREEZINGDIR) &&
- auth_pins == 1 &&
- nested_auth_pins == 0)
- finish_waiting(WAIT_FREEZEABLE);
+
+ maybe_finish_freeze(); // pending freeze?
// adjust my inode?
if (is_subtree_root())
// FREEZE TREE
-class C_MDS_FreezeTree : public Context {
- CDir *dir;
- Context *con;
-public:
- C_MDS_FreezeTree(CDir *dir, Context *c) {
- this->dir = dir;
- this->con = c;
- }
- virtual void finish(int r) {
- dir->freeze_tree_finish(con);
- }
-};
-
-void CDir::freeze_tree(Context *c)
+bool CDir::freeze_tree()
{
assert(!is_frozen());
assert(!is_freezing());
auth_pin();
-
- if (is_freezeable()) {
+ if (is_freezeable(true)) {
_freeze_tree();
auth_unpin();
- if (c) {
- c->finish(0);
- delete c;
- }
+ return true;
} else {
state_set(STATE_FREEZINGTREE);
dout(10) << "freeze_tree waiting " << *this << dendl;
- add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c));
- }
+ return false;
+ }
}
void CDir::_freeze_tree()
{
dout(10) << "_freeze_tree " << *this << dendl;
-
- // there shouldn't be any conflicting auth_pins (except the 'freezing' one)
assert(is_freezeable(true));
// twiddle state
inode->auth_pin();
}
-void CDir::freeze_tree_finish(Context *c)
-{
- // still freezing? (we may have been canceled)
- if (!is_freezing()) {
- dout(10) << "freeze_tree_finish no longer freezing, done on " << *this << dendl;
- c->finish(-1);
- delete c;
- return;
- }
-
- // freezeable now?
- if (!is_freezeable(true)) {
- dout(10) << "freeze_tree_finish still waiting " << *this << dendl;
- assert(state_test(STATE_FREEZINGTREE));
- add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c));
- return;
- }
-
- dout(10) << "freeze_tree_finish " << *this << dendl;
- _freeze_tree();
- auth_unpin();
- if (c) {
- c->finish(0);
- delete c;
- }
-}
-
void CDir::unfreeze_tree()
{
dout(10) << "unfreeze_tree " << *this << dendl;
// waiters?
finish_waiting(WAIT_UNFREEZE);
} else {
+ finish_waiting(WAIT_FROZEN, -1);
+
// freezing. stop it.
assert(state_test(STATE_FREEZINGTREE));
state_clear(STATE_FREEZINGTREE);
auth_unpin();
- // cancel freeze waiters
finish_waiting(WAIT_UNFREEZE);
- finish_waiting(WAIT_FREEZEABLE, -1);
}
}
// FREEZE DIR
-class C_MDS_FreezeDir : public Context {
- CDir *dir;
- Context *con;
-public:
- C_MDS_FreezeDir(CDir *dir, Context *c) {
- this->dir = dir;
- this->con = c;
- }
- virtual void finish(int r) {
- dir->freeze_dir_finish(con);
- }
-};
-
-void CDir::freeze_dir(Context *c)
+bool CDir::freeze_dir()
{
assert(!is_frozen());
assert(!is_freezing());
auth_pin();
- if (is_freezeable_dir()) {
+ if (is_freezeable_dir(true)) {
_freeze_dir();
auth_unpin();
- if (c) {
- c->finish(0);
- delete c;
- }
+ return true;
} else {
state_set(STATE_FREEZINGDIR);
dout(10) << "freeze_dir + wait " << *this << dendl;
- add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c));
+ return false;
}
}
void CDir::_freeze_dir()
-{
+{
dout(10) << "_freeze_dir " << *this << dendl;
-
assert(is_freezeable_dir(true));
state_clear(STATE_FREEZINGDIR);
inode->auth_pin(); // auth_pin for duration of freeze
}
-void CDir::freeze_dir_finish(Context *c)
-{
- // still freezing? (we may have been canceled)
- if (!is_freezing()) {
- dout(10) << "freeze_dir_finish no longer freezing, done on " << *this << dendl;
- c->finish(-1);
- delete c;
- return;
- }
-
- // freezeable now?
- if (!is_freezeable_dir(true)) {
- dout(10) << "freeze_dir_finish still waiting " << *this << dendl;
- state_set(STATE_FREEZINGDIR);
- add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c));
- return;
- }
-
- // freeze now
- dout(10) << "freeze_dir_finish " << *this << dendl;
- _freeze_dir();
- auth_unpin();
- if (c) {
- c->finish(0);
- delete c;
- }
-}
void CDir::unfreeze_dir()
{
if (is_auth() && !is_subtree_root())
inode->auth_unpin();
- // waiters?
finish_waiting(WAIT_UNFREEZE);
} else {
+ finish_waiting(WAIT_FROZEN, -1);
+
// still freezing. stop.
assert(state_test(STATE_FREEZINGDIR));
state_clear(STATE_FREEZINGDIR);
auth_unpin();
- // cancel freeze waiters
finish_waiting(WAIT_UNFREEZE);
- finish_waiting(WAIT_FREEZEABLE, -1);
}
}
class CDirDiscover;
-
ostream& operator<<(ostream& out, class CDir& dir);
-// CDir
-
-
-
class CDir : public MDSCacheObject {
public:
// -- pins --
static const int PIN_DNWAITER = 1;
- static const int PIN_CHILD = 2;
- static const int PIN_FROZEN = 3;
- static const int PIN_EXPORT = 5;
+ static const int PIN_INOWAITER = 2;
+ static const int PIN_CHILD = 3;
+ static const int PIN_FROZEN = 4;
+ static const int PIN_SUBTREE = 5;
static const int PIN_IMPORTING = 7;
- static const int PIN_EXPORTING = 8;
static const int PIN_IMPORTBOUND = 9;
static const int PIN_EXPORTBOUND = 10;
static const int PIN_STICKY = 11;
+ static const int PIN_SUBTREETEMP = 12; // used by MDCache::trim_non_auth()
const char *pin_name(int p) {
switch (p) {
case PIN_DNWAITER: return "dnwaiter";
+ case PIN_INOWAITER: return "inowaiter";
case PIN_CHILD: return "child";
case PIN_FROZEN: return "frozen";
- case PIN_EXPORT: return "export";
- case PIN_EXPORTING: return "exporting";
+ case PIN_SUBTREE: return "subtree";
case PIN_IMPORTING: return "importing";
case PIN_IMPORTBOUND: return "importbound";
case PIN_EXPORTBOUND: return "exportbound";
case PIN_STICKY: return "sticky";
+ case PIN_SUBTREETEMP: return "subtreetemp";
default: return generic_pin_name(p);
}
}
static const unsigned STATE_FREEZINGDIR = (1<< 5);
static const unsigned STATE_COMMITTING = (1<< 6); // mid-commit
static const unsigned STATE_FETCHING = (1<< 7); // currenting fetching
- static const unsigned STATE_DELETED = (1<< 8);
- static const unsigned STATE_EXPORT = (1<< 9);
static const unsigned STATE_IMPORTBOUND = (1<<10);
static const unsigned STATE_EXPORTBOUND = (1<<11);
static const unsigned STATE_EXPORTING = (1<<12);
static const unsigned MASK_STATE_EXPORTED =
(STATE_COMPLETE|STATE_DIRTY);
static const unsigned MASK_STATE_IMPORT_KEPT =
- (STATE_EXPORT
- |STATE_IMPORTING
+ (
+ STATE_IMPORTING
|STATE_IMPORTBOUND|STATE_EXPORTBOUND
|STATE_FROZENTREE
|STATE_STICKY);
|STATE_IMPORTBOUND|STATE_EXPORTBOUND
|STATE_FROZENTREE
|STATE_FROZENDIR
- |STATE_EXPORT
|STATE_STICKY);
static const unsigned MASK_STATE_FRAGMENT_KEPT =
(STATE_DIRTY |
STATE_COMPLETE |
- STATE_EXPORT |
STATE_EXPORTBOUND |
STATE_IMPORTBOUND);
// -- wait masks --
static const int WAIT_DENTRY = (1<<0); // wait for item to be in cache
static const int WAIT_COMPLETE = (1<<1); // wait for complete dir contents
- static const int WAIT_FREEZEABLE = (1<<2); // auth pins removed
+ static const int WAIT_FROZEN = (1<<2); // auth pins removed
static const int WAIT_DNLOCK_OFFSET = 4;
static const int WAIT_ANY = (0xffffffff);
- static const int WAIT_ATFREEZEROOT = (WAIT_AUTHPINNABLE|WAIT_UNFREEZE);
+ static const int WAIT_ATFREEZEROOT = (WAIT_UNFREEZE);
static const int WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH);
//int hack_num_accessed;
public:
- typedef hash_map<string, CDentry*> map_t; // there is a bug somewhere, valgrind me.
- //typedef map<string, CDentry*> map_t;
+ //typedef hash_map<string, CDentry*> map_t; // there is a bug somewhere, valgrind me.
+ typedef map<string, CDentry*> map_t;
protected:
// contents
map_t items; // non-null AND null
version_t committed_version_equivalent; // in case of, e.g., temporary file
version_t projected_version;
+ xlist<CDir*>::item xlist_dirty;
+
// lock nesting, freeze
int auth_pins;
int nested_auth_pins;
// for giving to clients
void get_dist_spec(set<int>& ls, int auth) {
- //if (( pop_auth_subtree.get(META_POP_IRD).get() >
- //g_conf.mds_bal_replicate_threshold)) {
- //if (!cached_by.empty() && inode.ino > 1) generic_dout(1) << "distributed spec for " << *this << endl;
if (is_rep()) {
for (map<int,int>::iterator p = replicas_begin();
p != replicas_end();
void set_committed_version(version_t v) { committed_version = v; }
version_t pre_dirty(version_t min=0);
- void _mark_dirty();
- void mark_dirty(version_t pv);
+ void _mark_dirty(LogSegment *ls);
+ void mark_dirty(version_t pv, LogSegment *ls);
void mark_clean();
void mark_complete() { state_set(STATE_COMPLETE); }
// -- waiters --
protected:
hash_map< string, list<Context*> > waiting_on_dentry;
+ hash_map< inodeno_t, list<Context*> > waiting_on_ino;
public:
bool is_waiting_for_dentry(const string& dn) {
void add_dentry_waiter(const string& dentry, Context *c);
void take_dentry_waiting(const string& dentry, list<Context*>& ls);
+ bool is_waiting_for_ino(inodeno_t ino) {
+ return waiting_on_ino.count(ino);
+ }
+ void add_ino_waiter(inodeno_t ino, Context *c);
+ void take_ino_waiting(inodeno_t ino, list<Context*>& ls);
+
+ void take_sub_waiting(list<Context*>& ls); // dentry or ino
+
void add_waiter(int mask, Context *c);
void take_waiting(int mask, list<Context*>& ls); // may include dentry waiters
void finish_waiting(int mask, int result = 0); // ditto
+ // -- import/export --
+ void encode_export(bufferlist& bl);
+ void finish_export(utime_t now);
+ void abort_export() {
+ put(PIN_TEMPEXPORTING);
+ }
+ void decode_import(bufferlist::iterator& blp);
+
+
// -- auth pins --
bool can_auth_pin() { return is_auth() && !(is_frozen() || is_freezing()); }
int is_auth_pinned() { return auth_pins; }
void adjust_nested_auth_pins(int inc);
// -- freezing --
- void freeze_tree(Context *c);
- void freeze_tree_finish(Context *c);
- void unfreeze_tree();
+ bool freeze_tree();
void _freeze_tree();
+ void unfreeze_tree();
- void freeze_dir(Context *c);
- void freeze_dir_finish(Context *c);
+ bool freeze_dir();
void _freeze_dir();
void unfreeze_dir();
+  // Complete a pending tree/dir freeze once only the freeze's own
+  // auth_pin remains (exactly 1 pin, 0 nested); called from
+  // auth_unpin/adjust_nested_auth_pins.  Wakes WAIT_FROZEN waiters.
+  void maybe_finish_freeze() {
+    if (auth_pins != 1 || nested_auth_pins != 0)
+      return;
+    if (state_test(STATE_FREEZINGTREE)) {
+      _freeze_tree();
+      auth_unpin();
+      finish_waiting(WAIT_FROZEN);
+    }
+    if (state_test(STATE_FREEZINGDIR)) {
+      _freeze_dir();
+      auth_unpin();
+      finish_waiting(WAIT_FROZEN);
+    }
+  }
+
bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); }
bool is_freezing_tree();
bool is_freezing_tree_root() { return state & STATE_FREEZINGTREE; }
dirfrag_t get_dirfrag() { return dirfrag; }
-
void _encode(bufferlist& bl) {
bl.append((char*)&dirfrag, sizeof(dirfrag));
bl.append((char*)&nonce, sizeof(nonce));
};
-// export
-
-class CDirExport {
- struct {
- dirfrag_t dirfrag;
- uint32_t nden; // num dentries (including null ones)
- version_t version;
- version_t committed_version;
- version_t committed_version_equivalent;
- uint32_t state;
- dirfrag_load_vec_t pop_me;
- dirfrag_load_vec_t pop_auth_subtree;
- int32_t dir_rep;
- } st;
- map<int,int> replicas;
- set<int> rep_by;
-
- public:
- CDirExport() {}
- CDirExport(CDir *dir, utime_t now) {
- memset(&st, 0, sizeof(st));
-
- assert(dir->get_version() == dir->get_projected_version());
-
- st.dirfrag = dir->dirfrag();
- st.nden = dir->items.size();
- st.version = dir->version;
- st.committed_version = dir->committed_version;
- st.committed_version_equivalent = dir->committed_version_equivalent;
- st.state = dir->state;
- st.dir_rep = dir->dir_rep;
-
- st.pop_me = dir->pop_me;
- st.pop_auth_subtree = dir->pop_auth_subtree;
- dir->pop_auth_subtree_nested -= dir->pop_auth_subtree;
- dir->pop_me.zero(now);
- dir->pop_auth_subtree.zero(now);
-
- rep_by = dir->dir_rep_by;
- replicas = dir->replica_map;
- }
-
- dirfrag_t get_dirfrag() { return st.dirfrag; }
- uint32_t get_nden() { return st.nden; }
-
- void update_dir(CDir *dir) {
- assert(dir->dirfrag() == st.dirfrag);
-
- // set committed_version at old version
- dir->committing_version =
- dir->committed_version = st.committed_version;
- dir->committed_version_equivalent = st.committed_version_equivalent;
- dir->projected_version =
- dir->version = st.version;
-
- // twiddle state
- dir->state = (dir->state & CDir::MASK_STATE_IMPORT_KEPT) | // remember import flag, etc.
- (st.state & CDir::MASK_STATE_EXPORTED);
- dir->dir_rep = st.dir_rep;
-
- dir->pop_me = st.pop_me;
- dir->pop_auth_subtree = st.pop_auth_subtree;
- dir->pop_auth_subtree_nested += dir->pop_auth_subtree;
-
- dir->replica_nonce = 0; // no longer defined
-
- if (!dir->replica_map.empty())
- generic_dout(0) << "replicas not empty non import, " << *dir << ", " << dir->replica_map << dendl;
-
- dir->dir_rep_by = rep_by;
- dir->replica_map = replicas;
- generic_dout(12) << "replicas in export is " << replicas << ", dir now " << dir->replica_map << dendl;
- if (!replicas.empty())
- dir->get(CDir::PIN_REPLICATED);
- if (dir->is_dirty()) {
- dir->get(CDir::PIN_DIRTY);
- }
- }
-
-
- void _encode(bufferlist& bl) {
- bl.append((char*)&st, sizeof(st));
- ::_encode(replicas, bl);
- ::_encode(rep_by, bl);
- }
-
- int _decode(bufferlist& bl, int off = 0) {
- bl.copy(off, sizeof(st), (char*)&st);
- off += sizeof(st);
- ::_decode(replicas, bl, off);
- ::_decode(rep_by, bl, off);
- return off;
- }
-
-};
-
-
#endif
#include "MDCache.h"
#include "AnchorTable.h"
+#include "LogSegment.h"
+
#include "common/Clock.h"
#include "messages/MLock.h"
out << " v" << in.get_version();
+ if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
+ if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
+ if (in.is_frozen_inode()) out << " FROZEN";
+
// locks
out << " " << in.authlock;
out << " " << in.linklock;
return projected_inode.back();
}
-void CInode::pop_and_dirty_projected_inode()
+void CInode::pop_and_dirty_projected_inode(LogSegment *ls)
{
assert(!projected_inode.empty());
dout(15) << "pop_and_dirty_projected_inode " << projected_inode.front()
<< " v" << projected_inode.front()->version << dendl;
- mark_dirty(projected_inode.front()->version);
+ mark_dirty(projected_inode.front()->version, ls);
inode = *projected_inode.front();
delete projected_inode.front();
projected_inode.pop_front();
return parent->pre_dirty();
}
-void CInode::_mark_dirty()
+void CInode::_mark_dirty(LogSegment *ls)
{
if (!state_test(STATE_DIRTY)) {
state_set(STATE_DIRTY);
get(PIN_DIRTY);
+ assert(ls);
}
+
+ // move myself to this segment's dirty list
+ if (ls)
+ ls->dirty_inodes.push_back(&xlist_dirty);
}
-void CInode::mark_dirty(version_t pv) {
+void CInode::mark_dirty(version_t pv, LogSegment *ls) {
dout(10) << "mark_dirty " << *this << dendl;
// touch my private version
assert(inode.version < pv);
inode.version = pv;
- _mark_dirty();
+ _mark_dirty(ls);
// mark dentry too
- parent->mark_dirty(pv);
+ parent->mark_dirty(pv, ls);
}
if (state_test(STATE_DIRTY)) {
state_clear(STATE_DIRTY);
put(PIN_DIRTY);
+
+ // remove myself from ls dirty list
+ xlist_dirty.remove_myself();
}
}
_decode(tm, bl, off);
if (inode.mtime < tm) {
inode.mtime = tm;
- dirlock.set_updated();
+ if (is_auth()) {
+ dout(10) << "decode_lock_state auth got mtime " << tm << " > my " << inode.mtime
+ << ", setting dirlock updated flag on " << *this
+ << dendl;
+ dirlock.set_updated();
+ }
}
if (0) {
map<frag_t,int> dfsz;
}
}
+// Remove this inode from its LogSegment's dirty-scattered list for the
+// given scatterlock type.  Only LOCK_OTYPE_IDIR (dirty mtime) is
+// handled; any other type is a programming error.
+void CInode::clear_dirty_scattered(int type)
+{
+  dout(10) << "clear_dirty_scattered " << type << " on " << *this << dendl;
+  switch (type) {
+  case LOCK_OTYPE_IDIR:
+    xlist_dirty_inode_mtime.remove_myself();
+    break;
+  default:
+    assert(0);
+  }
+}
bool CInode::is_frozen()
{
- if (parent && parent->dir->is_frozen())
- return true;
+ if (is_frozen_inode()) return true;
+ if (parent && parent->dir->is_frozen()) return true;
return false;
}
bool CInode::is_frozen_dir()
{
- if (parent && parent->dir->is_frozen_dir())
- return true;
+ if (parent && parent->dir->is_frozen_dir()) return true;
return false;
}
bool CInode::is_freezing()
{
- if (parent && parent->dir->is_freezing())
- return true;
+ if (is_freezing_inode()) return true;
+ if (parent && parent->dir->is_freezing()) return true;
return false;
}
void CInode::add_waiter(int tag, Context *c)
{
+ dout(10) << "add_waiter tag " << tag
+ << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH)
+ << " !frozen " << !is_frozen_inode()
+ << " !freezing " << !is_freezing_inode()
+ << dendl;
// wait on the directory?
- if (tag & (WAIT_AUTHPINNABLE|WAIT_SINGLEAUTH)) {
+ // make sure its not the inode that is explicitly ambiguous|freezing|frozen
+ if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) ||
+ ((tag & WAIT_UNFREEZE) && !is_frozen_inode() && !is_freezing_inode())) {
parent->dir->add_waiter(tag, c);
return;
}
MDSCacheObject::add_waiter(tag, c);
}
+// Try to freeze this inode.  'auth_pin_allowance' is the number of
+// auth_pins expected to remain held across the freeze (must be > 0,
+// else parent nested_auth_pins accounting would need adjusting).  If
+// extra pins are still held, record the allowance, mark FREEZING and
+// return false — auth_unpin() completes the transition to FROZEN when
+// pins drop to the allowance.  Otherwise mark FROZEN immediately and
+// return true.
+bool CInode::freeze_inode(int auth_pin_allowance)
+{
+  assert(auth_pin_allowance > 0);  // otherwise we need to adjust parent's nested_auth_pins
+  assert(auth_pins >= auth_pin_allowance);
+  if (auth_pins > auth_pin_allowance) {
+    dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl;
+    auth_pin_freeze_allowance = auth_pin_allowance;
+    get(PIN_FREEZING);
+    state_set(STATE_FREEZING);
+    return false;
+  }
+
+  dout(10) << "freeze_inode - frozen" << dendl;
+  assert(auth_pins == auth_pin_allowance);
+  get(PIN_FROZEN);
+  state_set(STATE_FROZEN);
+  return true;
+}
+
+// Undo freeze_inode(): clear whichever of FREEZING/FROZEN is set
+// (exactly one must be, else assert) with its matching pin, and
+// collect WAIT_UNFREEZE waiters into 'finished' for the caller to run.
+void CInode::unfreeze_inode(list<Context*>& finished) 
+{
+  dout(10) << "unfreeze_inode" << dendl;
+  if (state_test(STATE_FREEZING)) {
+    state_clear(STATE_FREEZING);
+    put(PIN_FREEZING);
+  } else if (state_test(STATE_FROZEN)) {
+    state_clear(STATE_FROZEN);
+    put(PIN_FROZEN);
+  } else 
+    assert(0);
+  take_waiting(WAIT_UNFREEZE, finished);
+}
+
// auth_pins
bool CInode::can_auth_pin() {
+ if (is_freezing_inode() || is_frozen_inode()) return false;
if (parent)
return parent->can_auth_pin();
return true;
get(PIN_AUTHPIN);
auth_pins++;
- dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl;
+ dout(10) << "auth_pin on " << *this
+ << " now " << auth_pins << "+" << nested_auth_pins
+ << dendl;
if (parent)
parent->adjust_nested_auth_pins( 1 );
if (auth_pins == 0)
put(PIN_AUTHPIN);
- dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl;
+ dout(10) << "auth_unpin on " << *this
+ << " now " << auth_pins << "+" << nested_auth_pins
+ << dendl;
assert(auth_pins >= 0);
-
+
if (parent)
parent->adjust_nested_auth_pins( -1 );
+
+ if (is_freezing_inode() &&
+ auth_pins == auth_pin_freeze_allowance) {
+ dout(10) << "auth_unpin freezing!" << dendl;
+ get(PIN_FROZEN);
+ put(PIN_FREEZING);
+ state_clear(STATE_FREEZING);
+ state_set(STATE_FROZEN);
+ finish_waiting(WAIT_FROZEN);
+ }
}
void CInode::adjust_nested_auth_pins(int a)
{
if (!parent) return;
nested_auth_pins += a;
+
+ dout(15) << "adjust_nested_auth_pins by " << a
+ << " now " << auth_pins << "+" << nested_auth_pins
+ << dendl;
+ assert(nested_auth_pins >= 0);
+
parent->adjust_nested_auth_pins(a);
}
+
+// IMPORT/EXPORT
+
+// Serialize this inode's exportable state for migration: inode_t,
+// symlink target, dirfragtree, dirty flag, popularity, replica map,
+// client caps (via export_client_caps), and the five lock states.
+// Takes PIN_TEMPEXPORTING, released by finish_export()/abort_export().
+void CInode::encode_export(bufferlist& bl)
+{
+  ::_encode_simple(inode, bl);
+  ::_encode_simple(symlink, bl);
+  dirfragtree._encode(bl);
+
+  bool dirty = is_dirty();
+  ::_encode_simple(dirty, bl);
+
+  ::_encode_simple(pop, bl);
+
+  ::_encode_simple(replica_map, bl);
+
+  map<int,Capability::Export> cap_map;
+  export_client_caps(cap_map);
+  ::_encode_simple(cap_map, bl);
+
+  authlock._encode(bl);
+  linklock._encode(bl);
+  dirfragtreelock._encode(bl);
+  filelock._encode(bl);
+  dirlock._encode(bl);
+
+  get(PIN_TEMPEXPORTING);
+}
+
+// Export succeeded: zero local popularity, clear any stale dirlock
+// updated flag, and drop the temporary export pin from encode_export().
+void CInode::finish_export(utime_t now)
+{
+  pop.zero(now);
+
+  // just in case!
+  dirlock.clear_updated();
+
+  put(PIN_TEMPEXPORTING);
+}
+
+// Counterpart of encode_export(): rebuild inode state from an import.
+// If our local mtime is newer than the imported one we keep ours (the
+// scatterlock must already be flagged updated in that case).  A dirty
+// exporter marks us dirty into LogSegment 'ls'; imported client caps
+// are merged, with newly-seen clients reported via 'new_client_caps'.
+void CInode::decode_import(bufferlist::iterator& p,
+			   set<int>& new_client_caps,
+			   LogSegment *ls)
+{
+  utime_t old_mtime = inode.mtime;
+  ::_decode_simple(inode, p);
+  if (old_mtime > inode.mtime) {
+    assert(dirlock.is_updated());
+    inode.mtime = old_mtime;   // preserve our mtime, if it is larger
+  }
+
+  ::_decode_simple(symlink, p);
+  dirfragtree._decode(p);
+
+  bool dirty;
+  ::_decode_simple(dirty, p);
+  if (dirty) 
+    _mark_dirty(ls);
+
+  ::_decode_simple(pop, p);
+
+  ::_decode_simple(replica_map, p);
+  if (!replica_map.empty()) get(PIN_REPLICATED);
+
+  map<int,Capability::Export> cap_map;
+  ::_decode_simple(cap_map, p);
+  merge_client_caps(cap_map, new_client_caps);
+
+  authlock._decode(p);
+  linklock._decode(p);
+  dirfragtreelock._decode(p);
+  filelock._decode(p);
+  dirlock._decode(p);
+}
class CInode;
class CInodeDiscover;
class MDCache;
-
+class LogSegment;
ostream& operator<<(ostream& out, CInode& in);
static const int PIN_BATCHOPENJOURNAL = 9;
static const int PIN_SCATTERED = 10;
static const int PIN_STICKYDIRS = 11;
+ static const int PIN_PURGING = -12;
+ static const int PIN_FREEZING = 13;
+ static const int PIN_FROZEN = 14;
const char *pin_name(int p) {
switch (p) {
case PIN_BATCHOPENJOURNAL: return "batchopenjournal";
case PIN_SCATTERED: return "scattered";
case PIN_STICKYDIRS: return "stickydirs";
+ case PIN_FREEZING: return "freezing";
+ case PIN_FROZEN: return "frozen";
default: return generic_pin_name(p);
}
}
static const int STATE_UNANCHORING = (1<<4);
static const int STATE_OPENINGDIR = (1<<5);
static const int STATE_REJOINUNDEF = (1<<6); // inode contents undefined.
+ static const int STATE_FREEZING = (1<<7);
+ static const int STATE_FROZEN = (1<<8);
+ static const int STATE_AMBIGUOUSAUTH = (1<<9);
// -- waiters --
//static const int WAIT_SLAVEAGREE = (1<<0);
static const int WAIT_ANCHORED = (1<<2);
static const int WAIT_UNANCHORED = (1<<3);
static const int WAIT_CAPS = (1<<4);
+ static const int WAIT_FROZEN = (1<<5);
static const int WAIT_AUTHLOCK_OFFSET = 5;
static const int WAIT_LINKLOCK_OFFSET = 5 + SimpleLock::WAIT_BITS;
}
inode_t *project_inode();
- void pop_and_dirty_projected_inode();
+ void pop_and_dirty_projected_inode(LogSegment *ls);
// -- cache infrastructure --
private:
utime_t replica_caps_wanted_keep_until;
- private:
+ // LogSegment xlists i (may) belong to
+ xlist<CInode*>::item xlist_dirty;
+public:
+ xlist<CInode*>::item xlist_open_file;
+ xlist<CInode*>::item xlist_dirty_inode_mtime;
+ xlist<CInode*>::item xlist_purging_inode;
+
+private:
// auth pin
int auth_pins;
int nested_auth_pins;
+public:
+ int auth_pin_freeze_allowance;
public:
inode_load_vec_t pop;
stickydir_ref(0),
parent(0), force_auth(CDIR_AUTH_DEFAULT),
replica_caps_wanted(0),
+ xlist_dirty(this), xlist_open_file(this),
+ xlist_dirty_inode_mtime(this), xlist_purging_inode(this),
auth_pins(0), nested_auth_pins(0),
versionlock(this, LOCK_OTYPE_IVERSION, WAIT_VERSIONLOCK_OFFSET),
authlock(this, LOCK_OTYPE_IAUTH, WAIT_AUTHLOCK_OFFSET),
bool is_root() { return inode.ino == MDS_INO_ROOT; }
bool is_stray() { return MDS_INO_IS_STRAY(inode.ino); }
+ bool is_base() { return inode.ino < MDS_INO_BASE; }
+
+ // note: this overloads MDSCacheObject
+ bool is_ambiguous_auth() {
+ return state_test(STATE_AMBIGUOUSAUTH) ||
+ MDSCacheObject::is_ambiguous_auth();
+ }
inodeno_t ino() const { return inode.ino; }
version_t get_version() { return inode.version; }
version_t pre_dirty();
- void _mark_dirty();
- void mark_dirty(version_t projected_dirv);
+ void _mark_dirty(LogSegment *ls);
+ void mark_dirty(version_t projected_dirv, LogSegment *ls);
void mark_clean();
void add_waiter(int tag, Context *c);
+ // -- import/export --
+ void encode_export(bufferlist& bl);
+ void finish_export(utime_t now);
+ void abort_export() {
+ put(PIN_TEMPEXPORTING);
+ }
+ void decode_import(bufferlist::iterator& p,
+ set<int>& new_client_caps,
+ LogSegment *ls);
+
+
// -- locks --
public:
LocalLock versionlock;
FileLock filelock;
ScatterLock dirlock;
+
SimpleLock* get_lock(int type) {
switch (type) {
case LOCK_OTYPE_IFILE: return &filelock;
void encode_lock_state(int type, bufferlist& bl);
void decode_lock_state(int type, bufferlist& bl);
+ void clear_dirty_scattered(int type);
// -- caps -- (new)
// client caps
client_caps = cl;
}
*/
- void take_client_caps(map<int,Capability::Export>& cl) {
+ void clear_client_caps() {
if (!client_caps.empty())
put(PIN_CAPS);
+ client_caps.clear();
+ }
+ void export_client_caps(map<int,Capability::Export>& cl) {
for (map<int,Capability>::iterator it = client_caps.begin();
it != client_caps.end();
it++) {
cl[it->first] = it->second.make_export();
}
- client_caps.clear();
}
void merge_client_caps(map<int,Capability::Export>& cl, set<int>& new_client_caps) {
if (client_caps.empty() && !cl.empty())
// -- freeze --
+ bool is_freezing_inode() { return state_test(STATE_FREEZING); }
+ bool is_frozen_inode() { return state_test(STATE_FROZEN); }
bool is_frozen();
bool is_frozen_dir();
bool is_freezing();
+ bool freeze_inode(int auth_pin_allowance=0);
+ void unfreeze_inode(list<Context*>& finished);
+
// -- reference counting --
void bad_put(int by) {
};
-// export
-
-class CInodeExport {
-
- struct st_ {
- inode_t inode;
-
- inode_load_vec_t pop;
-
- bool is_dirty; // dirty inode?
-
- int num_caps;
- } st;
-
- string symlink;
- fragtree_t dirfragtree;
-
- map<int,int> replicas;
- map<int,Capability::Export> cap_map;
-
- bufferlist locks;
-
-public:
- CInodeExport() {}
- CInodeExport(CInode *in, utime_t now) {
- st.inode = in->inode;
- symlink = in->symlink;
- dirfragtree = in->dirfragtree;
-
- st.is_dirty = in->is_dirty();
- replicas = in->replica_map;
-
- in->authlock._encode(locks);
- in->linklock._encode(locks);
- in->dirfragtreelock._encode(locks);
- in->filelock._encode(locks);
- in->dirlock._encode(locks);
-
- st.pop = in->pop;
- in->pop.zero(now);
-
- // steal WRITER caps from inode
- in->take_client_caps(cap_map);
- //remaining_issued = in->get_caps_issued();
- }
-
- inodeno_t get_ino() { return st.inode.ino; }
-
- void update_inode(CInode *in, set<int>& new_client_caps) {
- // treat scatterlocked mtime special, since replica may have newer info
- if (in->dirlock.get_state() == LOCK_SCATTER ||
- in->dirlock.get_state() == LOCK_GLOCKC ||
- in->dirlock.get_state() == LOCK_GTEMPSYNCC)
- st.inode.mtime = MAX(in->inode.mtime, st.inode.mtime);
-
- in->inode = st.inode;
- in->symlink = symlink;
- in->dirfragtree = dirfragtree;
-
- in->pop = st.pop;
-
- if (st.is_dirty)
- in->_mark_dirty();
-
- in->replica_map = replicas;
- if (!replicas.empty())
- in->get(CInode::PIN_REPLICATED);
-
- int off = 0;
- in->authlock._decode(locks, off);
- in->linklock._decode(locks, off);
- in->dirfragtreelock._decode(locks, off);
- in->filelock._decode(locks, off);
- in->dirlock._decode(locks, off);
-
- // caps
- in->merge_client_caps(cap_map, new_client_caps);
- }
-
- void _encode(bufferlist& bl) {
- st.num_caps = cap_map.size();
-
- ::_encode(st, bl);
- ::_encode(symlink, bl);
- dirfragtree._encode(bl);
- ::_encode(replicas, bl);
- ::_encode(locks, bl);
- ::_encode(cap_map, bl);
- }
-
- int _decode(bufferlist& bl, int off = 0) {
- ::_decode(st, bl, off);
- ::_decode(symlink, bl, off);
- dirfragtree._decode(bl, off);
- ::_decode(replicas, bl, off);
- ::_decode(locks, bl, off);
- ::_decode(cap_map, bl, off);
-
- return off;
- }
-};
-
-
#endif
void ClientMap::save(Context *onsave, version_t needv)
{
dout(10) << "save needv " << needv << ", v " << version << dendl;
+
+ if (needv && committing >= needv) {
+ assert(committing > committed);
+ commit_waiters[committing].push_back(onsave);
+ return;
+ }
+
commit_waiters[version].push_back(onsave);
- if (needv && committing >= needv) return;
-
bufferlist bl;
init_inode();
mds->filer->write(inode,
0, bl.length(), bl,
0,
- 0, new C_CM_Save(this, version));
+ 0, new C_CM_Save(this, version));
}
void ClientMap::_save_finish(version_t v)
map<version_t, list<Context*> > commit_waiters;
public:
- ClientMap(MDS *m) : mds(m),
- version(0), projected(0), committing(0), committed(0) {}
-
version_t get_version() { return version; }
version_t get_projected() { return projected; }
version_t get_committing() { return committing; }
// client id -> tid -> result code
map<int, set<tid_t> > completed_requests; // completed client requests
map<int, map<tid_t, Context*> > waiting_for_trim;
+ version_t requestmapv;
public:
void add_completed_request(metareqid_t ri) {
completed_requests[ri.client].insert(ri.tid);
+ requestmapv++;
}
void trim_completed_requests(int client,
tid_t mintid) { // zero means trim all!
}
+
+ ClientMap(MDS *m) : mds(m),
+ version(0), projected(0), committing(0), committed(0),
+ requestmapv(0) {}
+
+
// -- encoding --
void encode(bufferlist& bl) {
bl.append((char*)&version, sizeof(version));
projected = committing = committed = version;
}
+
// -- loading, saving --
inode_t inode;
list<Context*> waiting_for_load;
version_t get_version() { return version; }
version_t get_committed_version() { return committed_version; }
+ version_t get_committing_version() { return committing_version; }
// load/save from disk (hack)
bool is_undef() { return state == STATE_UNDEF; }
if (!object->can_auth_pin()) {
// wait
dout(10) << " can't auth_pin (freezing?), waiting to authpin " << *object << dendl;
- object->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr));
+ object->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
mds->locker->drop_locks(mdr);
mdr->drop_local_auth_pins();
return false;
mds->send_message_mds(req, p->first, MDS_PORT_SERVER);
// put in waiting list
- assert(mdr->waiting_on_slave.count(p->first) == 0);
- mdr->waiting_on_slave.insert(p->first);
+ assert(mdr->more()->waiting_on_slave.count(p->first) == 0);
+ mdr->more()->waiting_on_slave.insert(p->first);
}
return false;
}
public:
C_MDL_RequestInodeFileCaps(Locker *l, CInode *i) : locker(l), in(i) {}
void finish(int r) {
+ in->put(CInode::PIN_PTRWAITER);
if (!in->is_auth())
locker->request_inode_file_caps(in);
}
// wait for single auth
if (in->is_ambiguous_auth()) {
+ in->get(CInode::PIN_PTRWAITER);
in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH,
new C_MDL_RequestInodeFileCaps(this, in));
return;
if (cap->is_null()) {
dout(7) << " cap for client" << client << " is now null, removing from " << *in << dendl;
in->remove_client_cap(client);
+ if (!in->is_any_caps())
+ in->xlist_open_file.remove_myself(); // unpin logsegment
if (!in->is_auth())
request_inode_file_caps(in);
if (!lock->get_parent()->can_auth_pin()) {
dout(7) << "try_simple_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl;
//if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
- lock->get_parent()->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_Locker_SimpleEval(this, lock));
+ lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_SimpleEval(this, lock));
return;
}
// wait!
dout(7) << "simple_rdlock_try waiting on " << *lock << " on " << *lock->get_parent() << dendl;
- lock->add_waiter(SimpleLock::WAIT_RD, con);
+ if (con) lock->add_waiter(SimpleLock::WAIT_RD, con);
return false;
}
// send lock request
int auth = lock->get_parent()->authority().first;
- mdr->slaves.insert(auth);
+ mdr->more()->slaves.insert(auth);
MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCK);
r->set_lock_type(lock->get_type());
lock->get_parent()->set_object_info(r->get_object_info());
public:
C_Locker_ScatterEval(Locker *l, ScatterLock *lk) : locker(l), lock(lk) {}
void finish(int r) {
+ lock->get_parent()->put(CInode::PIN_PTRWAITER);
locker->try_scatter_eval(lock);
}
};
// unstable and ambiguous auth?
if (!lock->is_stable() &&
lock->get_parent()->is_ambiguous_auth()) {
- dout(7) << "scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl;
+ dout(7) << "try_scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl;
//if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
+ lock->get_parent()->get(CInode::PIN_PTRWAITER);
lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_ScatterEval(this, lock));
return;
}
if (!lock->get_parent()->can_auth_pin()) {
dout(7) << "try_scatter_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl;
//if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
- lock->get_parent()->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_Locker_ScatterEval(this, lock));
+ lock->get_parent()->get(CInode::PIN_PTRWAITER);
+ lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_ScatterEval(this, lock));
return;
}
auth, MDS_PORT_LOCKER);
}
lock->set_state(LOCK_LOCK);
- //lock->get_parent()->put(CInode::PIN_SCATTERED);
}
} else {
dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock
<< " on " << *lock->get_parent() << dendl;
lock->set_state(LOCK_LOCK);
- //lock->get_parent()->put(CInode::PIN_SCATTERED);
lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE);
lock->get_parent()->auth_unpin();
}
send_lock_message(lock, LOCK_AC_SCATTER, data);
}
lock->set_state(LOCK_SCATTER);
- //lock->get_parent()->get(CInode::PIN_SCATTERED);
lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
lock->get_parent()->auth_unpin();
}
void Locker::scatter_writebehind(ScatterLock *lock)
{
CInode *in = (CInode*)lock->get_parent();
- dout(10) << "scatter_writebehind on " << *lock << " on " << *in << dendl;
+ dout(10) << "scatter_writebehind " << in->inode.mtime << " on " << *lock << " on " << *in << dendl;
// journal write-behind.
inode_t *pi = in->project_inode();
+ pi->mtime = in->inode.mtime; // make sure an intermediate version isn't goofing us up
pi->version = in->pre_dirty();
EUpdate *le = new EUpdate(mds->mdlog, "scatter writebehind");
le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi);
mds->mdlog->submit_entry(le);
- mds->mdlog->wait_for_sync(new C_Locker_ScatterWB(this, lock));
+ mds->mdlog->wait_for_sync(new C_Locker_ScatterWB(this, lock, mds->mdlog->get_current_segment()));
}
-void Locker::scatter_writebehind_finish(ScatterLock *lock)
+void Locker::scatter_writebehind_finish(ScatterLock *lock, LogSegment *ls)
{
CInode *in = (CInode*)lock->get_parent();
dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << dendl;
- in->pop_and_dirty_projected_inode();
+ in->pop_and_dirty_projected_inode(ls);
lock->clear_updated();
scatter_eval_gather(lock);
}
assert(lock->get_parent()->is_auth());
assert(lock->is_stable());
- if (((CInode*)lock->get_parent())->has_subtree_root_dirfrag()) {
+ CInode *in = (CInode*)lock->get_parent();
+ if (in->has_subtree_root_dirfrag() && !in->is_base()) {
// i _should_ be scattered.
if (!lock->is_rdlocked() &&
- !lock->is_xlocked()) {
+ !lock->is_xlocked() &&
+ lock->get_state() != LOCK_SCATTER) {
dout(10) << "scatter_eval no rdlocks|xlocks, am subtree root inode, scattering" << dendl;
scatter_scatter(lock);
+ autoscattered.push_back(&lock->xlistitem_autoscattered);
}
} else {
// i _should_ be sync.
+ lock->xlistitem_autoscattered.remove_myself();
if (!lock->is_wrlocked() &&
- !lock->is_xlocked()) {
+ !lock->is_xlocked() &&
+ lock->get_state() != LOCK_SYNC) {
dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << dendl;
scatter_sync(lock);
}
}
}
+// Record that this lock was scattered automatically (by policy, rather
+// than by an explicit client/replica request) by queueing it on the
+// autoscattered list; scatter_unscatter_autoscattered() periodically
+// walks that list and folds stale entries back to a locked state.
+void Locker::note_autoscattered(ScatterLock *lock)
+{
+ dout(10) << "note_autoscattered " << *lock << " on " << *lock->get_parent() << dendl;
+ autoscattered.push_back(&lock->xlistitem_autoscattered);
+}
+
+
+/*
+ * this is called by LogSegment::try_to_trim() when trying to
+ * flush dirty scattered data (e.g. inode->dirlock mtime) back
+ * to the auth node.
+ */
+void Locker::scatter_try_unscatter(ScatterLock *lock, Context *c)
+{
+ dout(10) << "scatter_try_unscatter " << *lock << " on " << *lock->get_parent() << dendl;
+ // replica-side only: we must know (unambiguously) who the auth is
+ // before we can ask it to unscatter.
+ assert(!lock->get_parent()->is_auth());
+ assert(!lock->get_parent()->is_ambiguous_auth());
+
+ // request unscatter?
+ // only ping the auth if it is active; if it is recovering, lock state
+ // is presumably reconciled during rejoin/recovery instead (TODO confirm).
+ int auth = lock->get_parent()->authority().first;
+ if (lock->get_state() == LOCK_SCATTER &&
+ mds->mdsmap->get_state(auth) >= MDSMap::STATE_ACTIVE)
+ mds->send_message_mds(new MLock(lock, LOCK_AC_REQUNSCATTER, mds->get_nodeid()),
+ auth, MDS_PORT_LOCKER);
+
+ // wait...
+ // register the callback unconditionally: it fires when the lock next
+ // reaches a stable state, whether or not we sent a request above.
+ lock->add_waiter(SimpleLock::WAIT_STABLE, c);
+}
+
void Locker::scatter_sync(ScatterLock *lock)
{
lock->init_gather();
} else {
if (!lock->is_wrlocked()) {
- //lock->get_parent()->put(CInode::PIN_SCATTERED);
break; // do it now, we're fine
}
}
assert(lock->get_parent()->is_auth());
assert(lock->is_stable());
+ lock->set_last_scatter(g_clock.now());
+
switch (lock->get_state()) {
case LOCK_SYNC:
if (!lock->is_rdlocked() &&
send_lock_message(lock, LOCK_AC_SCATTER, data);
}
lock->set_state(LOCK_SCATTER);
- //lock->get_parent()->get(CInode::PIN_SCATTERED);
lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
}
case LOCK_SCATTER:
if (!lock->is_wrlocked() &&
!lock->get_parent()->is_replicated()) {
- //lock->get_parent()->put(CInode::PIN_SCATTERED);
break; // do it.
}
case LOCK_SCATTER:
if (!lock->is_wrlocked() &&
!lock->get_parent()->is_replicated()) {
- //lock->get_parent()->put(CInode::PIN_SCATTERED);
break; // do it.
}
-
-
void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m)
{
int from = m->get_asker();
// -- replica --
case LOCK_AC_SYNC:
assert(lock->get_state() == LOCK_LOCK);
-
lock->set_state(LOCK_SYNC);
lock->decode_locked_state(m->get_data());
lock->clear_updated();
case LOCK_AC_LOCK:
assert(lock->get_state() == LOCK_SCATTER ||
lock->get_state() == LOCK_SYNC);
-
+
// wait for wrlocks to close?
if (lock->is_wrlocked()) {
assert(lock->get_state() == LOCK_SCATTER);
<< " on " << *lock->get_parent() << dendl;
lock->set_state(LOCK_GLOCKS);
} else {
- //if (lock->get_state() == LOCK_SCATTER)
- //lock->get_parent()->put(CInode::PIN_SCATTERED);
-
+ dout(7) << "handle_scatter_lock has no rd|wrlocks, sending lockack for " << *lock
+ << " on " << *lock->get_parent() << dendl;
+
// encode and reply
bufferlist data;
lock->encode_locked_state(data);
lock->decode_locked_state(m->get_data());
lock->clear_updated();
lock->set_state(LOCK_SCATTER);
- //lock->get_parent()->get(CInode::PIN_SCATTERED);
lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
break;
case LOCK_AC_REQSCATTER:
if (lock->is_stable()) {
- dout(7) << "handle_scatter_lock got scatter request on " << *lock << " on " << *lock->get_parent()
- << dendl;
+ /* NOTE: we can do this _even_ if !can_auth_pin (i.e. freezing)
+ * because the replica should be holding an auth_pin if they're
+ * doing this (and thus, we are freezing, not frozen, and indefinite
+ * starvation isn't an issue).
+ */
+ dout(7) << "handle_scatter_lock got scatter request on " << *lock
+ << " on " << *lock->get_parent() << dendl;
scatter_scatter(lock);
} else {
- dout(7) << "handle_scatter_lock ignoring scatter request on " << *lock << " on " << *lock->get_parent()
- << dendl;
+ dout(7) << "handle_scatter_lock ignoring scatter request on " << *lock
+ << " on " << *lock->get_parent() << dendl;
}
break;
+ case LOCK_AC_REQUNSCATTER:
+ if (!lock->is_stable()) {
+ dout(7) << "handle_scatter_lock ignoring now-unnecessary unscatter request on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ } else if (lock->get_parent()->can_auth_pin()) {
+ dout(7) << "handle_scatter_lock got unscatter request on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ scatter_lock(lock);
+ } else {
+ dout(7) << "handle_scatter_lock DROPPING unscatter request on " << *lock
+ << " on " << *lock->get_parent() << dendl;
+ /* FIXME: if we can't auth_pin here, this request is effectively lost... */
+ }
}
delete m;
+// Walk the autoscattered list and fold back (lock) scatterlocks that have
+// been scattered for a while, so their dirty replica data (e.g. mtime) can
+// be flushed back to the auth.  Entries still "fresh" stop the scan early;
+// frozen/freezing inodes are requeued for a later pass.
+void Locker::scatter_unscatter_autoscattered()
+{
+ /*
+ * periodically unscatter autoscattered locks
+ */
+
+ dout(10) << "scatter_unscatter_autoscattered" << dendl;
+
+ utime_t now = g_clock.now();
+ // bound the scan to the list's initial size so that requeued (frozen)
+ // entries cannot make this loop spin forever in a single call.
+ int n = autoscattered.size();
+ while (!autoscattered.empty()) {
+ ScatterLock *lock = autoscattered.front();
+
+ // stop?
+ // list is in scatter order, so once we hit an entry scattered less
+ // than 10s ago everything behind it is fresher; stop the whole scan.
+ if (lock->get_state() == LOCK_SCATTER &&
+ now - lock->get_last_scatter() < 10.0)
+ break;
+
+ autoscattered.pop_front();
+
+ // only worth locking if it is still scattered and actually replicated.
+ if (lock->get_state() == LOCK_SCATTER &&
+ lock->get_parent()->is_replicated()) {
+ if (((CInode*)lock->get_parent())->is_frozen() ||
+ ((CInode*)lock->get_parent())->is_freezing()) {
+ // hrm.. requeue.
+ dout(10) << "last_scatter " << lock->get_last_scatter()
+ << ", now " << now << ", but frozen|freezing, requeueing" << dendl;
+ autoscattered.push_back(&lock->xlistitem_autoscattered);
+ } else {
+ dout(10) << "last_scatter " << lock->get_last_scatter()
+ << ", now " << now << ", locking" << dendl;
+ scatter_lock(lock);
+ }
+ }
+ if (--n == 0) break;
+ }
+}
+
// ==========================================================================
public:
C_Locker_FileEval(Locker *l, FileLock *lk) : locker(l), lock(lk) {}
void finish(int r) {
+ lock->get_parent()->put(CInode::PIN_PTRWAITER);
locker->try_file_eval(lock);
}
};
in->is_ambiguous_auth()) {
dout(7) << "try_file_eval not stable and ambiguous auth, waiting on " << *in << dendl;
//if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
+ in->get(CInode::PIN_PTRWAITER);
in->add_waiter(CInode::WAIT_SINGLEAUTH, new C_Locker_FileEval(this, lock));
return;
}
if (!lock->get_parent()->can_auth_pin()) {
dout(7) << "try_file_eval can't auth_pin, waiting on " << *in << dendl;
//if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
- in->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_Locker_FileEval(this, lock));
+ in->get(CInode::PIN_PTRWAITER);
+ in->add_waiter(CInode::WAIT_UNFREEZE, new C_Locker_FileEval(this, lock));
return;
}
class MClientRequest;
-
class Anchor;
class Capability;
+class LogSegment;
class SimpleLock;
class FileLock;
void dispatch(Message *m);
void handle_lock(MLock *m);
+
protected:
void send_lock_message(SimpleLock *lock, int msg);
void send_lock_message(SimpleLock *lock, int msg, const bufferlist &data);
void dentry_anon_rdlock_trace_finish(vector<CDentry*>& trace);
// scatter
+protected:
+ xlist<ScatterLock*> autoscattered;
+
public:
void try_scatter_eval(ScatterLock *lock);
void scatter_eval(ScatterLock *lock); // public for MDCache::adjust_subtree_auth()
void scatter_eval_gather(ScatterLock *lock);
+ void scatter_unscatter_autoscattered();
+ void scatter_try_unscatter(ScatterLock *lock, Context *c);
+ void note_autoscattered(ScatterLock *lock);
+
+ void scatter_lock(ScatterLock *lock); // called by LogSegment::try_to_expire
+
protected:
void handle_scatter_lock(ScatterLock *lock, MLock *m);
+ void _scatter_replica_lock(ScatterLock *lock, int auth);
void scatter_sync(ScatterLock *lock);
- void scatter_lock(ScatterLock *lock);
void scatter_scatter(ScatterLock *lock);
void scatter_tempsync(ScatterLock *lock);
bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr);
class C_Locker_ScatterWB : public Context {
Locker *locker;
ScatterLock *lock;
+ LogSegment *ls;
public:
- C_Locker_ScatterWB(Locker *l, ScatterLock *sl) : locker(l), lock(sl) {}
+ C_Locker_ScatterWB(Locker *l, ScatterLock *sl, LogSegment *s) : locker(l), lock(sl), ls(s) {}
void finish(int r) {
- locker->scatter_writebehind_finish(lock);
+ locker->scatter_writebehind_finish(lock, ls);
}
};
- void scatter_writebehind_finish(ScatterLock *lock);
+ void scatter_writebehind_finish(ScatterLock *lock, LogSegment *ls);
// local
protected:
#include "include/Context.h"
class MDS;
+class LogSegment;
// generic log event
class LogEvent {
friend class MDLog;
public:
+ LogSegment *_segment;
+
LogEvent(int t) :
- _type(t), _start_off(0), _end_off(0) { }
+ _type(t), _start_off(0), _end_off(0), _segment(0) { }
virtual ~LogEvent() { }
int get_type() { return _type; }
}
/*** live journal ***/
-
- /* obsolete() - is this entry committed to primary store, such that
- * we can expire it from the journal?
- */
- virtual bool has_expired(MDS *m) {
- return true;
- }
-
- /* expire() - prod MDS into committing the relevant state so that this
- * entry can be expired from the jorunal.
+ /* update_segment() - adjust any state we need to in the LogSegment
*/
- virtual void expire(MDS *m, Context *c) {
- assert(0);
- c->finish(0);
- delete c;
- }
+ virtual void update_segment() { }
-
/*** recovery ***/
/* replay() - replay given event. this is idempotent.
*/
virtual void replay(MDS *m) { assert(0); }
+
};
inline ostream& operator<<(ostream& out, LogEvent& le) {
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __LOGSEGMENT_H
+#define __LOGSEGMENT_H
+
+#include "include/xlist.h"
+#include "include/interval_set.h"
+#include "include/Context.h"
+
+#include <ext/hash_set>
+using __gnu_cxx::hash_set;
+
+class CDir;
+class CInode;
+class CDentry;
+class MDS;
+class MDSlaveUpdate;
+
+// A LogSegment tracks all the in-memory state pinned by one segment of the
+// MDS journal: dirty cache items, open files, pending table commits, etc.
+// Before the segment can be expired (trimmed from the journal), that state
+// must be flushed/committed elsewhere; see try_to_expire().
+class LogSegment {
+ public:
+ off_t offset; // start position of this segment in the journal (TODO confirm)
+ int num_events; // number of log events accounted to this segment
+
+ // dirty items
+ // cache objects whose dirty state was journaled in this segment; they
+ // pin the segment until written back.
+ xlist<CDir*> dirty_dirfrags;
+ xlist<CInode*> dirty_inodes;
+ xlist<CDentry*> dirty_dentries;
+
+ // inodes with client caps / replica-dirty scattered mtimes pinning us
+ // (cf. CInode::xlist_open_file, xlist_dirty_inode_mtime).
+ xlist<CInode*> open_files;
+ xlist<CInode*> dirty_inode_mtimes;
+
+ // uncommitted slave updates journaled in this segment.
+ xlist<MDSlaveUpdate*> slave_updates;
+
+ //xlist<CInode*> purging_inodes;
+ // inode -> extent ranges still being purged (TODO confirm range semantics).
+ map<CInode*, map<off_t,off_t> > purging_inodes;
+
+ // committed anchor transactions
+ hash_set<version_t> pending_commit_atids;
+
+ // client request ids
+ // highest client request tid per client seen in this segment; presumably
+ // must be persisted in the clientmap before expiry (TODO confirm).
+ map<int, tid_t> last_client_tids;
+
+ // table version
+ // versions of the allocation table / client map / anchor table as of this
+ // segment; 0 means the segment does not pin that table.
+ version_t allocv;
+ version_t clientmapv;
+ version_t anchortablev;
+
+ // try to expire
+ // kick off the flushes/commits needed to retire this segment; returns a
+ // gather context over the outstanding waits (see LogSegment::try_to_expire
+ // in the .cc, not visible here).
+ C_Gather *try_to_expire(MDS *mds);
+
+ // cons
+ LogSegment(off_t off) : offset(off), num_events(0),
+ allocv(0), clientmapv(0), anchortablev(0)
+ { }
+};
+
+#endif
if (mds->get_nodeid() == 0)
beat_epoch++;
- // load
+ // my load
mds_load_t load = get_load();
mds_load[ mds->get_nodeid() ] = load;
//dout(0) << " load is " << load << " have " << mds_load.size() << dendl;
- unsigned cluster_size = mds->get_mds_map()->get_num_mds();
+ unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
if (mds_load.size() == cluster_size) {
// let's go!
//export_empties(); // no!
if (!dir->is_auth()) continue;
dout(0) << "do_fragmenting splitting " << *dir << dendl;
- mds->mdcache->split_dir(dir, 3);
+ mds->mdcache->split_dir(dir, 4);
}
split_queue.clear();
}
dout(5) << " do_rebalance: cluster loads are" << dendl;
+ mds->mdcache->migrator->clear_export_queue();
+
// rescale! turn my mds_load back into meta_load units
double load_fac = 1.0;
if (mds_load[whoami].mds_load() > 0) {
// hit inode
in->pop.get(type).hit(now);
- if (in->get_parent_dir())
- hit_dir(now, in->get_parent_dir(), type, who);
+ if (in->get_parent_dn())
+ hit_dir(now, in->get_parent_dn()->get_dir(), type, who);
}
/*
// hit me
lru.lru_set_midpoint(g_conf.mds_cache_mid);
did_shutdown_log_cap = false;
- shutdown_commits = 0;
}
MDCache::~MDCache()
// add to lru, inode map
assert(inode_map.count(in->ino()) == 0); // should be no dup inos!
inode_map[ in->ino() ] = in;
+
+ if (in->ino() < MDS_INO_BASE) {
+ base_inodes.insert(in);
+ if (in->ino() == MDS_INO_ROOT)
+ set_root(in);
+ if (in->ino() == MDS_INO_STRAY(mds->get_nodeid()))
+ stray = in;
+ }
}
void MDCache::remove_inode(CInode *o)
// remove from inode map
inode_map.erase(o->ino());
+ if (o->ino() < MDS_INO_BASE) {
+ assert(base_inodes.count(o));
+ base_inodes.erase(o);
+
+ if (o == root) root = 0;
+ if (o == stray) stray = 0;
+ }
+
// delete it
delete o;
-
- if (o == root) root = 0;
- if (o == stray) stray = 0;
}
root->force_auth = pair<int,int>(0, CDIR_AUTH_UNKNOWN);
- set_root( root );
add_inode( root );
return root;
}
} else {
// request inode from root mds
- if (waiting_for_root.empty()) {
- dout(7) << "discovering root" << dendl;
-
- filepath want;
- MDiscover *req = new MDiscover(whoami,
- MDS_INO_ROOT,
- want,
- false); // there _is_ no base dir for the root inode
- mds->send_message_mds(req, 0, MDS_PORT_CACHE);
- } else {
- dout(7) << "waiting for root" << dendl;
- }
-
- // wait
- waiting_for_root.push_back(c);
-
+ discover_base_ino(MDS_INO_ROOT, c, 0);
}
}
CInode *MDCache::create_stray_inode(int whose)
{
if (whose < 0) whose = mds->get_nodeid();
- stray = new CInode(this, whose == mds->get_nodeid());
- memset(&stray->inode, 0, sizeof(inode_t));
- stray->inode.ino = MDS_INO_STRAY(whose);
+
+ CInode *in = new CInode(this, whose == mds->get_nodeid());
+ memset(&in->inode, 0, sizeof(inode_t));
+ in->inode.ino = MDS_INO_STRAY(whose);
// make it up (FIXME)
- stray->inode.mode = 0755 | INODE_MODE_DIR;
- stray->inode.size = 0;
- stray->inode.ctime =
- stray->inode.mtime = g_clock.now();
+ in->inode.mode = 0755 | INODE_MODE_DIR;
+ in->inode.size = 0;
+ in->inode.ctime =
+ in->inode.mtime = g_clock.now();
- stray->inode.nlink = 1;
- stray->inode.layout = g_OSD_MDDirLayout;
+ in->inode.nlink = 1;
+ in->inode.layout = g_OSD_MDDirLayout;
- add_inode( stray );
+ add_inode( in );
- return stray;
+ return in;
}
void MDCache::open_local_stray()
dout(10) << "open_foreign_stray mds" << who << " " << ino << dendl;
assert(!have_inode(ino));
- // discover
- filepath want;
- MDiscover *req = new MDiscover(mds->get_nodeid(),
- ino,
- want,
- false); // there _is_ no base dir for the stray inode
- mds->send_message_mds(req, who, MDS_PORT_CACHE);
-
- // wait
- waiting_for_stray[ino].push_back(c);
+ discover_base_ino(ino, c, who);
}
CDir *root;
if (dir->ino() < MDS_INO_BASE) {
root = dir; // bootstrap hack.
- if (subtrees.count(root) == 0)
+ if (subtrees.count(root) == 0) {
subtrees[root].clear();
+ root->get(CDir::PIN_SUBTREE);
+ }
} else {
root = get_subtree_root(dir); // subtree root
}
dout(10) << " new subtree at " << *dir << dendl;
assert(subtrees.count(dir) == 0);
subtrees[dir].clear(); // create empty subtree bounds list for me.
+ dir->get(CDir::PIN_SUBTREE);
// set dir_auth
dir->set_dir_auth(auth);
eval_subtree_root(dir);
}
- // adjust export pins
- adjust_export_state(dir);
- for (set<CDir*>::iterator p = subtrees[dir].begin();
- p != subtrees[dir].end();
- ++p)
- adjust_export_state(*p);
-
show_subtrees();
}
-/*
- * any "export" point must be pinned in cache to ensure a proper
- * chain of delegation. we do this by pinning when a dir is nonauth
- * but the inode is auth.
- *
- * import points don't need to be pinned the same way simply because the
- * exporting mds is pinning the exprot (as above) thus the dir is
- * always open on the importer.
- */
-void MDCache::adjust_export_state(CDir *dir)
-{
- // be auth bit agnostic, so that we work during recovery
- // (before recalc_auth_bits)
- if (dir->authority().first != mds->get_nodeid() &&
- dir->inode->authority().first == mds->get_nodeid()) {
- // export.
- if (!dir->state_test(CDir::STATE_EXPORT)) {
- dout(10) << "adjust_export_state pinning new export " << *dir << dendl;
- dir->state_set(CDir::STATE_EXPORT);
- dir->get(CDir::PIN_EXPORT);
- }
- }
- else {
- // not export.
- if (dir->state_test(CDir::STATE_EXPORT)) {
- dout(10) << "adjust_export_state unpinning old export " << *dir << dendl;
- dir->state_clear(CDir::STATE_EXPORT);
- dir->put(CDir::PIN_EXPORT);
- }
- }
-}
-
void MDCache::try_subtree_merge(CDir *dir)
{
dout(7) << "try_subtree_merge " << *dir << dendl;
class C_MDC_SubtreeMergeWB : public Context {
MDCache *mdcache;
CInode *in;
+ LogSegment *ls;
public:
- C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i) : mdcache(mdc), in(i) {}
+ C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, LogSegment *s) : mdcache(mdc), in(i), ls(s) {}
void finish(int r) {
- mdcache->subtree_merge_writebehind_finish(in);
+ mdcache->subtree_merge_writebehind_finish(in, ls);
}
};
subtrees[parent].insert(*p);
// we are no longer a subtree or bound
+ dir->put(CDir::PIN_SUBTREE);
subtrees.erase(dir);
subtrees[parent].erase(dir);
le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi);
mds->mdlog->submit_entry(le);
- mds->mdlog->wait_for_sync(new C_MDC_SubtreeMergeWB(this, in));
+ mds->mdlog->wait_for_sync(new C_MDC_SubtreeMergeWB(this, in,
+ mds->mdlog->get_current_segment()));
}
}
show_subtrees(15);
}
-void MDCache::subtree_merge_writebehind_finish(CInode *in)
+void MDCache::subtree_merge_writebehind_finish(CInode *in, LogSegment *ls)
{
dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
- in->pop_and_dirty_projected_inode();
+ in->pop_and_dirty_projected_inode(ls);
in->auth_unpin();
}
CDir *root;
if (dir->ino() < MDS_INO_BASE) {
root = dir; // bootstrap hack.
- if (subtrees.count(root) == 0)
+ if (subtrees.count(root) == 0) {
subtrees[root].clear();
+ root->get(CDir::PIN_SUBTREE);
+ }
} else {
root = get_subtree_root(dir); // subtree root
}
dout(10) << " new subtree at " << *dir << dendl;
assert(subtrees.count(dir) == 0);
subtrees[dir].clear(); // create empty subtree bounds list for me.
+ dir->get(CDir::PIN_SUBTREE);
// set dir_auth
dir->set_dir_auth(auth);
p = n;
}
- // adjust export pins
- adjust_export_state(dir);
- for (set<CDir*>::iterator p = subtrees[dir].begin();
- p != subtrees[dir].end();
- ++p)
- adjust_export_state(*p);
-
// bound should now match.
verify_subtree_bounds(dir, bounds);
assert(subtrees.count(dir));
assert(subtrees[dir].empty());
subtrees.erase(dir);
+ dir->put(CDir::PIN_SUBTREE);
if (dir->get_parent_dir()) {
CDir *p = get_subtree_root(dir->get_parent_dir());
assert(subtrees[p].count(dir));
CDir *dir = *p;
// un-force dir to subtree root
- if (dir->dir_auth == pair<int,int>(dir->dir_auth.first, dir->dir_auth.first))
+ if (dir->dir_auth == pair<int,int>(dir->dir_auth.first, dir->dir_auth.first)) {
adjust_subtree_auth(dir, dir->dir_auth.first);
+ try_subtree_merge_at(dir);
+ }
}
show_subtrees();
// ====================================================================
// import map, recovery
-/*
- * take note of where we write import_maps in the log, as we need
- * to take care not to expire them until an updated map is safely flushed.
- */
-class C_MDS_WroteSubtreeMap : public Context {
- MDCache *mdcache;
- off_t end_off;
-public:
- C_MDS_WroteSubtreeMap(MDCache *mc, off_t eo) : mdcache(mc), end_off(eo) { }
- void finish(int r) {
- mdcache->_logged_subtree_map(end_off);
- }
-};
-
-void MDCache::log_subtree_map(Context *onsync)
+ESubtreeMap *MDCache::create_subtree_map()
{
- dout(10) << "log_subtree_map " << num_subtrees() << " subtrees, "
+ dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
<< num_subtrees_fullauth() << " fullauth"
<< dendl;
}
//le->metablob.print(cout);
-
- Context *fin = new C_MDS_WroteSubtreeMap(this, mds->mdlog->get_write_pos());
- mds->mdlog->writing_subtree_map = true;
- mds->mdlog->submit_entry(le);
- mds->mdlog->wait_for_sync(fin);
- if (onsync)
- mds->mdlog->wait_for_sync(onsync);
-}
-
-void MDCache::_logged_subtree_map(off_t off)
-{
- dout(10) << "_logged_subtree_map at " << off << dendl;
- mds->mdlog->subtree_maps.insert(off);
- mds->mdlog->writing_subtree_map = false;
-
- mds->mdlog->kick_subtree_map(); // just in case the last segment was empty.
+ return le;
}
}
// [resolving]
if (uncommitted_slave_updates.count(who)) {
- for (map<metareqid_t, EMetaBlob>::iterator p = uncommitted_slave_updates[who].begin();
+ for (map<metareqid_t, MDSlaveUpdate>::iterator p = uncommitted_slave_updates[who].begin();
p != uncommitted_slave_updates[who].end();
++p) {
dout(10) << " including uncommitted " << p->first << dendl;
// make note of recovery set
mds->mdsmap->get_recovery_mds_set(recovery_set);
recovery_set.erase(mds->get_nodeid());
- dout(1) << "my recovery peers will be " << recovery_set << dendl;
+ dout(1) << "handle_mds_failure mds" << who << " : recovery peers are " << recovery_set << dendl;
// adjust my recovery lists
wants_resolve.erase(who); // MDS will ask again
got_resolve.erase(who); // i'll get another.
+
+ rejoin_sent.erase(who); // i need to send another
rejoin_ack_gather.erase(who); // i'll need/get another.
+
+ dout(10) << " wants_resolve " << wants_resolve << dendl;
+ dout(10) << " got_resolve " << got_resolve << dendl;
+ dout(10) << " rejoin_sent " << rejoin_sent << dendl;
+ dout(10) << " rejoin_gather " << rejoin_gather << dendl;
+ dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
+
// tell the migrator too.
migrator->handle_mds_failure_or_stop(who);
- // kick any dir discovers that are waiting
- hash_map<inodeno_t,set<int> >::iterator p = dir_discovers.begin();
- while (p != dir_discovers.end()) {
- hash_map<inodeno_t,set<int> >::iterator n = p;
- n++;
-
- // waiting on this mds?
- if (p->second.count(who)) {
- CInode *in = get_inode(p->first);
- assert(in);
-
- // take waiters
- list<Context*> waiters;
- in->take_waiting(CInode::WAIT_DIR, waiters);
- mds->queue_waiters(waiters);
- dout(10) << "kicking WAIT_DIR on " << *in << dendl;
-
- // remove from mds list
- p->second.erase(who);
- if (p->second.empty())
- dir_discovers.erase(p);
- }
- p = n;
- }
-
+ // kick any discovers that are waiting
+ kick_discovers(who);
+
// clean up any requests slave to/from this node
list<MDRequest*> finish;
for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
// failed node is slave?
if (!p->second->committing) {
- if (p->second->witnessed.count(who)) {
+ if (p->second->more()->witnessed.count(who)) {
dout(10) << " master request " << *p->second << " no longer witnessed by slave mds" << who
<< dendl;
// discard this peer's prepare (if any)
- p->second->witnessed.erase(who);
+ p->second->more()->witnessed.erase(who);
}
- if (p->second->waiting_on_slave.count(who)) {
+ if (p->second->more()->waiting_on_slave.count(who)) {
dout(10) << " master request " << *p->second << " waiting for slave mds" << who
<< " to recover" << dendl;
// retry request when peer recovers
- p->second->waiting_on_slave.erase(who);
+ p->second->more()->waiting_on_slave.erase(who);
mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
}
}
dout(7) << "ambiguous import succeeded on " << *dir << dendl;
migrator->import_finish(dir);
}
+ my_ambiguous_imports.erase(p); // no longer ambiguous.
}
p = next;
}
else {
dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
disambiguate_imports();
-
if (mds->is_resolve()) {
recalc_auth_bits();
trim_non_auth();
if (mds->is_resolve()) {
// replay
assert(uncommitted_slave_updates[from].count(*p));
- uncommitted_slave_updates[from][*p].replay(mds);
+ uncommitted_slave_updates[from][*p].commit.replay(mds);
uncommitted_slave_updates[from].erase(*p);
// log commit
mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_COMMIT));
if (mds->is_resolve()) {
assert(uncommitted_slave_updates[from].count(*p));
+ uncommitted_slave_updates[from][*p].rollback.replay(mds);
uncommitted_slave_updates[from].erase(*p);
mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_ROLLBACK));
} else {
MDRequest *mdr = request_get(*p);
- if (mdr->slave_commit) {
- mdr->slave_commit->finish(-1);
- delete mdr->slave_commit;
- mdr->slave_commit = 0;
+ if (mdr->more()->slave_commit) {
+ mdr->more()->slave_commit->finish(-1);
+ delete mdr->more()->slave_commit;
+ mdr->more()->slave_commit = 0;
}
if (mdr->slave_request)
mdr->aborted = true;
}
assert(my_ambiguous_imports.empty());
- // verify all my subtrees are unambiguous!
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
- p != subtrees.end();
- ++p) {
- CDir *dir = p->first;
- if (dir->is_ambiguous_dir_auth())
- dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
- assert(!dir->is_ambiguous_dir_auth());
+ if (mds->is_resolve()) {
+ // verify all my subtrees are unambiguous!
+ for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ CDir *dir = p->first;
+ if (dir->is_ambiguous_dir_auth()) {
+ dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
+ show_subtrees();
+ }
+ assert(!dir->is_ambiguous_dir_auth());
+ }
}
show_subtrees();
/*
* rejoin phase!
+ *
+ * this initiates rejoin. it should be called before we get any
+ * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
+ *
* we start out by sending rejoins to everyone in the recovery set.
*
* if we are rejoin, send for all regions in our cache.
p != recovery_set.end();
++p) {
if (*p == mds->get_nodeid()) continue; // nothing to myself!
+ if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
if (mds->is_rejoin()) {
rejoin_gather.insert(*p);
rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
rejoin_walk(dir, rejoins[auth]);
}
+
+ // rejoin root inodes, too
+ for (map<int, MMDSCacheRejoin*>::iterator p = rejoins.begin();
+ p != rejoins.end();
+ ++p) {
+ if (mds->is_rejoin()) {
+ // weak
+ if (p->first == 0 && root)
+ p->second->add_weak_inode(root->ino());
+ if (get_inode(MDS_INO_STRAY(p->first)))
+ p->second->add_weak_inode(MDS_INO_STRAY(p->first));
+ } else {
+ // strong
+ if (p->first == 0 && root) {
+ p->second->add_weak_inode(root->ino());
+ p->second->add_strong_inode(root->ino(), root->get_replica_nonce(),
+ root->get_caps_wanted(),
+ root->authlock.get_state(),
+ root->linklock.get_state(),
+ root->dirfragtreelock.get_state(),
+ root->filelock.get_state(),
+ root->dirlock.get_state());
+ }
+ if (CInode *in = get_inode(MDS_INO_STRAY(p->first))) {
+ p->second->add_weak_inode(in->ino());
+ p->second->add_strong_inode(in->ino(), in->get_replica_nonce(),
+ in->get_caps_wanted(),
+ in->authlock.get_state(),
+ in->linklock.get_state(),
+ in->dirfragtreelock.get_state(),
+ in->filelock.get_state(),
+ in->dirlock.get_state());
+ }
+ }
+ }
if (!mds->is_rejoin()) {
- // strong.
+ // i am survivor. send strong rejoin.
// note request authpins, xlocks
for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
p != active_requests.end();
}
// send the messages
- assert(rejoin_ack_gather.empty());
for (map<int,MMDSCacheRejoin*>::iterator p = rejoins.begin();
p != rejoins.end();
++p) {
- mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE);
+ assert(rejoin_sent.count(p->first) == 0);
+ assert(rejoin_ack_gather.count(p->first) == 0);
+ rejoin_sent.insert(p->first);
rejoin_ack_gather.insert(p->first);
+ mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE);
}
// nothing?
if (mds->is_rejoin() && rejoins.empty()) {
- dout(10) << "nothing left to rejoin" << dendl;
+ dout(10) << "nothing to rejoin" << dendl;
mds->rejoin_done();
}
}
p != dir->items.end();
++p) {
CDentry *dn = p->second;
- assert(dn->is_primary());
dout(15) << " add_weak_primary_dentry " << *dn << dendl;
+ assert(dn->is_primary());
+ assert(dn->inode->is_dir());
rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino());
dn->get_inode()->get_nested_dirfrags(nested);
+
+ if (dn->get_inode()->dirlock.is_updated()) {
+ // include full inode to shed any dirtyscattered state
+ rejoin->add_full_inode(dn->get_inode()->inode,
+ dn->get_inode()->symlink,
+ dn->get_inode()->dirfragtree);
+ dn->get_inode()->dirlock.clear_updated();
+ }
}
} else {
// STRONG
* the sender
* - is recovering from their journal.
* - may have incorrect (out of date) inode contents
+ * - will include full inodes IFF they contain dirty scatterlock content
*
* if the sender didn't trim_non_auth(), they
* - may have incorrect (out of date) dentry/inode linkage
}
}
+ // full inodes?
+ // dirty scatterlock content!
+ for (list<MMDSCacheRejoin::inode_full>::iterator p = weak->full_inodes.begin();
+ p != weak->full_inodes.end();
+ ++p) {
+ CInode *in = get_inode(p->inode.ino);
+ if (!in) continue;
+ if (p->inode.mtime > in->inode.mtime) in->inode.mtime = p->inode.mtime;
+ dout(10) << " got dirty inode scatterlock content " << *in << dendl;
+ in->dirlock.set_updated();
+ }
+
// walk weak map
for (map<dirfrag_t, map<string, MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
p != weak->weak.end();
++p) {
CDir *dir = get_dirfrag(p->first);
+ if (!dir) dout(0) << " missing dirfrag " << p->first << dendl;
assert(dir);
int nonce = dir->add_replica(from);
assert(dn);
assert(dn->is_primary());
- if (survivor) dentry_remove_replica(dn, from);
+ if (survivor && dn->is_replica(from))
+ dentry_remove_replica(dn, from); // this induces a lock gather completion
int dnonce = dn->add_replica(from);
dout(10) << " have " << *dn << dendl;
if (ack)
CInode *in = dn->get_inode();
assert(in);
- if (survivor) inode_remove_replica(in, from);
+ if (survivor && in->is_replica(from))
+ inode_remove_replica(in, from); // this induces a lock gather completion
int inonce = in->add_replica(from);
dout(10) << " have " << *in << dendl;
}
}
- if (survivor)
- rejoin_scour_survivor_replicas(from, ack);
+ // weak base inodes? (root, stray, etc.)
+ for (set<inodeno_t>::iterator p = weak->weak_inodes.begin();
+ p != weak->weak_inodes.end();
+ ++p) {
+ CInode *in = get_inode(*p);
+ assert(in); // hmm fixme wrt stray?
+ if (survivor && in->is_replica(from))
+ inode_remove_replica(in, from); // this induces a lock gather completion
+ int inonce = in->add_replica(from);
+ dout(10) << " have base " << *in << dendl;
+
+ if (ack)
+ ack->add_strong_inode(in->ino(),
+ inonce,
+ 0,
+ in->authlock.get_replica_state(),
+ in->linklock.get_replica_state(),
+ in->dirfragtreelock.get_replica_state(),
+ in->filelock.get_replica_state(),
+ in->dirlock.get_replica_state());
+ }
if (survivor) {
- // send ack
+ // survivor. do everything now.
+ rejoin_scour_survivor_replicas(from, ack);
mds->send_message_mds(ack, from, MDS_PORT_CACHE);
} else {
// done?
+ assert(rejoin_gather.count(from));
rejoin_gather.erase(from);
if (rejoin_gather.empty()) {
rejoin_gather_finish();
*
* @pathmap - map of inodeno to full pathnames. we remove items from this map
* as we discover we have them.
- * @retry - non-completion callback context. called when a pass of fetches
- * completes. deleted if we are done (i.e. pathmap is empty).
+ *
+ * returns a C_Gather* if there is work to do. caller is responsible for setting
+ * the C_Gather completer.
*/
-bool MDCache::parallel_fetch(map<inodeno_t,string>& pathmap,
- Context *retry)
+C_Gather *MDCache::parallel_fetch(map<inodeno_t,string>& pathmap)
{
dout(10) << "parallel_fetch on " << pathmap.size() << " paths" << dendl;
if (pathmap.empty()) {
dout(10) << "parallel_fetch done" << dendl;
assert(fetch_queue.empty());
- delete retry;
- return true;
+ return false;
}
// do a parallel fetch
- C_Gather *gather = new C_Gather(retry);
+ C_Gather *gather = new C_Gather;
for (set<CDir*>::iterator p = fetch_queue.begin();
p != fetch_queue.end();
++p) {
(*p)->fetch(gather->new_sub());
}
- return false;
+ return gather;
}
}
}
+ // base inodes? (root, stray, etc.)
+ for (set<inodeno_t>::iterator p = strong->weak_inodes.begin();
+ p != strong->weak_inodes.end();
+ ++p) {
+ CInode *in = get_inode(*p);
+ dout(10) << " have base " << *in << dendl;
+ in->add_replica(from);
+ }
+
// send missing?
if (missing) {
+ // we expect a FULL soon.
mds->send_message_mds(missing, from, MDS_PORT_CACHE);
} else {
// done?
+ assert(rejoin_gather.count(from));
rejoin_gather.erase(from);
if (rejoin_gather.empty()) {
rejoin_gather_finish();
}
// done?
+ assert(rejoin_ack_gather.count(from));
rejoin_ack_gather.erase(from);
if (mds->is_rejoin() &&
rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
}
// done?
+ assert(rejoin_gather.count(from));
rejoin_gather.erase(from);
if (rejoin_gather.empty()) {
rejoin_gather_finish();
}
};
+
+
void MDCache::rejoin_gather_finish()
{
dout(10) << "rejoin_gather_finish" << dendl;
rejoin_trim_undef_inodes();
// fetch paths?
- if (!cap_import_paths.empty() &&
- !parallel_fetch(cap_import_paths, new C_MDC_RejoinGatherFinish(this)))
- return;
+ // do this before ack, since some inodes we may have already gotten
+ // from surviving MDSs.
+ if (!cap_import_paths.empty()) {
+ C_Gather *gather = parallel_fetch(cap_import_paths);
+ if (gather) {
+ gather->set_finisher(new C_MDC_RejoinGatherFinish(this));
+ return;
+ }
+ }
// process cap imports
// ino -> client -> frommds -> capex
}
}
+ // root inodes too
+ if (root)
+ for (map<int,int>::iterator r = root->replicas_begin();
+ r != root->replicas_end();
+ ++r) {
+ ack[r->first]->add_full_inode(root->inode, root->symlink, root->dirfragtree);
+ ack[r->first]->add_strong_inode(root->ino(), r->second, 0,
+ root->authlock.get_replica_state(),
+ root->linklock.get_replica_state(),
+ root->dirfragtreelock.get_replica_state(),
+ root->filelock.get_replica_state(),
+ root->dirlock.get_replica_state());
+ }
+ if (stray)
+ for (map<int,int>::iterator r = stray->replicas_begin();
+ r != stray->replicas_end();
+ ++r) {
+ ack[r->first]->add_full_inode(stray->inode, stray->symlink, stray->dirfragtree);
+ ack[r->first]->add_strong_inode(stray->ino(), r->second, 0,
+ stray->authlock.get_replica_state(),
+ stray->linklock.get_replica_state(),
+ stray->dirfragtreelock.get_replica_state(),
+ stray->filelock.get_replica_state(),
+ stray->dirlock.get_replica_state());
+ }
+
// send acks
for (map<int,MMDSCacheRejoin*>::iterator p = ack.begin();
p != ack.end();
{
assert(root == 0);
root = in;
+ base_inodes.insert(in);
}
class C_MDC_PurgeFinish : public Context {
MDCache *mdc;
- inodeno_t ino;
- off_t newsize;
+ CInode *in;
+ off_t newsize, oldsize;
public:
- C_MDC_PurgeFinish(MDCache *c, inodeno_t i, off_t s) : mdc(c), ino(i), newsize(s) {}
+ C_MDC_PurgeFinish(MDCache *c, CInode *i, off_t ns, off_t os) :
+ mdc(c), in(i), newsize(ns), oldsize(os) {}
void finish(int r) {
- mdc->purge_inode_finish(ino, newsize);
+ mdc->purge_inode_finish(in, newsize, oldsize);
}
};
class C_MDC_PurgeFinish2 : public Context {
MDCache *mdc;
- inodeno_t ino;
- off_t newsize;
+ CInode *in;
+ off_t newsize, oldsize;
public:
- C_MDC_PurgeFinish2(MDCache *c, inodeno_t i, off_t s) : mdc(c), ino(i), newsize(s) {}
+ C_MDC_PurgeFinish2(MDCache *c, CInode *i, off_t ns, off_t os) :
+ mdc(c), in(i), newsize(ns), oldsize(os) {}
void finish(int r) {
- mdc->purge_inode_finish_2(ino, newsize);
+ mdc->purge_inode_finish_2(in, newsize, oldsize);
}
};
* will be called by on unlink or rmdir or truncate
* caller responsible for journaling an appropriate EUpdate
*/
-void MDCache::purge_inode(inode_t *inode, off_t newsize)
+void MDCache::purge_inode(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls)
{
- dout(10) << "purge_inode " << inode->ino << " size " << inode->size
- << " -> " << newsize
+ dout(10) << "purge_inode " << oldsize << " -> " << newsize
+ << " on " << *in
<< dendl;
- // take note
- assert(purging[inode->ino].count(newsize) == 0);
- purging[inode->ino][newsize] = *inode;
+ assert(oldsize >= newsize);
- assert(inode->size >= newsize);
- _do_purge_inode(inode, newsize);
+ purging[in][newsize] = oldsize;
+ purging_ls[in][newsize] = ls;
+ ls->purging_inodes[in][newsize] = oldsize;
+
+ _do_purge_inode(in, newsize, oldsize);
}
-void MDCache::_do_purge_inode(inode_t *inode, off_t newsize)
+void MDCache::_do_purge_inode(CInode *in, off_t newsize, off_t oldsize)
{
+ in->get(CInode::PIN_PURGING);
+
// remove
- if (inode->size > 0) {
- mds->filer->remove(*inode, newsize, inode->size,
- 0, new C_MDC_PurgeFinish(this, inode->ino, newsize));
+ if (in->inode.size > 0) {
+ mds->filer->remove(in->inode, newsize, oldsize,
+ 0, new C_MDC_PurgeFinish(this, in, newsize, oldsize));
} else {
// no need, empty file, just log it
- purge_inode_finish(inode->ino, newsize);
+ purge_inode_finish(in, newsize, oldsize);
}
}
-void MDCache::purge_inode_finish(inodeno_t ino, off_t newsize)
+void MDCache::purge_inode_finish(CInode *in, off_t newsize, off_t oldsize)
{
- dout(10) << "purge_inode_finish " << ino << " to " << newsize
- << " - logging our completion" << dendl;
+ dout(10) << "purge_inode_finish " << oldsize << " -> " << newsize
+ << " on " << *in << dendl;
// log completion
- mds->mdlog->submit_entry(new EPurgeFinish(ino, newsize),
- new C_MDC_PurgeFinish2(this, ino, newsize));
+ mds->mdlog->submit_entry(new EPurgeFinish(in->ino(), newsize, oldsize),
+ new C_MDC_PurgeFinish2(this, in, newsize, oldsize));
}
-void MDCache::purge_inode_finish_2(inodeno_t ino, off_t newsize)
+void MDCache::purge_inode_finish_2(CInode *in, off_t newsize, off_t oldsize)
{
- dout(10) << "purge_inode_finish_2 " << ino << " to " << newsize << dendl;
+ dout(10) << "purge_inode_finish_2 " << oldsize << " -> " << newsize
+ << " on " << *in << dendl;
// remove from purging list
- purging[ino].erase(newsize);
- if (purging[ino].empty())
- purging.erase(ino);
+ LogSegment *ls = purging_ls[in][newsize];
+ purging[in].erase(newsize);
+ purging_ls[in].erase(newsize);
+ if (purging[in].empty()) {
+ purging.erase(in);
+ purging_ls.erase(in);
+ }
+
+ assert(ls->purging_inodes.count(in));
+ assert(ls->purging_inodes[in].count(newsize));
+ assert(ls->purging_inodes[in][newsize] == oldsize);
+ ls->purging_inodes[in].erase(newsize);
+ if (ls->purging_inodes[in].empty())
+ ls->purging_inodes.erase(in);
+ in->put(CInode::PIN_PURGING);
+
// tell anyone who cares (log flusher?)
- list<Context*> ls;
- ls.swap(waiting_for_purge[ino][newsize]);
- waiting_for_purge[ino].erase(newsize);
- if (waiting_for_purge[ino].empty())
- waiting_for_purge.erase(ino);
- finish_contexts(ls, 0);
+ if (purging.count(in) == 0 ||
+ purging[in].rbegin()->first < newsize) {
+ list<Context*> ls;
+ ls.swap(waiting_for_purge[in][newsize]);
+ waiting_for_purge[in].erase(newsize);
+ if (waiting_for_purge[in].empty())
+ waiting_for_purge.erase(in);
+ finish_contexts(ls, 0);
+ }
}
-void MDCache::add_recovered_purge(const inode_t& inode, off_t newsize)
+void MDCache::add_recovered_purge(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls)
{
- assert(purging[inode.ino].count(newsize) == 0);
- purging[inode.ino][newsize] = inode;
+ assert(purging[in].count(newsize) == 0);
+ purging[in][newsize] = oldsize;
+ purging_ls[in][newsize] = ls;
+ ls->purging_inodes[in][newsize] = oldsize;
}
-void MDCache::remove_recovered_purge(inodeno_t ino, off_t newsize)
+void MDCache::remove_recovered_purge(CInode *in, off_t newsize, off_t oldsize)
{
- purging[ino].erase(newsize);
+ purging[in].erase(newsize);
}
void MDCache::start_recovered_purges()
{
dout(10) << "start_recovered_purges (" << purging.size() << " purges)" << dendl;
- for (map<inodeno_t, map<off_t,inode_t> >::iterator p = purging.begin();
+ for (map<CInode*, map<off_t, off_t> >::iterator p = purging.begin();
p != purging.end();
++p) {
- for (map<off_t,inode_t>::iterator q = p->second.begin();
+ for (map<off_t,off_t>::iterator q = p->second.begin();
q != p->second.end();
++q) {
- dout(10) << "start_recovered_purges " << p->first
- << " size " << q->second.size
- << " to " << q->first << dendl;
- _do_purge_inode(&q->second, q->first);
+ dout(10) << "start_recovered_purges "
+ << q->second << " -> " << q->first
+ << " on " << *p->first
+ << dendl;
+ _do_purge_inode(p->first, q->first, q->second);
}
}
}
map<int, MCacheExpire*> expiremap;
- // DENTRIES from the LRU
-
+ // trim dentries from the LRU
while (lru.lru_get_size() > (unsigned)max) {
CDentry *dn = (CDentry*)lru.lru_expire();
if (!dn) break;
trim_dentry(dn, expiremap);
}
- // trim root inode+dir?
- if (max == 0 && // only if we're trimming everything!
- lru.lru_get_size() == 0) {
- hash_map<inodeno_t,CInode*>::iterator p = inode_map.begin();
- while (p != inode_map.end()) {
- hash_map<inodeno_t,CInode*>::iterator n = p;
- n++;
-
- CInode *in = p->second;
-
+ // trim base inodes?
+ if (max == 0) {
+ set<CInode*>::iterator p = base_inodes.begin();
+ while (p != base_inodes.end()) {
+ CInode *in = *p++;
list<CDir*> ls;
in->get_dirfrags(ls);
- for (list<CDir*>::iterator q = ls.begin();
- q != ls.end();
- ++q)
- if ((*q)->get_num_ref() == 0)
- trim_dirfrag(*q, *q, expiremap);
-
- // root inode?
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ CDir *dir = *p;
+ if (dir->get_num_ref() == 1) // subtree pin
+ trim_dirfrag(dir, 0, expiremap);
+ }
if (in->get_num_ref() == 0)
- trim_inode(0, in, 0, expiremap); // hrm, FIXME
-
- p = n;
- }
+ trim_inode(0, in, 0, expiremap);
+ }
}
- // send!
+ // send any expire messages
send_expire_messages(expiremap);
return true;
void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<int, MCacheExpire*>& expiremap)
{
- assert(dir->get_num_ref() == 0);
-
dout(15) << "trim_dirfrag " << *dir << dendl;
+ if (dir->is_subtree_root()) {
+ assert(!dir->is_auth() ||
+ (!dir->is_replicated() && dir->inode->is_base()));
+ remove_subtree(dir); // remove from subtree map
+ }
+ assert(dir->get_num_ref() == 0);
+
CInode *in = dir->get_inode();
if (!dir->is_auth()) {
}
}
- if (dir->is_subtree_root())
- remove_subtree(dir); // remove from subtree map
in->close_dirfrag(dir->dirfrag().frag);
}
* the only non-auth items that remain are those that are needed to
* attach our own subtrees to the root.
*
+ * when we are done, all dentries will be in the top bit of the lru.
+ *
* why we have to do this:
* we may not have accurate linkage for non-auth items. which means we will
* know which subtree it falls into, and can not be sure to declare it to the
{
dout(7) << "trim_non_auth" << dendl;
+ // temporarily pin all subtree roots
+ for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ p++)
+ p->first->get(CDir::PIN_SUBTREETEMP);
+
// note first auth item we see.
// when we see it the second time, stop.
CDentry *first_auth = 0;
}
}
+ // move everything in the pintail to the top bit of the lru.
+ lru.lru_touch_entire_pintail();
+
+ // unpin all subtrees
+ for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ p++)
+ p->first->put(CDir::PIN_SUBTREETEMP);
+
show_subtrees();
}
// =========================================================================================
// shutdown
-class C_MDC_ShutdownCommit : public Context {
- MDCache *mdc;
-public:
- C_MDC_ShutdownCommit(MDCache *mdc) {
- this->mdc = mdc;
- }
- void finish(int r) {
- mdc->shutdown_commits--;
- }
-};
-
class C_MDC_ShutdownCheck : public Context {
MDCache *mdc;
public:
if (g_conf.mds_shutdown_check)
mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this));
+
+ // g_conf.debug_mds = 10;
}
return true;
}
- // commit dirs?
- if (g_conf.mds_commit_on_shutdown) {
-
- if (shutdown_commits < 0) {
- dout(1) << "shutdown_pass committing all dirty dirs" << dendl;
- shutdown_commits = 0;
-
- for (hash_map<inodeno_t, CInode*>::iterator it = inode_map.begin();
- it != inode_map.end();
- it++) {
- CInode *in = it->second;
- if (!in->is_dir()) continue;
-
- // commit any dirty dirfrag that's ours
- list<CDir*> dfs;
- in->get_dirfrags(dfs);
- for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
- CDir *dir = *p;
- if (dir->is_auth() && dir->is_dirty()) {
- dir->commit(0, new C_MDC_ShutdownCommit(this));
- shutdown_commits++;
- }
- }
- }
- }
+ // flush batching eopens, so that we can properly expire them.
+ mds->server->journal_opens(); // hrm, this is sort of a hack.
- // commits?
- if (shutdown_commits > 0) {
- dout(7) << "shutdown_commits still waiting for " << shutdown_commits << dendl;
- return false;
- }
+ // flush what we can from the log
+ mds->mdlog->set_max_events(0);
+ mds->mdlog->trim();
+
+ if (mds->mdlog->get_num_segments() > 1) {
+ dout(7) << "still >1 segments, waiting for log to trim" << dendl;
+ return false;
}
- // flush anything we can from the cache
trim(0);
dout(5) << "lru size now " << lru.lru_get_size() << dendl;
- // flush batching eopens, so that we can properly expire them.
- mds->server->journal_opens(); // hrm, this is sort of a hack.
-
- // flush what we can from the log
- mds->mdlog->trim(0);
-
// SUBTREES
- // send all imports back to 0.
if (!subtrees.empty() &&
mds->get_nodeid() != 0 &&
!migrator->is_exporting() //&&
//!migrator->is_importing()
) {
- // export to root
dout(7) << "looking for subtrees to export to mds0" << dendl;
list<CDir*> ls;
for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
if (!dir->is_full_dir_auth()) continue;
ls.push_back(dir);
}
+ int max = 5; // throttle shutdown exports.. hack!
for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
CDir *dir = *p;
- dout(7) << "sending " << *dir << " back to mds0" << dendl;
- migrator->export_dir(dir, 0);
+ int dest = dir->get_inode()->authority().first;
+ if (dest > 0 && !mds->mdsmap->is_active(dest)) dest = 0;
+ dout(7) << "sending " << *dir << " back to mds" << dest << dendl;
+ migrator->export_dir(dir, dest);
+ if (--max == 0) break;
}
}
+
// subtrees map not empty yet?
if (!subtrees.empty()) {
dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
show_subtrees();
migrator->show_importing();
migrator->show_exporting();
- //show_cache();
+ if (!migrator->is_importing() && !migrator->is_exporting())
+ show_cache();
return false;
}
assert(subtrees.empty());
assert(!migrator->is_importing());
+
// empty out stray contents
// FIXME
dout(7) << "FIXME: i need to empty out stray dir contents..." << dendl;
- // (wait for) flush log?
- if (g_conf.mds_log_flush_on_shutdown) {
- if (mds->mdlog->get_non_subtreemap_events()) {
- dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events()
- << " (" << mds->mdlog->get_non_subtreemap_events() << ")" << dendl;
- return false;
- }
+ // (only do this once!)
+ if (!mds->mdlog->is_capped()) {
+ dout(7) << "capping the log" << dendl;
+ mds->mdlog->cap();
+ mds->mdlog->trim();
}
-
- // cap log?
- if (g_conf.mds_log_flush_on_shutdown) {
-
- // (only do this once!)
- if (!mds->mdlog->is_capped()) {
- dout(7) << "capping the log" << dendl;
- mds->mdlog->cap();
- // note that this won't flush right away, so we'll make at least one more pass
- }
-
- if (mds->mdlog->get_num_events()) {
- dout(7) << "waiting for log to flush (including subtree_map, now) .. " << mds->mdlog->get_num_events()
- << " (" << mds->mdlog->get_non_subtreemap_events() << ")" << dendl;
- return false;
- }
-
- if (!did_shutdown_log_cap) {
- // flush journal header
- dout(7) << "writing header for (now-empty) journal" << dendl;
- assert(mds->mdlog->empty());
- mds->mdlog->write_head(0);
- // NOTE: filer active checker below will block us until this completes.
- did_shutdown_log_cap = true;
- return false;
- }
+
+ if (!mds->mdlog->empty()) {
+ dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
+ << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
+ return false;
+ }
+
+ if (!did_shutdown_log_cap) {
+ // flush journal header
+ dout(7) << "writing header for (now-empty) journal" << dendl;
+ assert(mds->mdlog->empty());
+ mds->mdlog->write_head(0);
+ // NOTE: filer active checker below will block us until this completes.
+ did_shutdown_log_cap = true;
+ return false;
}
// filer active?
return false;
}
-
- // done?
+ // trim what we can from the cache
if (lru.lru_get_size() > 0) {
dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << dendl;
show_cache();
Context *MDCache::_get_waiter(MDRequest *mdr, Message *req)
{
- if (mdr)
+ if (mdr) {
+ dout(20) << "_get_waiter retryrequest" << dendl;
return new C_MDS_RetryRequest(this, mdr);
- else
+ } else {
+ dout(20) << "_get_waiter retrymessage" << dendl;
return new C_MDS_RetryMessage(mds, req);
+ }
}
int MDCache::path_traverse(MDRequest *mdr, Message *req, // who
// parent dir frozen_dir?
if (cur->is_frozen_dir()) {
dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << dendl;
- cur->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req));
+ cur->get_parent_dn()->get_dir()->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req));
return 1;
}
-
curdir = cur->get_or_open_dirfrag(this, fg);
} else {
// discover?
- assert(!cur->is_auth());
- if (cur->is_ambiguous_auth()) {
- dout(10) << "traverse: need dirfrag " << fg << ", waiting for single auth on " << *cur << dendl;
- cur->add_waiter(CInode::WAIT_SINGLEAUTH, _get_waiter(mdr, req));
- return 1;
- } else if (dir_discovers.count(cur->ino())) {
- dout(10) << "traverse: need dirfrag " << fg << ", already doing discover for " << *cur << dendl;
- assert(cur->is_waiter_for(CInode::WAIT_DIR));
- } else {
- filepath want = path.postfixpath(depth);
- dout(10) << "traverse: need dirfrag " << fg << ", doing discover, want " << want.get_path()
- << " from " << *cur << dendl;
- mds->send_message_mds(new MDiscover(mds->get_nodeid(),
- cur->ino(),
- want,
- true, // need this dir!
- onfail == MDS_TRAVERSE_DISCOVERXLOCK),
- cur->authority().first, MDS_PORT_CACHE);
- dir_discovers[cur->ino()].insert(cur->authority().first);
- }
- cur->add_waiter(CInode::WAIT_DIR, _get_waiter(mdr, req));
+ dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
+ discover_path(cur, path.postfixpath(depth), _get_waiter(mdr, req),
+ onfail == MDS_TRAVERSE_DISCOVERXLOCK);
if (mds->logger) mds->logger->inc("tdis");
return 1;
}
// must read directory hard data (permissions, x bit) to traverse
if (!noperm &&
- !mds->locker->simple_rdlock_try(&cur->authlock, _get_waiter(mdr, req)))
+ !mds->locker->simple_rdlock_try(&cur->authlock, 0)) {
+ dout(7) << "traverse: waiting on authlock rdlock on " << *cur << dendl;
+ cur->authlock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req));
return 1;
+ }
// check permissions?
// XXX
<< req->get_source() << " dn " << *dn << dendl;
} else {
dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << dendl;
- MDiscoverReply *reply = new MDiscoverReply(curdir->ino());
+ MDiscoverReply *reply = new MDiscoverReply(curdir->dirfrag());
reply->add_dentry( dn->replicate_to( from ) );
if (dn->is_primary())
reply->add_inode( dn->inode->replicate_to( from ) );
if ((onfail == MDS_TRAVERSE_DISCOVER ||
onfail == MDS_TRAVERSE_DISCOVERXLOCK)) {
- // discover?
- filepath want = path.postfixpath(depth);
-
- if (curdir->is_waiting_for_dentry(path[depth])) {
- dout(7) << "traverse: already waiting for discover " << want.get_path()
- << " from " << *curdir << dendl;
- }
- else if (curdir->is_ambiguous_auth()) {
- dout(7) << "traverse: waiting for single auth on " << *curdir << dendl;
- curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req));
- return 1;
- }
- else {
- dout(7) << "traverse: discover " << want << " from " << *curdir << dendl;
- touch_inode(cur);
-
- mds->send_message_mds(new MDiscover(mds->get_nodeid(),
- cur->ino(),
- want,
- false,
- onfail == MDS_TRAVERSE_DISCOVERXLOCK),
- dauth.first, MDS_PORT_CACHE);
- }
-
- // delay processing of current request.
- curdir->add_dentry_waiter(path[depth], _get_waiter(mdr, req));
+ dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
+ discover_path(curdir, path.postfixpath(depth), _get_waiter(mdr, req),
+ onfail == MDS_TRAVERSE_DISCOVERXLOCK);
if (mds->logger) mds->logger->inc("tdis");
return 1;
}
dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req));
return 1;
- } else {
- dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
-
- // request replication?
- if (mdr && mdr->client_request && curdir->is_rep()) {
- dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under "
- << *curdir << " req " << *(MClientRequest*)req << dendl;
- mdr->client_request->set_mds_wants_replica_in_dirino(curdir->ino());
- req->clear_payload(); // reencode!
- }
-
- if (mdr)
- request_forward(mdr, dauth.first, req->get_dest_port());
- else
- mds->forward_message_mds(req, dauth.first, req->get_dest_port());
-
- if (mds->logger) mds->logger->inc("tfw");
- return 2;
+ }
+
+ dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
+
+ // request replication?
+ if (mdr && mdr->client_request && curdir->is_rep()) {
+ dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under "
+ << *curdir << " req " << *(MClientRequest*)req << dendl;
+ mdr->client_request->set_mds_wants_replica_in_dirino(curdir->ino());
+ req->clear_payload(); // reencode!
}
+
+ if (mdr)
+ request_forward(mdr, dauth.first, req->get_dest_port());
+ else
+ mds->forward_message_mds(req, dauth.first, req->get_dest_port());
+
+ if (mds->logger) mds->logger->inc("tfw");
+ return 2;
}
- if (onfail == MDS_TRAVERSE_FAIL) {
+ if (onfail == MDS_TRAVERSE_FAIL)
return -ENOENT; // not necessarily exactly true....
- }
}
assert(0); // i shouldn't get here
int auth = diri->authority().first;
if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
- // discover it
- filepath want; // no dentries, i just want the dir open
- MDiscover *dis = new MDiscover(mds->get_nodeid(),
- diri->ino(),
- want,
- true); // need the base dir open
- dis->set_base_dir_frag(approxfg);
- mds->send_message_mds(dis, auth, MDS_PORT_CACHE);
- dir_discovers[diri->ino()].insert(auth);
- diri->add_waiter(CInode::WAIT_DIR, fin);
+ discover_dir_frag(diri, approxfg, fin);
} else {
// mds is down or recovering. forge a replica!
forge_replica_dir(diri, approxfg, auth);
return;
}
- if (!in->is_auth()) {
+ CDir *dir = in->get_dirfrag(frag);
+
+ if (!dir && !in->is_auth()) {
dout(10) << "opening remote dirfrag " << frag << " under " << *in << dendl;
/* FIXME: we re-query the anchortable just to avoid a fragtree update race */
open_remote_dirfrag(in, frag,
return;
}
- CDir *dir = in->get_or_open_dirfrag(this, frag);
+ if (!dir && in->is_auth())
+ dir = in->get_or_open_dirfrag(this, frag);
+
assert(dir);
if (dir->is_auth()) {
if (dir->is_complete()) {
// hmm, discover.
dout(10) << "have remote dirfrag " << *dir << ", discovering "
<< anchortrace[i].ino << dendl;
-
- MDiscover *dis = new MDiscover(mds->get_nodeid(),
- dir->dirfrag(),
- anchortrace[i].ino,
- true); // being conservative here.
- mds->send_message_mds(dis, dir->authority().first, MDS_PORT_CACHE);
+ discover_ino(dir, anchortrace[i].ino,
+ new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish));
}
}
dout(7) << "request_finish " << *mdr << dendl;
// slave finisher?
- if (mdr->slave_commit) {
- mdr->slave_commit->finish(0);
- delete mdr->slave_commit;
- mdr->slave_commit = 0;
+ if (mdr->more()->slave_commit) {
+ mdr->more()->slave_commit->finish(0);
+ delete mdr->more()->slave_commit;
+ mdr->more()->slave_commit = 0;
}
if (mdr->client_request && mds->logger) {
// clean up slaves
// (will implicitly drop remote dn pins)
- for (set<int>::iterator p = mdr->slaves.begin();
- p != mdr->slaves.end();
+ for (set<int>::iterator p = mdr->more()->slaves.begin();
+ p != mdr->more()->slaves.end();
++p) {
MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_FINISH);
mds->send_message_mds(r, *p, MDS_PORT_SERVER);
// drop (local) auth pins
mdr->drop_local_auth_pins();
+ // drop stickydirs
+ for (set<CInode*>::iterator p = mdr->stickydirs.begin();
+ p != mdr->stickydirs.end();
+ ++p)
+ (*p)->put_stickydirs();
+
// drop cache pins
for (set<MDSCacheObject*>::iterator it = mdr->pins.begin();
it != mdr->pins.end();
if (!in->can_auth_pin() &&
!mdr->is_auth_pinned(in)) {
dout(7) << "anchor_create not authpinnable, waiting on " << *in << dendl;
- in->add_waiter(CInode::WAIT_AUTHPINNABLE, onfinish);
+ in->add_waiter(CInode::WAIT_UNFREEZE, onfinish);
return;
}
MDCache *cache;
CInode *in;
version_t atid;
- version_t pdv;
+ LogSegment *ls;
public:
- C_MDC_AnchorCreateLogged(MDCache *c, CInode *i, version_t t, version_t v) :
- cache(c), in(i), atid(t), pdv(v) {}
+ C_MDC_AnchorCreateLogged(MDCache *c, CInode *i, version_t t, LogSegment *s) :
+ cache(c), in(i), atid(t), ls(s) {}
void finish(int r) {
- cache->_anchor_create_logged(in, atid, pdv);
+ cache->_anchor_create_logged(in, atid, ls);
}
};
dout(10) << "_anchor_create_prepared " << *in << " atid " << atid << dendl;
assert(in->inode.anchored == false);
- // predirty, prepare log entry
- version_t pdv = in->pre_dirty();
-
- EUpdate *le = new EUpdate(mds->mdlog, "anchor_create");
- le->metablob.add_dir_context(in->get_parent_dir());
-
// update the logged inode copy
- inode_t *pi = le->metablob.add_dentry(in->parent, true);
+ inode_t *pi = in->project_inode();
pi->anchored = true;
- pi->version = pdv;
+ pi->version = in->pre_dirty();
// note anchor transaction
+ EUpdate *le = new EUpdate(mds->mdlog, "anchor_create");
+ le->metablob.add_dir_context(in->get_parent_dir());
+ le->metablob.add_primary_dentry(in->parent, true, 0, pi);
le->metablob.add_anchor_transaction(atid);
-
- // log + wait
- mds->mdlog->submit_entry(le, new C_MDC_AnchorCreateLogged(this, in, atid, pdv));
+ mds->mdlog->submit_entry(le, new C_MDC_AnchorCreateLogged(this, in, atid,
+ mds->mdlog->get_current_segment()));
}
-void MDCache::_anchor_create_logged(CInode *in, version_t atid, version_t pdv)
+void MDCache::_anchor_create_logged(CInode *in, version_t atid, LogSegment *ls)
{
- dout(10) << "_anchor_create_logged pdv " << pdv << " on " << *in << dendl;
+ dout(10) << "_anchor_create_logged on " << *in << dendl;
// unpin
assert(in->state_test(CInode::STATE_ANCHORING));
in->auth_unpin();
// apply update to cache
- in->inode.anchored = true;
- in->mark_dirty(pdv);
+ in->pop_and_dirty_projected_inode(ls);
// tell the anchortable we've committed
- mds->anchorclient->commit(atid);
+ mds->anchorclient->commit(atid, ls);
// trigger waiters
in->finish_waiting(CInode::WAIT_ANCHORED, 0);
if (!in->can_auth_pin()/* &&
!mdr->is_auth_pinned(in)*/) {
dout(7) << "anchor_destroy not authpinnable, waiting on " << *in << dendl;
- in->add_waiter(CInode::WAIT_AUTHPINNABLE, onfinish);
+ in->add_waiter(CInode::WAIT_UNFREEZE, onfinish);
return;
}
MDCache *cache;
CInode *in;
version_t atid;
- version_t pdv;
+ LogSegment *ls;
public:
- C_MDC_AnchorDestroyLogged(MDCache *c, CInode *i, version_t t, version_t v) :
- cache(c), in(i), atid(t), pdv(v) {}
+ C_MDC_AnchorDestroyLogged(MDCache *c, CInode *i, version_t t, LogSegment *l) :
+ cache(c), in(i), atid(t), ls(l) {}
void finish(int r) {
- cache->_anchor_destroy_logged(in, atid, pdv);
+ cache->_anchor_destroy_logged(in, atid, ls);
}
};
assert(in->inode.anchored == true);
- // predirty, prepare log entry
- version_t pdv = in->pre_dirty();
-
- EUpdate *le = new EUpdate(mds->mdlog, "anchor_destroy");
- le->metablob.add_dir_context(in->get_parent_dir());
-
// update the logged inode copy
- inode_t *pi = le->metablob.add_dentry(in->parent, true);
+ inode_t *pi = in->project_inode();
pi->anchored = true;
- pi->version = pdv;
-
- // note anchor transaction
- le->metablob.add_anchor_transaction(atid);
+ pi->version = in->pre_dirty();
// log + wait
- mds->mdlog->submit_entry(le, new C_MDC_AnchorDestroyLogged(this, in, atid, pdv));
+ EUpdate *le = new EUpdate(mds->mdlog, "anchor_destroy");
+ le->metablob.add_dir_context(in->get_parent_dir());
+ le->metablob.add_primary_dentry(in->parent, true, 0, pi);
+ le->metablob.add_anchor_transaction(atid);
+ mds->mdlog->submit_entry(le, new C_MDC_AnchorDestroyLogged(this, in, atid, mds->mdlog->get_current_segment()));
}
-void MDCache::_anchor_destroy_logged(CInode *in, version_t atid, version_t pdv)
+void MDCache::_anchor_destroy_logged(CInode *in, version_t atid, LogSegment *ls)
{
- dout(10) << "_anchor_destroy_logged pdv " << pdv << " on " << *in << dendl;
+ dout(10) << "_anchor_destroy_logged on " << *in << dendl;
// unpin
assert(in->state_test(CInode::STATE_UNANCHORING));
in->auth_unpin();
// apply update to cache
- in->inode.anchored = false;
- in->inode.version = pdv;
-
+ in->pop_and_dirty_projected_inode(ls);
+
// tell the anchortable we've committed
- mds->anchorclient->commit(atid);
+ mds->anchorclient->commit(atid, ls);
// trigger waiters
in->finish_waiting(CInode::WAIT_UNANCHORED, 0);
MDCache *cache;
CDentry *dn;
version_t pdv;
+ LogSegment *ls;
public:
- C_MDC_PurgeStray(MDCache *c, CDentry *d, version_t v) : cache(c), dn(d), pdv(v) { }
+ C_MDC_PurgeStray(MDCache *c, CDentry *d, version_t v, LogSegment *s) :
+ cache(c), dn(d), pdv(v), ls(s) { }
void finish(int r) {
- cache->_purge_stray_logged(dn, pdv);
+ cache->_purge_stray_logged(dn, pdv, ls);
}
};
EUpdate *le = new EUpdate(mds->mdlog, "purge_stray");
le->metablob.add_dir_context(dn->dir);
le->metablob.add_null_dentry(dn, true);
- le->metablob.add_inode_truncate(dn->inode->inode, 0);
- mds->mdlog->submit_entry(le, new C_MDC_PurgeStray(this, dn, pdv));
+ le->metablob.add_inode_truncate(dn->inode->ino(), 0, dn->inode->inode.size);
+
+ mds->mdlog->submit_entry(le, new C_MDC_PurgeStray(this, dn, pdv, mds->mdlog->get_current_segment()));
+
+
}
-void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv)
+void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls)
{
dout(10) << "_purge_stray_logged " << *dn << " " << *dn->inode << dendl;
CInode *in = dn->inode;
// dirty+unlink dentry
- dn->dir->mark_dirty(pdv);
+ dn->dir->mark_dirty(pdv, ls);
dn->dir->unlink_inode(dn);
dn->dir->remove_dentry(dn);
// purge+remove inode
- purge_inode(&in->inode, 0);
+ purge_inode(in, 0, in->inode.size, ls);
remove_inode(in);
}
-// REPLICAS
+// ========================================================================================
+// DISCOVER
+/*
+
+ - for all discovers (except base_inos, e.g. root, stray), waiters are attached
+ to the parent metadata object in the cache (pinning it).
+
+ - the discover is also registered under the per-mds discover_ hashes, so that
+ waiters can be kicked in the event of a failure. that is, every discover will
+   be followed by a reply, unless the remote node fails.
+
+ - each discover_reply must reliably decrement the discover_ counts.
+
+ - base_inos are the exception. those waiters are under waiting_for_base_ino.
+
+*/
+
+// Fetch a base inode (e.g. root or a stray) directly from mds 'from' by ino.
+// Base inodes have no parent object in our cache to pin waiters on, so the
+// waiters are queued in waiting_for_base_ino[from][ino] instead; they are
+// kicked by kick_discovers() if 'from' fails before replying.
+void MDCache::discover_base_ino(inodeno_t want_ino,
+				Context *onfinish,
+				int from)
+{
+  dout(7) << "discover_base_ino " << want_ino << " from mds" << from << dendl;
+  // send at most one MDiscover per (mds, ino); later callers just queue a waiter.
+  if (waiting_for_base_ino[from].count(want_ino) == 0) {
+    filepath want_path;
+    MDiscover *dis = new MDiscover(mds->get_nodeid(),
+				   want_ino,
+				   want_path,
+				   false);
+    mds->send_message_mds(dis, from, MDS_PORT_CACHE);
+  }
+
+  waiting_for_base_ino[from][want_ino].push_back(onfinish);
+}
+
+
+// Ask mds 'from' (default -1 = base's authority) to open dirfrag approx_fg
+// under inode 'base'.  The waiter (if any) hangs off base's WAIT_DIR list,
+// and the outstanding request is counted in discover_dir[from][ino] so
+// kick_discovers() can wake the waiters if 'from' goes away.
+void MDCache::discover_dir_frag(CInode *base,
+				frag_t approx_fg,
+				Context *onfinish,
+				int from)
+{
+  if (from < 0) from = base->authority().first;
+
+  dout(7) << "discover_dir_frag " << base->ino() << " " << approx_fg
+	  << " from mds" << from << dendl;
+
+  // suppress duplicates: an existing WAIT_DIR waiter implies a discover is
+  // (presumably) already in flight.  a null onfinish always sends.
+  if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) {  // this is overly conservative
+    filepath want_path;
+    MDiscover *dis = new MDiscover(mds->get_nodeid(),
+				   base->ino(),
+				   want_path,
+				   true);  // need the base dir open
+    dis->set_base_dir_frag(approx_fg);
+    mds->send_message_mds(dis, from, MDS_PORT_CACHE);
+  }
+
+  // register + wait
+  if (onfinish)
+    base->add_waiter(CInode::WAIT_DIR, onfinish);
+  discover_dir[from][base->ino()]++;
+}
+
+// Discover the dentries/inodes along want_path, relative to inode 'base'
+// (the base dir itself is requested too).  If base's authority is ambiguous
+// (e.g. mid-migration) we cannot pick a target mds, so we wait for single
+// auth and the caller retries via the waiter.
+void MDCache::discover_path(CInode *base,
+			    filepath want_path,
+			    Context *onfinish,
+			    bool want_xlocked,
+			    int from)
+{
+  if (from < 0) from = base->authority().first;
+
+  dout(7) << "discover_path " << base->ino() << " " << want_path << " from mds" << from
+	  << (want_xlocked ? " want_xlocked":"")
+	  << dendl;
+
+  if (base->is_ambiguous_auth()) {
+    dout(10) << " waiting for single auth on " << *base << dendl;
+    base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
+    return;
+  }
+
+  // suppress duplicates (same overly-conservative heuristic as
+  // discover_dir_frag); a null onfinish always sends.
+  if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) {  // this is overly conservative
+    MDiscover *dis = new MDiscover(mds->get_nodeid(),
+				   base->ino(),
+				   want_path,
+				   true,  // we want the base dir; we are relative to ino.
+				   want_xlocked);
+    mds->send_message_mds(dis, from, MDS_PORT_CACHE);
+  }
+
+  // register + wait
+  if (onfinish) base->add_waiter(CInode::WAIT_DIR, onfinish);
+  discover_dir[from][base->ino()]++;
+}
+
+// Discover the dentries/inodes along want_path, relative to dirfrag 'base'
+// (no base dir needed — we already have it).  Waiters are keyed on the
+// first path component via the dir's dentry-waiter list, and the request
+// is counted in discover_dir_sub for kick_discovers().
+void MDCache::discover_path(CDir *base,
+			    filepath want_path,
+			    Context *onfinish,
+			    bool want_xlocked)
+{
+  int from = base->authority().first;
+
+  dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " from mds" << from
+	  << (want_xlocked ? " want_xlocked":"")
+	  << dendl;
+
+  if (base->is_ambiguous_auth()) {
+    dout(7) << " waiting for single auth on " << *base << dendl;
+    base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
+    return;
+  }
+
+  // suppress duplicates: send only if nobody waits on the first dentry yet,
+  // or if no waiter context was supplied.
+  if (!base->is_waiting_for_dentry(want_path[0]) || !onfinish) {
+    MDiscover *dis = new MDiscover(mds->get_nodeid(),
+				   base->ino(),
+				   want_path,
+				   false,  // no base dir; we are relative to dir
+				   want_xlocked);
+    mds->send_message_mds(dis, from, MDS_PORT_CACHE);
+  }
+
+  // register + wait
+  if (onfinish) base->add_dentry_waiter(want_path[0], onfinish);
+  discover_dir_sub[from][base->dirfrag()]++;
+}
+
+// Discover a specific inode (by ino) believed to live in dirfrag 'base'.
+// Unlike the path variants, the waiter is registered unconditionally on the
+// dir's per-ino waiter list; duplicate sends are suppressed via
+// is_waiting_for_ino().  Tracked in discover_dir_sub for kick_discovers().
+void MDCache::discover_ino(CDir *base,
+			   inodeno_t want_ino,
+			   Context *onfinish,
+			   bool want_xlocked)
+{
+  int from = base->authority().first;
+
+  dout(7) << "discover_ino " << base->dirfrag() << " " << want_ino << " from mds" << from
+	  << (want_xlocked ? " want_xlocked":"")
+	  << dendl;
+
+  if (!base->is_waiting_for_ino(want_ino)) {
+    MDiscover *dis = new MDiscover(mds->get_nodeid(),
+				   base->dirfrag(),
+				   want_ino,
+				   want_xlocked);
+    mds->send_message_mds(dis, from, MDS_PORT_CACHE);
+  }
+
+  // register + wait
+  base->add_ino_waiter(want_ino, onfinish);
+  discover_dir_sub[from][base->dirfrag()]++;
+}
+
+
+
+// Wake every context waiting on a discover we had outstanding to mds 'who'
+// (the replies will never come, e.g. because 'who' failed), then drop all
+// three per-mds tracking structures for 'who'.
+void MDCache::kick_discovers(int who)
+{
+  list<Context*> waiters;
+
+  // base-ino discovers: the contexts are stored here directly.
+  for (hash_map<inodeno_t, list<Context*> >::iterator p = waiting_for_base_ino[who].begin();
+       p != waiting_for_base_ino[who].end();
+       ++p) {
+    dout(10) << "kick_discovers on base ino " << p->first << dendl;
+    mds->queue_waiters(p->second);
+  }
+  waiting_for_base_ino.erase(who);
+
+  // dirfrag discovers: waiters hang off the base inode's WAIT_DIR list.
+  for (hash_map<inodeno_t,int>::iterator p = discover_dir[who].begin();
+       p != discover_dir[who].end();
+       ++p) {
+    CInode *in = get_inode(p->first);
+    if (!in) continue;  // inode may have been trimmed since
+    dout(10) << "kick_discovers dir waiters on " << *in << dendl;
+    in->take_waiting(CInode::WAIT_DIR, waiters);
+  }
+  discover_dir.erase(who);
+
+  // dentry/ino discovers: waiters hang off the base dirfrag.
+  for (hash_map<dirfrag_t,int>::iterator p = discover_dir_sub[who].begin();
+       p != discover_dir_sub[who].end();
+       ++p) {
+    CDir *dir = get_dirfrag(p->first);
+    if (!dir) continue;  // dirfrag may have been trimmed since
+    dout(10) << "kick_discovers dentry+ino waiters on " << *dir << dendl;
+    dir->take_sub_waiting(waiters);
+  }
+  discover_dir_sub.erase(who);
+
+  mds->queue_waiters(waiters);
+}
+
void MDCache::handle_discover(MDiscover *dis)
CInode *cur = 0;
- MDiscoverReply *reply = new MDiscoverReply(dis->get_base_ino());
+ MDiscoverReply *reply = new MDiscoverReply(dis);
// get started.
if (dis->get_base_ino() == MDS_INO_ROOT) {
if (!cur) {
dout(7) << "handle_discover mds" << dis->get_asker()
<< " don't have base ino " << dis->get_base_ino()
- << ", dropping" << dendl;
- delete reply;
- return;
+ << dendl;
+ reply->set_flag_error_dir();
}
if (dis->wants_base_dir()) {
dout(7) << "handle_discover mds" << dis->get_asker()
- << " has " << *cur
<< " wants basedir+" << dis->get_want().get_path()
+ << " has " << *cur
<< dendl;
} else {
dout(7) << "handle_discover mds" << dis->get_asker()
- << " has " << *cur
<< " wants " << dis->get_want().get_path()
+ << " has " << *cur
<< dendl;
}
}
assert(reply);
- assert(cur);
// add content
// do some fidgeting to include a dir if they asked for the base dir, or just root.
for (unsigned i = 0;
- i < dis->get_want().depth() || dis->get_want().depth() == 0;
+ cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
i++) {
// -- figure out the dir
} else {
// requester explicity specified the frag
fg = dis->get_base_dir_frag();
- assert(dis->wants_base_dir() || dis->get_base_ino() < MDS_INO_BASE);
+ assert(dis->wants_base_dir() || dis->get_want_ino() || dis->get_base_ino() < MDS_INO_BASE);
}
CDir *curdir = cur->get_dirfrag(fg);
if ((!curdir && !cur->is_auth()) ||
(curdir && !curdir->is_auth())) {
- if (curdir) {
- dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
- reply->set_dir_auth_hint(curdir->authority().first);
- } else {
- dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " << *cur << dendl;
- reply->set_dir_auth_hint(cur->authority().first);
+
+ /* before:
+ * ONLY set flag if empty!!
+ * otherwise requester will wake up waiter(s) _and_ continue with discover,
+ * resulting in duplicate discovers in flight,
+ * which can wreak havoc when discovering rename srcdn (which may move)
+ */
+
+ if (reply->is_empty()) {
+ // only hint if empty.
+ // someday this could be better, but right now the waiter logic isn't smart enough.
+
+ // hint
+ if (curdir) {
+ dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
+ reply->set_dir_auth_hint(curdir->authority().first);
+ } else {
+ dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
+ << *cur << dendl;
+ reply->set_dir_auth_hint(cur->authority().first);
+ }
+
+ // note error dentry, if any
+ // NOTE: important, as it allows requester to issue an equivalent discover
+ // to whomever we hint at.
+ if (dis->get_want().depth() > i)
+ reply->set_error_dentry(dis->get_dentry(i));
}
- reply->set_wanted_xlocks_hint(dis->wants_xlocked());
-
- // set hint (+ dentry, if there is one)
- if (dis->get_want().depth() > i)
- reply->set_error_dentry(dis->get_dentry(i));
+
break;
}
reply->add_dir( curdir->replicate_to(dis->get_asker()) );
dout(7) << "handle_discover added dir " << *curdir << dendl;
}
- if (dis->get_want().depth() == 0) break;
-
- // lookup inode?
+
+ // lookup
CDentry *dn = 0;
if (dis->get_want_ino()) {
+ // lookup by ino
CInode *in = get_inode(dis->get_want_ino());
if (in && in->is_auth() && in->get_parent_dn()->get_dir() == curdir)
dn = in->get_parent_dn();
- } else {
+ } else if (dis->get_want().depth() > 0) {
// lookup dentry
dn = curdir->lookup( dis->get_dentry(i) );
- }
-
+ } else
+ break; // done!
+
// incomplete dir?
if (!dn) {
if (!curdir->is_complete()) {
}
}
+ // frozen inode?
+ if (dn->is_primary() &&
+ dn->inode->is_frozen()) {
+ if (reply->is_empty()) {
+ dout(7) << *dn->inode << " is frozen, empty reply, waiting" << dendl;
+ dn->inode->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
+ delete reply;
+ return;
+ } else {
+ dout(7) << *dn->inode << " is frozen, non-empty reply, stopping" << dendl;
+ break;
+ }
+ }
+
// add dentry
reply->add_dentry( dn->replicate_to( dis->get_asker() ) );
dout(7) << "handle_discover added dentry " << *dn << dendl;
}
// how did we do?
- if (reply->is_empty()) {
- dout(7) << "handle_discover dropping this empty reply)." << dendl;
- delete reply;
- } else {
- dout(7) << "handle_discover sending result back to asker mds" << dis->get_asker() << dendl;
- mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE);
- }
+ assert(!reply->is_empty());
+ dout(7) << "handle_discover sending result back to asker mds" << dis->get_asker() << dendl;
+ mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE);
- // done.
delete dis;
}
}
*/
- // starting point
list<Context*> finished, error;
-
- // grab base inode
+ int from = m->get_source().num();
+
+ // starting point
CInode *cur = get_inode(m->get_base_ino());
-
- if (cur) {
- dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << dendl;
- }
- else if (m->get_base_ino() == MDS_INO_ROOT) {
- // it's the root inode.
- assert(!root);
+
+ if (m->has_base_inode()) {
+ assert(m->get_base_ino() < MDS_INO_BASE);
assert(!m->has_base_dentry());
assert(!m->has_base_dir());
-
- dout(7) << "discover_reply root + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << dendl;
-
- // add in root
- cur = add_replica_inode(m->get_inode(0), NULL);
- cur->force_auth = pair<int,int>(m->get_source().num(), CDIR_AUTH_UNKNOWN);
- set_root(cur);
- dout(7) << "discover_reply got root " << *cur << dendl;
-
- // take root waiters
- finished.swap(waiting_for_root);
- }
- else if (MDS_INO_IS_STRAY(m->get_base_ino())) {
- dout(7) << "discover_reply stray + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << dendl;
-
- // add
- cur = add_replica_inode(m->get_inode(0), NULL);
+
+ // add base inode
+ cur = add_replica_inode(m->get_inode(0), NULL, finished);
cur->force_auth = pair<int,int>(m->get_source().num(), CDIR_AUTH_UNKNOWN);
- dout(7) << "discover_reply got stray " << *cur << dendl;
+ dout(7) << "discover_reply got base inode " << *cur << dendl;
// take waiters
- finished.swap(waiting_for_stray[cur->ino()]);
- waiting_for_stray.erase(cur->ino());
+ finished.swap(waiting_for_base_ino[from][cur->ino()]);
+ waiting_for_base_ino[from].erase(cur->ino());
}
+ assert(cur);
+ dout(7) << "discover_reply " << *cur
+ << " + " << m->get_num_dentries() << " dn, "
+ << m->get_num_inodes() << " inodes"
+ << dendl;
+
// fyi
- if (m->is_flag_error_dir()) dout(7) << " flag error, dir" << dendl;
- if (m->is_flag_error_dn()) dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
+ if (m->is_flag_error_dir())
+ dout(7) << " flag error, dir" << dendl;
+ if (m->is_flag_error_dn())
+ dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
+ if (m->is_flag_error_ino())
+ dout(7) << " flag error, ino = " << m->get_wanted_ino() << dendl;
+
dout(10) << "depth = " << m->get_depth()
<< ", has base_dir/base_dn/root = "
<< m->has_base_dir() << " / " << m->has_base_dentry() << " / " << m->has_base_inode()
<< m->get_num_dirs() << " / " << m->get_num_dentries() << " / " << m->get_num_inodes()
<< dendl;
+ // decrement discover counters
+ if (m->get_wanted_base_dir()) {
+ inodeno_t ino = m->get_base_ino();
+ assert(discover_dir[from].count(ino));
+ if (--discover_dir[from][ino] == 0)
+ discover_dir[from].erase(ino);
+ } else if (m->get_base_ino() >= MDS_INO_BASE) {
+ dirfrag_t df(m->get_base_ino(), m->get_base_dir_frag());
+ assert(discover_dir_sub[from].count(df));
+ if (--discover_dir_sub[from][df] == 0)
+ discover_dir_sub[from].erase(df);
+ }
+
// loop over discover results.
- // indexese follow each ([[dir] dentry] inode)
+ // indexes follow each ([[dir] dentry] inode)
// can start, end with any type.
-
for (int i=m->has_base_inode(); i<m->get_depth(); i++) {
dout(10) << "discover_reply i=" << i << " cur " << *cur << dendl;
if (i > 0 || m->has_base_dir()) {
assert(m->get_dir(i).get_dirfrag().ino == cur->ino());
fg = m->get_dir(i).get_dirfrag().frag;
-
- // add/update the dir replica
curdir = add_replica_dir(cur, fg, m->get_dir(i),
m->get_source().num(),
finished);
}
// dentry error?
- if (i == m->get_depth()-1 &&
- m->is_flag_error_dn()) {
+ if (i == m->get_depth()-1 && (m->is_flag_error_dn() || m->is_flag_error_ino())) {
// error!
assert(cur->is_dir());
if (curdir) {
- dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dentry?" << dendl;
- curdir->take_dentry_waiting(m->get_error_dentry(),
- error);
+ if (m->get_error_dentry().length()) {
+ dout(7) << " flag_error on dentry " << m->get_error_dentry()
+ << ", triggering dentry" << dendl;
+ curdir->take_dentry_waiting(m->get_error_dentry(), error);
+ } else {
+ dout(7) << " flag_error on ino " << m->get_wanted_ino()
+ << ", triggering ino" << dendl;
+ curdir->take_ino_waiting(m->get_wanted_ino(), error);
+ }
} else {
- dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dir?" << dendl;
+ dout(7) << " flag_error on dentry " << m->get_error_dentry()
+ << ", triggering dir?" << dendl;
cur->take_waiting(CInode::WAIT_DIR, error);
- dir_discovers.erase(cur->ino());
}
break;
}
// dentry
CDentry *dn = 0;
if (i >= m->get_last_dentry()) break;
- if (i > 0 || m->has_base_dentry()) {
+ if (i > 0 || m->has_base_dentry())
dn = add_replica_dentry(curdir, m->get_dentry(i), finished);
- }
// inode
if (i >= m->get_last_inode()) break;
- cur = add_replica_inode(m->get_inode(i), dn);
+ cur = add_replica_inode(m->get_inode(i), dn, finished);
}
- // dir_auth hint?
- if (m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN &&
- m->get_dir_auth_hint() != mds->get_nodeid()) {
- dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
-
- // try again. include dentry _and_ dirfrag, just in case.
- int hint = m->get_dir_auth_hint();
- filepath want;
- want.push_dentry(m->get_error_dentry());
- MDiscover *dis = new MDiscover(mds->get_nodeid(),
- cur->ino(),
- want,
- true,
- m->get_wanted_xlocks_hint());
- frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
- dis->set_base_dir_frag(fg);
- mds->send_message_mds(dis, hint, MDS_PORT_CACHE);
-
- // note the dangling discover... but only if it's already noted in dir_discovers (i.e. someone is waiting)
- if (dir_discovers.count(cur->ino())) {
- dir_discovers[cur->ino()].insert(hint);
- assert(cur->is_waiter_for(CInode::WAIT_DIR));
- }
- }
- else if (m->is_flag_error_dir()) {
- // dir error at the end there?
- dout(7) << " flag_error on dir " << *cur << dendl;
- assert(!cur->is_dir());
+ // dir error?
+ // or dir_auth hint?
+ if (m->is_flag_error_dir() && !cur->is_dir()) {
+ // not a dir.
cur->take_waiting(CInode::WAIT_DIR, error);
- dir_discovers.erase(cur->ino());
+ } else if (m->is_flag_error_dir() ||
+ (m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN &&
+ m->get_dir_auth_hint() != mds->get_nodeid())) {
+ int who = m->get_dir_auth_hint();
+ if (who == mds->get_nodeid()) who = -1;
+ if (who >= 0)
+ dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
+
+ // try again?
+ if (m->get_error_dentry().length()) {
+ // wanted a dentry
+ frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
+ CDir *dir = cur->get_dirfrag(fg);
+ if (dir) {
+	  // don't actually need the hint, now
+ if (dir->lookup(m->get_error_dentry()) == 0 &&
+ dir->is_waiting_for_dentry(m->get_error_dentry()))
+ discover_path(dir, m->get_error_dentry(), 0, m->get_wanted_xlocked());
+ else
+ dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
+ << m->get_error_dentry() << dendl;
+ } else {
+ if (cur->is_waiter_for(CInode::WAIT_DIR))
+ discover_path(cur, m->get_error_dentry(), 0, m->get_wanted_xlocked(), who);
+ else
+ dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
+ }
+ } else {
+ // wanted just the dir
+ frag_t fg = m->get_base_dir_frag();
+ if (cur->get_dirfrag(fg) == 0 && cur->is_waiter_for(CInode::WAIT_DIR))
+ discover_dir_frag(cur, fg, 0, who);
+ else
+ dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
+ }
}
-
- // finish errors directly
- finish_contexts(error, -ENOENT);
+
+ // waiters
+ finish_contexts(error, -ENOENT); // finish errors directly
mds->queue_waiters(finished);
// done
}
+
+// ----------------------------
+// REPLICAS
+
CDir *MDCache::add_replica_dir(CInode *diri,
frag_t fg, CDirDiscover &dis, int from,
list<Context*>& finished)
// get waiters
diri->take_waiting(CInode::WAIT_DIR, finished);
- dir_discovers.erase(diri->ino());
}
return dir;
return dn;
}
-CInode *MDCache::add_replica_inode(CInodeDiscover& dis, CDentry *dn)
+CInode *MDCache::add_replica_inode(CInodeDiscover& dis, CDentry *dn, list<Context*>& finished)
{
CInode *in = get_inode(dis.get_ino());
if (!in) {
if (dn) {
assert(dn->is_primary());
assert(dn->inode == in);
+
+ dn->get_dir()->take_ino_waiting(in->ino(), finished);
}
return in;
// inode
CInodeDiscover indis;
indis._decode(bl, off);
- CInode *strayin = add_replica_inode(indis, NULL);
+ CInode *strayin = add_replica_inode(indis, NULL, finished);
strayin->force_auth = pair<int,int>(from, CDIR_AUTH_UNKNOWN);
dout(15) << "strayin " << *strayin << dendl;
CDentry *straydn = 0;
if (m->strayin) {
list<Context*> finished;
- CInode *in = add_replica_inode(*m->strayin, NULL);
+ CInode *in = add_replica_inode(*m->strayin, NULL, finished);
CDir *dir = add_replica_dir(in, m->straydir->get_dirfrag().frag, *m->straydir,
m->get_source().num(), finished);
straydn = add_replica_dentry(dir, *m->straydn, finished);
*/
void MDCache::fragment_freeze(CInode *diri, list<CDir*>& frags, frag_t basefrag, int bits)
{
- C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, diri, frags, basefrag, bits));
+ C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, diri, frags, basefrag, bits));
// freeze the dirs
for (list<CDir*>::iterator p = frags.begin();
++p) {
CDir *dir = *p;
dir->auth_pin(); // this will block the freeze
- dir->freeze_dir(gather->new_sub());
+ dir->freeze_dir();
+ assert(dir->is_freezing_dir());
+ dir->add_waiter(CDir::WAIT_FROZEN, gather->new_sub());
}
}
int bits;
list<CDir*> resultfrags;
vector<version_t> pvs;
+ LogSegment *ls;
public:
C_MDC_FragmentLogged(MDCache *m, CInode *di, frag_t bf, int b,
- list<CDir*>& rf, vector<version_t>& p) :
- mdcache(m), diri(di), basefrag(bf), bits(b) {
+ list<CDir*>& rf, vector<version_t>& p,
+ LogSegment *s) :
+ mdcache(m), diri(di), basefrag(bf), bits(b), ls(s) {
resultfrags.swap(rf);
pvs.swap(p);
}
virtual void finish(int r) {
mdcache->fragment_logged(diri, basefrag, bits,
- resultfrags, pvs);
+ resultfrags, pvs,
+ ls);
}
};
mds->mdlog->submit_entry(le,
new C_MDC_FragmentLogged(this, diri, basefrag, bits,
- resultfrags, pvs));
+ resultfrags, pvs, mds->mdlog->get_current_segment()));
// announcelist<CDir*>& resultfrags,
for (set<int>::iterator p = peers.begin();
void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits,
list<CDir*>& resultfrags,
- vector<version_t>& pvs)
+ vector<version_t>& pvs,
+ LogSegment *ls)
{
dout(10) << "fragment_logged " << basefrag << " bits " << bits
<< " on " << *diri << dendl;
// dirty, unpin, unfreeze
dir->state_clear(CDir::STATE_FRAGMENTING);
- dir->mark_dirty(*pv);
+ dir->mark_dirty(*pv, ls);
pv++;
for (CDir::map_t::iterator p = dir->items.begin();
return; // i won't print anything.
if (subtrees.empty()) {
- dout(dbl) << "no subtrees" << dendl;
+ dout(dbl) << "show_subtrees - no subtrees" << dendl;
return;
}
// root frags
- list<CDir*> rootfrags;
- if (root) root->get_dirfrags(rootfrags);
- if (stray) stray->get_dirfrags(rootfrags);
- dout(15) << "rootfrags " << rootfrags << dendl;
+ list<CDir*> basefrags;
+ for (set<CInode*>::iterator p = base_inodes.begin();
+ p != base_inodes.end();
+ ++p)
+ (*p)->get_dirfrags(basefrags);
+ //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
+ dout(15) << "show_subtrees" << dendl;
// queue stuff
list<pair<CDir*,int> > q;
set<CDir*> seen;
// calc max depth
- for (list<CDir*>::iterator p = rootfrags.begin(); p != rootfrags.end(); ++p)
+ for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
q.push_back(pair<CDir*,int>(*p, 0));
+ set<CDir*> subtrees_seen;
+
int depth = 0;
while (!q.empty()) {
CDir *dir = q.front().first;
if (subtrees.count(dir) == 0) continue;
+ subtrees_seen.insert(dir);
+
if (d > depth) depth = d;
// sanity check
// print tree
- for (list<CDir*>::iterator p = rootfrags.begin(); p != rootfrags.end(); ++p)
+ for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
q.push_back(pair<CDir*,int>(*p, 0));
while (!q.empty()) {
q.push_front(pair<CDir*,int>(*p, d+2));
}
}
+
+ // verify there isn't stray crap in subtree map
+ int lost = 0;
+ for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ if (subtrees_seen.count(p->first)) continue;
+ dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
+ lost++;
+ }
+ assert(lost == 0);
}
class MMDSFragmentNotify;
+class ESubtreeMap;
+
+
// MDCache
//typedef const char* pchar;
// -- i am a client (master) request
MClientRequest *client_request; // client request (if any)
- set<int> slaves; // mds nodes that have slave requests to me (implies client_request)
- set<int> waiting_on_slave; // peers i'm waiting for slavereq replies from.
vector<CDentry*> trace; // original path traversal.
CInode *ref; // reference inode. if there is only one, and its path is pinned.
MMDSSlaveRequest *slave_request; // slave request (if one is pending; implies slave == true)
int slave_to_mds; // this is a slave request if >= 0.
+ // -- misc --
+ LogSegment *ls; // the log segment i'm committing to
+ utime_t now;
+
// -- my pins and locks --
// cache pins (so things don't expire)
set< MDSCacheObject* > pins;
set<CInode*> stickydirs;
// auth pins
+ set< MDSCacheObject* > remote_auth_pins;
set< MDSCacheObject* > auth_pins;
// held locks
bool committing;
bool aborted;
- // for rename/link/unlink
- utime_t now;
- set<int> witnessed; // nodes who have journaled a RenamePrepare
- map<MDSCacheObject*,version_t> pvmap;
-
- // for rename
- set<int> extra_witnesses; // replica list from srcdn auth (rename)
- version_t src_reanchor_atid; // src->dst
- version_t dst_reanchor_atid; // dst->stray
- bufferlist inode_import;
- version_t inode_import_v;
- CDentry *srcdn; // srcdn, if auth, on slave
-
- // called when slave commits
- Context *slave_commit;
+ struct More {
+ set<int> slaves; // mds nodes that have slave requests to me (implies client_request)
+ set<int> waiting_on_slave; // peers i'm waiting for slavereq replies from.
+
+ // for rename/link/unlink
+ set<int> witnessed; // nodes who have journaled a RenamePrepare
+ map<MDSCacheObject*,version_t> pvmap;
+
+ // for rename
+ set<int> extra_witnesses; // replica list from srcdn auth (rename)
+ version_t src_reanchor_atid; // src->dst
+ version_t dst_reanchor_atid; // dst->stray
+ bufferlist inode_import;
+ version_t inode_import_v;
+ CInode* destdn_was_remote_inode;
+ bool was_link_merge;
+
+ // called when slave commits or aborts
+ Context *slave_commit;
+
+ More() :
+ src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0),
+ destdn_was_remote_inode(0), was_link_merge(false),
+ slave_commit(0) { }
+ } *_more;
// ---------------------------------------------------
MDRequest() :
client_request(0), ref(0),
slave_request(0), slave_to_mds(-1),
+ ls(0),
done_locking(false), committing(false), aborted(false),
- src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0),
- slave_commit(0) { }
+ _more(0) {}
MDRequest(metareqid_t ri, MClientRequest *req) :
reqid(ri), client_request(req), ref(0),
slave_request(0), slave_to_mds(-1),
+ ls(0),
done_locking(false), committing(false), aborted(false),
- src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0),
- slave_commit(0) { }
+ _more(0) {}
MDRequest(metareqid_t ri, int by) :
reqid(ri), client_request(0), ref(0),
slave_request(0), slave_to_mds(by),
+ ls(0),
done_locking(false), committing(false), aborted(false),
- src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0),
- slave_commit(0) { }
+ _more(0) {}
+ ~MDRequest() {
+ delete _more;
+ }
bool is_master() { return slave_to_mds < 0; }
bool is_slave() { return slave_to_mds >= 0; }
- bool slave_did_prepare() { return slave_commit; }
+ More* more() {
+ if (!_more) _more = new More();
+ return _more;
+ }
+
+ bool slave_did_prepare() { return more()->slave_commit; }
+
// pin items in cache
void pin(MDSCacheObject *o) {
if (pins.count(o) == 0) {
// auth pins
bool is_auth_pinned(MDSCacheObject *object) {
- return auth_pins.count(object);
+ return auth_pins.count(object) || remote_auth_pins.count(object);
}
void auth_pin(MDSCacheObject *object) {
if (!is_auth_pinned(object)) {
auth_pins.insert(object);
}
}
+ void auth_unpin(MDSCacheObject *object) {
+ assert(is_auth_pinned(object));
+ object->auth_unpin();
+ auth_pins.erase(object);
+ }
void drop_local_auth_pins() {
- set<MDSCacheObject*>::iterator it = auth_pins.begin();
- while (it != auth_pins.end()) {
- if ((*it)->is_auth()) {
- (*it)->auth_unpin();
- auth_pins.erase(it++);
- } else {
- it++;
- }
+ for (set<MDSCacheObject*>::iterator it = auth_pins.begin();
+ it != auth_pins.end();
+ it++) {
+ assert((*it)->is_auth());
+ (*it)->auth_unpin();
}
auth_pins.clear();
}
return out;
}
+struct MDSlaveUpdate {
+ EMetaBlob commit;
+ EMetaBlob rollback;
+ xlist<MDSlaveUpdate*>::item xlistitem;
+ Context *waiter;
+ MDSlaveUpdate() : xlistitem(this), waiter(0) {}
+ MDSlaveUpdate(EMetaBlob c, EMetaBlob r, xlist<MDSlaveUpdate*> &list) :
+ commit(c), rollback(r),
+ xlistitem(this),
+ waiter(0) {
+ list.push_back(&xlistitem);
+ }
+ ~MDSlaveUpdate() {
+ if (waiter) waiter->finish(0);
+ delete waiter;
+ }
+};
+
+
class MDCache {
public:
// my master
MDS *mds;
- LRU lru; // dentry lru for expiring items from cache
-
+ // -- my cache --
+ LRU lru; // dentry lru for expiring items from cache
protected:
- // the cache
- CInode *root; // root inode
- hash_map<inodeno_t,CInode*> inode_map; // map of inodes by ino
- CInode *stray; // my stray dir
+ hash_map<inodeno_t,CInode*> inode_map; // map of inodes by ino
+ CInode *root; // root inode
+ CInode *stray; // my stray dir
+
+ set<CInode*> base_inodes; // inodes < MDS_INO_BASE (root, stray, etc.)
+
+ // -- discover --
+ // waiters
+ map<int, hash_map<inodeno_t, list<Context*> > > waiting_for_base_ino;
+
+ // in process discovers, by mds.
+ // this is just enough info to kick any waiters in the event of a failure.
+ // FIXME: use pointers here instead of identifiers?
+ map<int, hash_map<inodeno_t,int> > discover_dir;
+ map<int, hash_map<dirfrag_t,int> > discover_dir_sub;
+
+ void discover_base_ino(inodeno_t want_ino, Context *onfinish, int from=-1);
+ void discover_dir_frag(CInode *base, frag_t approx_fg, Context *onfinish,
+ int from=-1);
+ void discover_path(CInode *base, filepath want_path, Context *onfinish,
+ bool want_xlocked=false, int from=-1);
+ void discover_path(CDir *base, filepath want_path, Context *onfinish,
+ bool want_xlocked=false);
+ void discover_ino(CDir *base, inodeno_t want_ino, Context *onfinish,
+ bool want_xlocked=false);
+
+ void kick_discovers(int who); // after a failure.
- // root
- list<Context*> waiting_for_root;
- map<inodeno_t,list<Context*> > waiting_for_stray;
public:
int get_num_inodes() { return inode_map.size(); }
adjust_bounded_subtree_auth(dir, bounds, pair<int,int>(a, CDIR_AUTH_UNKNOWN));
}
void map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result);
- void adjust_export_state(CDir *dir);
void try_subtree_merge(CDir *root);
void try_subtree_merge_at(CDir *root);
- void subtree_merge_writebehind_finish(CInode *in);
+ void subtree_merge_writebehind_finish(CInode *in, LogSegment *ls);
void eval_subtree_root(CDir *dir);
CDir *get_subtree_root(CDir *dir);
void remove_subtree(CDir *dir);
// delayed cache expire
map<CDir*, map<int, MCacheExpire*> > delayed_expire; // subtree root -> expire msg
- // -- discover --
- hash_map<inodeno_t, set<int> > dir_discovers; // dirino -> mds set i'm trying to discover.
-
// -- requests --
-public:
-
-
protected:
hash_map<metareqid_t, MDRequest*> active_requests;
-
+
public:
MDRequest* request_start(MClientRequest *req);
MDRequest* request_start_slave(metareqid_t rid, int by);
// inode purging
- map<inodeno_t, map<off_t, inode_t> > purging;
- map<inodeno_t, map<off_t, list<Context*> > > waiting_for_purge;
+ map<CInode*, map<off_t, off_t> > purging; // inode -> newsize -> oldsize
+ map<CInode*, map<off_t, LogSegment*> > purging_ls;
+ map<CInode*, map<off_t, list<Context*> > > waiting_for_purge;
- // shutdown crap
- int shutdown_commits;
- bool did_shutdown_log_cap;
- friend class C_MDC_ShutdownCommit;
-
// -- recovery --
protected:
set<int> recovery_set;
// from MMDSResolves
map<int, map<dirfrag_t, list<dirfrag_t> > > other_ambiguous_imports;
- map<int, map<metareqid_t, EMetaBlob> > uncommitted_slave_updates; // for replay.
+ map<int, map<metareqid_t, MDSlaveUpdate> > uncommitted_slave_updates; // for replay.
map<metareqid_t, bool> ambiguous_slave_updates; // for log trimming.
map<metareqid_t, Context*> waiting_for_slave_update_commit;
friend class ESlaveUpdate;
void send_resolve_now(int who);
void send_resolve_later(int who);
void maybe_send_pending_resolves();
- void log_subtree_map(Context *onsync=0);
- void _logged_subtree_map(off_t off);
+
+ ESubtreeMap *create_subtree_map();
+
protected:
// [rejoin]
set<int> rejoin_gather; // nodes from whom i need a rejoin
+ set<int> rejoin_sent; // nodes i sent a rejoin to
set<int> rejoin_ack_gather; // nodes from whom i need a rejoin ack
map<inodeno_t,map<int,inode_caps_reconnect_t> > cap_exports; // ino -> client -> capex
bool shutdown_pass();
bool shutdown(); // clear cache (i.e. at shutdown)
+ bool did_shutdown_log_cap;
+
// inode_map
bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? true:false; }
CInode* get_inode( inodeno_t ino ) {
public:
// inode purging
- void purge_inode(inode_t *inode, off_t newsize);
- void _do_purge_inode(inode_t *inode, off_t newsize);
- void purge_inode_finish(inodeno_t ino, off_t newsize);
- void purge_inode_finish_2(inodeno_t ino, off_t newsize);
- bool is_purging(inodeno_t ino, off_t newsize) {
- return purging.count(ino) && purging[ino].count(newsize);
+ void purge_inode(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls);
+ void _do_purge_inode(CInode *in, off_t newsize, off_t oldsize);
+ void purge_inode_finish(CInode *in, off_t newsize, off_t oldsize);
+ void purge_inode_finish_2(CInode *in, off_t newsize, off_t oldsize);
+ bool is_purging(CInode *in, off_t newsize, off_t oldsize) {
+ return purging.count(in) && purging[in].count(newsize);
}
- void wait_for_purge(inodeno_t ino, off_t newsize, Context *c) {
- waiting_for_purge[ino][newsize].push_back(c);
+ void wait_for_purge(CInode *in, off_t newsize, Context *c) {
+ waiting_for_purge[in][newsize].push_back(c);
}
- void add_recovered_purge(const inode_t& inode, off_t newsize);
- void remove_recovered_purge(inodeno_t ino, off_t newsize);
+ void add_recovered_purge(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls);
+ void remove_recovered_purge(CInode *in, off_t newsize, off_t oldsize);
void start_recovered_purges();
vector<Anchor>& anchortrace,
Context *onfinish);
- bool parallel_fetch(map<inodeno_t,string>& pathmap,
- Context *c);
+ C_Gather *parallel_fetch(map<inodeno_t,string>& pathmap);
void make_trace(vector<CDentry*>& trace, CInode *in);
void anchor_destroy(CInode *in, Context *onfinish);
protected:
void _anchor_create_prepared(CInode *in, version_t atid);
- void _anchor_create_logged(CInode *in, version_t atid, version_t pdv);
+ void _anchor_create_logged(CInode *in, version_t atid, LogSegment *ls);
void _anchor_destroy_prepared(CInode *in, version_t atid);
- void _anchor_destroy_logged(CInode *in, version_t atid, version_t pdv);
+ void _anchor_destroy_logged(CInode *in, version_t atid, LogSegment *ls);
friend class C_MDC_AnchorCreatePrepared;
friend class C_MDC_AnchorCreateLogged;
void eval_stray(CDentry *dn);
protected:
void _purge_stray(CDentry *dn);
- void _purge_stray_logged(CDentry *dn, version_t pdv);
+ void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls);
friend class C_MDC_PurgeStray;
void reintegrate_stray(CDentry *dn, CDentry *rlink);
void migrate_stray(CDentry *dn, int dest);
CDir* forge_replica_dir(CInode *diri, frag_t fg, int from);
CDentry *add_replica_dentry(CDir *dir, CDentryDiscover &dis, list<Context*>& finished);
- CInode *add_replica_inode(CInodeDiscover& dis, CDentry *dn);
+public: // for Server::handle_slave_rename_prep
+ CInode *add_replica_inode(CInodeDiscover& dis, CDentry *dn, list<Context*>& finished);
public:
CDentry *add_replica_stray(bufferlist &bl, CInode *strayin, int from);
void fragment_mark_and_complete(CInode *diri, list<CDir*>& startfrags, frag_t basefrag, int bits);
void fragment_go(CInode *diri, list<CDir*>& startfrags, frag_t basefrag, int bits);
void fragment_stored(CInode *diri, frag_t basefrag, int bits, list<CDir*>& resultfrags);
- void fragment_logged(CInode *diri, frag_t basefrag, int bits, list<CDir*>& resultfrags, vector<version_t>& pvs);
+ void fragment_logged(CInode *diri, frag_t basefrag, int bits, list<CDir*>& resultfrags, vector<version_t>& pvs, LogSegment *ls);
friend class C_MDC_FragmentGo;
friend class C_MDC_FragmentMarking;
friend class C_MDC_FragmentStored;
#include "common/LogType.h"
#include "common/Logger.h"
+#include "events/ESubtreeMap.h"
+
#include "config.h"
#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".log "
static bool didit = false;
if (!didit) {
didit = true;
- mdlog_logtype.add_inc("add");
- mdlog_logtype.add_inc("obs");
- mdlog_logtype.add_inc("trims");
- mdlog_logtype.add_inc("trimf");
- mdlog_logtype.add_inc("trimng");
- mdlog_logtype.add_set("size");
- mdlog_logtype.add_set("rdpos");
+ mdlog_logtype.add_inc("evadd");
+ mdlog_logtype.add_inc("evex");
+ mdlog_logtype.add_inc("evtrm");
+ mdlog_logtype.add_set("ev");
+ mdlog_logtype.add_set("evexg");
+ mdlog_logtype.add_set("evexd");
+
+ mdlog_logtype.add_inc("segadd");
+ mdlog_logtype.add_inc("segex");
+ mdlog_logtype.add_inc("segtrm");
+ mdlog_logtype.add_set("seg");
+ mdlog_logtype.add_set("segexg");
+ mdlog_logtype.add_set("segexd");
+
+ mdlog_logtype.add_set("expos");
mdlog_logtype.add_set("wrpos");
+
mdlog_logtype.add_avg("jlat");
}
journaler = new Journaler(log_inode, mds->objecter, logger, &mds->mds_lock);
}
+void MDLog::write_head(Context *c)
+{
+ journaler->write_head(c);
+}
+
+off_t MDLog::get_read_pos()
+{
+ return journaler->get_read_pos();
+}
+
+off_t MDLog::get_write_pos()
+{
+ return journaler->get_write_pos();
+}
+
-void MDLog::reset()
+void MDLog::create(Context *c)
{
- dout(5) << "reset to empty log" << dendl;
+ dout(5) << "create empty log" << dendl;
init_journaler();
journaler->reset();
+ write_head(c);
+
+ logger->set("expos", journaler->get_expire_pos());
+ logger->set("wrpos", journaler->get_write_pos());
}
void MDLog::open(Context *c)
dout(5) << "open discovering log bounds" << dendl;
init_journaler();
journaler->recover(c);
+
+ // either append() or replay() will follow.
}
void MDLog::append()
dout(5) << "append positioning at end" << dendl;
journaler->set_read_pos(journaler->get_write_pos());
journaler->set_expire_pos(journaler->get_write_pos());
-}
-
-void MDLog::write_head(Context *c)
-{
- journaler->write_head(c);
-}
-
-off_t MDLog::get_read_pos()
-{
- return journaler->get_read_pos();
+ logger->set("expos", journaler->get_write_pos());
}
-off_t MDLog::get_write_pos()
-{
- return journaler->get_write_pos();
-}
+// -------------------------------------------------
void MDLog::submit_entry( LogEvent *le, Context *c )
{
- if (g_conf.mds_log) {
- dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << dendl;
-
- // encode it, with event type
- {
- bufferlist bl;
- bl.append((char*)&le->_type, sizeof(le->_type));
- le->encode_payload(bl);
-
- // journal it.
- journaler->append_entry(bl); // bl is destroyed.
- }
-
- assert(!capped);
-
- delete le;
- num_events++;
-
- if (logger) {
- logger->inc("add");
- logger->set("size", num_events);
- logger->set("wrpos", journaler->get_write_pos());
- }
-
- if (c) {
- unflushed = 0;
- journaler->flush(c);
- }
- else
- unflushed++;
-
- // should we log a new import_map?
- // FIXME: should this go elsewhere?
- if (!writing_subtree_map &&
- (journaler->get_write_pos() / log_inode.layout.period()) !=
- (get_last_subtree_map_offset() / log_inode.layout.period()) &&
- (journaler->get_write_pos() - get_last_subtree_map_offset() > log_inode.layout.period()/2)) {
- // log import map
- dout(10) << "submit_entry also logging subtree map: last = " << get_last_subtree_map_offset()
- << ", cur pos = " << journaler->get_write_pos() << dendl;
- mds->mdcache->log_subtree_map();
- }
-
- } else {
+ if (!g_conf.mds_log) {
// hack: log is disabled.
if (c) {
c->finish(0);
delete c;
}
+ return;
+ }
+
+ dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << dendl;
+
+ // let the event register itself in the segment
+ assert(!segments.empty());
+ le->_segment = segments.rbegin()->second;
+ le->_segment->num_events++;
+ le->update_segment();
+
+ num_events++;
+ assert(!capped);
+
+ // encode it, with event type
+ {
+ bufferlist bl;
+ ::_encode(le->_type, bl);
+ le->encode_payload(bl);
+
+ // journal it.
+ journaler->append_entry(bl); // bl is destroyed.
+ }
+
+ delete le;
+
+ if (logger) {
+ logger->inc("evadd");
+ logger->set("ev", num_events);
+ logger->set("wrpos", journaler->get_write_pos());
+ }
+
+ if (c) {
+ unflushed = 0;
+ journaler->flush(c);
+ }
+ else
+ unflushed++;
+
+ // start a new segment?
+ // FIXME: should this go elsewhere?
+ off_t last_seg = get_last_segment_offset();
+ if (!segments.empty() &&
+ !writing_subtree_map &&
+ (journaler->get_write_pos() / log_inode.layout.period()) != (last_seg / log_inode.layout.period()) &&
+ (journaler->get_write_pos() - last_seg > log_inode.layout.period()/2)) {
+ dout(10) << "submit_entry also starting new segment: last = " << last_seg
+ << ", cur pos = " << journaler->get_write_pos() << dendl;
+ start_new_segment();
}
}
unflushed = 0;
// trim
- trim(NULL);
+ trim();
}
void MDLog::cap()
{
dout(5) << "cap" << dendl;
capped = true;
- kick_subtree_map();
}
-// trim
+// -----------------------------
+// segments
-class C_MDL_Trimmed : public Context {
-public:
- MDLog *mdl;
- LogEvent *le;
+void MDLog::start_new_segment(Context *onsync)
+{
+ dout(7) << "start_new_segment at " << journaler->get_write_pos() << dendl;
+ assert(!writing_subtree_map);
- C_MDL_Trimmed(MDLog *mdl, LogEvent *le) {
- this->mdl = mdl;
- this->le = le;
- }
- void finish(int res) {
- mdl->_trimmed(le);
- }
-};
+ segments[journaler->get_write_pos()] = new LogSegment(journaler->get_write_pos());
-class C_MDL_Reading : public Context {
-public:
- MDLog *mdl;
- C_MDL_Reading(MDLog *m) {
- mdl = m;
- }
- void finish(int res) {
- mdl->_did_read();
- }
-};
+ writing_subtree_map = true;
+ ESubtreeMap *le = mds->mdcache->create_subtree_map();
+ submit_entry(le, new C_MDL_WroteSubtreeMap(this, mds->mdlog->get_write_pos()));
+ if (onsync)
+ wait_for_sync(onsync);
+
+ logger->inc("segadd");
+ logger->set("seg", segments.size());
+}
-void MDLog::_did_read()
+void MDLog::_logged_subtree_map(off_t off)
{
- dout(5) << "_did_read()" << dendl;
- waiting_for_read = false;
- trim(0);
+ dout(10) << "_logged_subtree_map at " << off << dendl;
+ writing_subtree_map = false;
+
+ /*
+ list<Context*> ls;
+ take_subtree_map_expire_waiters(ls);
+ mds->queue_waiters(ls);
+ */
}
-void MDLog::_trimmed(LogEvent *le)
+
+
+void MDLog::trim()
{
- // successful trim?
- if (!le->has_expired(mds)) {
- dout(7) << "retrimming : " << le->get_start_off() << " : " << *le << dendl;
- le->expire(mds, new C_MDL_Trimmed(this, le));
- return;
- }
+ // trim!
+ dout(10) << "trim "
+ << segments.size() << " / " << max_segments << " segments, "
+ << num_events << " / " << max_events << " events"
+ << ", " << expiring_segments.size() << " (" << expiring_events << ") expiring"
+ << ", " << expired_segments.size() << " (" << expired_events << ") expired"
+ << dendl;
- dout(7) << "trimmed : " << le->get_start_off() << " : " << *le << dendl;
+ if (segments.empty()) return;
- bool kick = false;
+ // hack: only trim for a few seconds at a time
+ utime_t stop = g_clock.now();
+ stop += 2.0;
- map<off_t,LogEvent*>::iterator p = trimming.begin();
- if (p->first == le->_start_off) {
- // we trimmed off the front! it must have been a segment head.
- assert(!subtree_maps.empty());
- assert(p->first == *subtree_maps.begin());
- subtree_maps.erase(subtree_maps.begin());
+ map<off_t,LogSegment*>::iterator p = segments.begin();
+ int left = num_events;
+ while (p != segments.end() &&
+ ((max_events >= 0 && left-expiring_events-expired_events > max_events) ||
+ (max_segments >= 0 && (int)(segments.size()-expiring_segments.size()-expired_segments.size()) > max_segments))) {
- // we can expire the log a bit.
- off_t to = get_trimmed_to();
- journaler->set_expire_pos(to);
- journaler->trim();
+ if (stop < g_clock.now())
+ break;
+
+ if ((int)expiring_segments.size() >= g_conf.mds_log_max_expiring)
+ break;
+
+ // look at first segment
+ LogSegment *ls = p->second;
+ assert(ls);
- kick = true;
- } else {
p++;
- // is the next one us?
- if (le->_start_off == p->first) {
- p++;
-
- // did we empty a segment?
- if (subtree_maps.size() >= 2) {
- set<off_t>::iterator segp = subtree_maps.begin();
- assert(*segp < le->_end_off);
- segp++;
- dout(20) << "i ended at " << le->get_end_off()
- << ", next seg starts at " << *segp
- << ", next trimming is " << (p == trimming.end() ? 0:p->first)
- << dendl;
- if (*segp >= le->_end_off &&
- (p == trimming.end() ||
- p->first >= *segp)) {
- dout(10) << "_trimmed segment looks empty" << dendl;
- kick = true;
- }
- } else if (capped && trimming.size() < 3) {
- kick = true; // blech, imprecise
- }
+ left -= ls->num_events;
+
+ if (expiring_segments.count(ls)) {
+ dout(5) << "trim already expiring segment " << ls->offset << ", " << ls->num_events << " events" << dendl;
+ } else if (expired_segments.count(ls)) {
+ dout(5) << "trim already expired segment " << ls->offset << ", " << ls->num_events << " events" << dendl;
+ } else {
+ try_expire(ls);
}
}
+}
- trimming.erase(le->_start_off);
- delete le;
- if (kick)
- kick_subtree_map();
-
- if (logger) {
- logger->inc("trimf");
- logger->set("trimng", trimming.size());
- logger->set("rdpos", journaler->get_read_pos());
+void MDLog::try_expire(LogSegment *ls)
+{
+ C_Gather *exp = ls->try_to_expire(mds);
+ if (exp) {
+ assert(expiring_segments.count(ls) == 0);
+ expiring_segments.insert(ls);
+ expiring_events += ls->num_events;
+ dout(5) << "try_expire expiring segment " << ls->offset << dendl;
+ exp->set_finisher(new C_MaybeExpiredSegment(this, ls));
+ } else {
+ dout(10) << "try_expire expired segment " << ls->offset << dendl;
+ _expired(ls);
}
-
- trim(0);
+
+ logger->set("segexg", expiring_segments.size());
+ logger->set("evexg", expiring_events);
}
-
-
-void MDLog::trim(Context *c)
+void MDLog::_maybe_expired(LogSegment *ls)
{
- // add waiter
- if (c)
- trim_waiters.push_back(c);
-
- // trim!
- dout(10) << "trim " << num_events << " events / " << max_events << " max" << dendl;
+ dout(10) << "_maybe_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl;
+ assert(expiring_segments.count(ls));
+ expiring_segments.erase(ls);
+ expiring_events -= ls->num_events;
+ try_expire(ls);
+}
- // hack: only trim for a few seconds at a time
- utime_t stop = g_clock.now();
- stop += 2.0;
+void MDLog::_expired(LogSegment *ls)
+{
+ dout(5) << "_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl;
- while (num_events > max_events) {
- // don't check the clock on _every_ event, here!
- if (num_events % 100 == 0 &&
- stop < g_clock.now())
- break;
+ if (!capped && ls == get_current_segment()) {
+ dout(5) << "_expired not expiring " << ls->offset << ", last one and !capped" << dendl;
+ } else {
+ // expired.
+ expired_segments.insert(ls);
+ expired_events += ls->num_events;
- off_t gap = journaler->get_write_pos() - journaler->get_read_pos();
- dout(5) << "trim num_events " << num_events << " > max " << max_events
- << ", trimming " << trimming.size()
- << ", byte gap " << gap
- << dendl;
-
- if ((int)trimming.size() >= g_conf.mds_log_max_trimming) {
- dout(7) << "trim already trimming max, waiting" << dendl;
- return;
- }
+ logger->inc("evex", ls->num_events);
+ logger->inc("segex");
- bufferlist bl;
- off_t so = journaler->get_read_pos();
- if (journaler->try_read_entry(bl)) {
- // decode logevent
- LogEvent *le = LogEvent::decode(bl);
- le->_start_off = so;
- le->_end_off = journaler->get_read_pos();
- num_events--;
-
- // we just read an event.
- if (le->has_expired(mds)) {
- // obsolete
- dout(7) << "trim obsolete : " << le->get_start_off() << " : " << *le << dendl;
- delete le;
- if (logger) logger->inc("obs");
- } else {
- assert ((int)trimming.size() < g_conf.mds_log_max_trimming);
-
- // trim!
- dout(7) << "trim expiring : " << le->get_start_off() << " : " << *le << dendl;
- trimming[le->_start_off] = le;
- le->expire(mds, new C_MDL_Trimmed(this, le));
- if (logger) {
- logger->inc("trims");
- logger->set("trimng", trimming.size());
- }
- }
- if (logger) {
- logger->set("rdpos", journaler->get_read_pos());
- logger->set("size", num_events);
- }
- } else {
- // need to read!
- if (!waiting_for_read) {
- waiting_for_read = true;
- dout(7) << "trim waiting for read" << dendl;
- journaler->wait_for_readable(new C_MDL_Reading(this));
- } else {
- dout(7) << "trim already waiting for read" << dendl;
- }
- return;
+ // trim expired segments?
+ while (!segments.empty()) {
+ ls = segments.begin()->second;
+ if (!expired_segments.count(ls)) break;
+
+ expired_events -= ls->num_events;
+ expired_segments.erase(ls);
+ num_events -= ls->num_events;
+
+ journaler->set_expire_pos(ls->offset); // this was the oldest segment, adjust expire pos
+ journaler->write_head(0);
+
+ logger->set("expos", ls->offset);
+ logger->inc("segtrm");
+ logger->inc("evtrm", ls->num_events);
+
+ segments.erase(ls->offset);
+ delete ls;
}
}
- dout(10) << "trim num_events " << num_events << " <= max " << max_events
- << ", trimming " << trimming.size()
- << ", done for now."
- << dendl;
-
- // trimmed!
- std::list<Context*> finished;
- finished.swap(trim_waiters);
- finish_contexts(finished, 0);
+ logger->set("ev", num_events);
+ logger->set("evexd", expired_events);
+ logger->set("seg", segments.size());
+ logger->set("segexd", expired_segments.size());
}
-
void MDLog::replay(Context *c)
{
assert(journaler->is_active());
assert(num_events == 0);
replay_thread.create();
- //_replay();
}
class C_MDL_Replay : public Context {
C_MDL_Replay(MDLog *l) : mdlog(l) {}
void finish(int r) {
mdlog->replay_cond.Signal();
- //mdlog->_replay();
}
};
// unpack event
LogEvent *le = LogEvent::decode(bl);
+ // new segment?
+ if (le->get_type() == EVENT_SUBTREEMAP) {
+ segments[pos] = new LogSegment(pos);
+ logger->set("seg", segments.size());
+ }
+
// have we seen an import map yet?
- if (!seen_subtree_map &&
- le->get_type() != EVENT_SUBTREEMAP) {
+ if (segments.empty()) {
dout(10) << "_replay " << pos << " / " << journaler->get_write_pos()
<< " -- waiting for subtree_map. (skipping " << *le << ")" << dendl;
} else {
dout(10) << "_replay " << pos << " / " << journaler->get_write_pos()
<< " : " << *le << dendl;
+ le->_segment = get_current_segment(); // replay may need this
+ le->_segment->num_events++;
+ num_events++;
+
le->replay(mds);
- num_events++;
if (!new_expire_pos)
new_expire_pos = pos;
-
- if (le->get_type() == EVENT_SUBTREEMAP)
- seen_subtree_map = true;
}
delete le;
+ logger->set("rdpos", pos);
+
// drop lock for a second, so other events/messages (e.g. beacon timer!) can go off
mds->mds_lock.Unlock();
mds->mds_lock.Lock();
// move read pointer _back_ to first subtree map we saw, for eventual trimming
journaler->set_read_pos(new_expire_pos);
journaler->set_expire_pos(new_expire_pos);
+ logger->set("expos", new_expire_pos);
// kick waiter(s)
list<Context*> ls;
#include "common/Thread.h"
#include "common/Cond.h"
+#include "LogSegment.h"
+
#include <list>
//#include <ext/hash_map>
class Journaler;
class LogEvent;
class MDS;
+class LogSegment;
+class ESubtreeMap;
class Logger;
-/*
-namespace __gnu_cxx {
- template<> struct hash<LogEvent*> {
- size_t operator()(const LogEvent *p) const {
- static hash<unsigned long> H;
- return H((unsigned long)p);
- }
- };
-}
-*/
+#include <map>
+using std::map;
+
class MDLog {
protected:
MDS *mds;
- size_t num_events; // in events
- size_t max_events;
+ int num_events; // in events
+ int max_events;
+ int max_segments;
int unflushed;
Logger *logger;
- // -- trimming --
- map<off_t,LogEvent*> trimming;
- std::list<Context*> trim_waiters; // contexts waiting for trim
- bool trim_reading;
-
- bool waiting_for_read;
- friend class C_MDL_Reading;
-
-
- off_t get_trimmed_to() {
- if (trimming.empty())
- return get_read_pos();
- else
- return trimming.begin()->first;
- }
-
// -- replay --
Cond replay_cond;
void _replay_thread(); // new way
+ // -- segments --
+ map<off_t,LogSegment*> segments;
+ set<LogSegment*> expiring_segments;
+ set<LogSegment*> expired_segments;
+ int expiring_events;
+ int expired_events;
+
+ class C_MDL_WroteSubtreeMap : public Context {
+ MDLog *mdlog;
+ off_t off;
+ public:
+ C_MDL_WroteSubtreeMap(MDLog *l, off_t o) : mdlog(l), off(o) { }
+ void finish(int r) {
+ mdlog->_logged_subtree_map(off);
+ }
+ };
+ void _logged_subtree_map(off_t off);
+
// -- subtreemaps --
- set<off_t> subtree_maps;
- map<off_t,list<Context*> > subtree_map_expire_waiters;
bool writing_subtree_map; // one is being written now
- bool seen_subtree_map; // for recovery
friend class ESubtreeMap;
friend class C_MDS_WroteImportMap;
friend class MDCache;
- void kick_subtree_map() {
- if (subtree_map_expire_waiters.empty()) return;
- list<Context*> ls;
- ls.swap(subtree_map_expire_waiters.begin()->second);
- subtree_map_expire_waiters.erase(subtree_map_expire_waiters.begin());
- finish_contexts(ls);
- }
-
public:
- off_t get_last_subtree_map_offset() {
- assert(!subtree_maps.empty());
- return *subtree_maps.rbegin();
+ off_t get_last_segment_offset() {
+ assert(!segments.empty());
+ return segments.rbegin()->first;
}
public:
MDLog(MDS *m) : mds(m),
- num_events(0), max_events(g_conf.mds_log_max_len),
+ num_events(0),
+ max_events(g_conf.mds_log_max_events),
+ max_segments(g_conf.mds_log_max_segments),
unflushed(0),
capped(false),
journaler(0),
logger(0),
- trim_reading(false), waiting_for_read(false),
replay_thread(this),
- writing_subtree_map(false), seen_subtree_map(false) {
+ expiring_events(0), expired_events(0),
+ writing_subtree_map(false) {
}
~MDLog();
- void set_max_events(size_t max) { max_events = max; }
- size_t get_max_events() { return max_events; }
- size_t get_num_events() { return num_events + trimming.size(); }
- size_t get_non_subtreemap_events() { return num_events + trimming.size() - subtree_map_expire_waiters.size(); }
+ void start_new_segment(Context *onsync=0);
+ LogSegment *get_current_segment() {
+ return segments.empty() ? 0:segments.rbegin()->second;
+ }
+
+
+ void flush_logger();
+
+ size_t get_num_events() { return num_events; }
+ void set_max_events(int m) { max_events = m; }
+ size_t get_num_segments() { return segments.size(); }
+ void set_max_segments(int m) { max_segments = m; }
off_t get_read_pos();
off_t get_write_pos();
- bool empty() {
- return get_read_pos() == get_write_pos();
- }
+ bool empty() { return segments.empty(); }
bool is_capped() { return capped; }
void cap();
void wait_for_sync( Context *c );
void flush();
- void trim(Context *c);
- void _did_read();
- void _trimmed(LogEvent *le);
+private:
+ class C_MaybeExpiredSegment : public Context {
+ MDLog *mdlog;
+ LogSegment *ls;
+ public:
+ C_MaybeExpiredSegment(MDLog *mdl, LogSegment *s) : mdlog(mdl), ls(s) {}
+ void finish(int res) {
+ mdlog->_maybe_expired(ls);
+ }
+ };
+
+ void try_expire(LogSegment *ls);
+ void _maybe_expired(LogSegment *ls);
+ void _expired(LogSegment *ls);
- void reset(); // fresh, empty log!
- void open(Context *onopen);
- void append();
+public:
+ void trim();
+
+private:
void write_head(Context *onfinish);
+public:
+ void create(Context *onfinish); // fresh, empty log!
+ void open(Context *onopen); // append() or replay() to follow!
+ void append();
void replay(Context *onfinish);
};
// schedule tick
reset_tick();
- // init logger
- //reopen_logger(g_clock.now());
-
mds_lock.Unlock();
return 0;
}
if (logger) {
req_rate = logger->get("req");
- logger->set("l", (int)load.mds_load());
+ logger->fset("l", (int)load.mds_load());
logger->set("q", messenger->get_dispatch_queue_len());
logger->set("buf", buffer_total_alloc);
logger->set("sm", mdcache->num_subtrees());
-
+
mdcache->log_stat(logger);
}
-
+
+ if (is_active() || is_stopping())
+ locker->scatter_unscatter_autoscattered();
+
// booted?
if (is_active()) {
return;
}
- // note some old state
+ // keep old map, for a moment
+ MDSMap *oldmap = mdsmap;
int oldwhoami = whoami;
int oldstate = state;
- set<int> oldresolve;
- mdsmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE);
- bool wasrejoining = mdsmap->is_rejoining();
- set<int> oldfailed;
- mdsmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED);
- set<int> oldactive;
- mdsmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE);
- set<int> oldcreating;
- mdsmap->get_mds_set(oldcreating, MDSMap::STATE_CREATING);
- set<int> oldstopped;
- mdsmap->get_mds_set(oldstopped, MDSMap::STATE_STOPPED);
// decode and process
+ mdsmap = new MDSMap;
mdsmap->decode(m->get_encoded());
// see who i am
return;
}
- if (oldwhoami != whoami || !logger) // fakesyn/newsyn starts knowing who they are
- reopen_logger(mdsmap->get_create());
-
+ // open logger?
+ // note that fakesyn/newsyn starts knowing who they are
+ if (whoami >= 0 &&
+ mdsmap->is_up(whoami) && !mdsmap->is_standby(whoami) &&
+ (oldwhoami != whoami || !logger))
+ reopen_logger(mdsmap->get_create()); // adopt mds cluster timeline
+
if (oldwhoami != whoami) {
// update messenger.
dout(1) << "handle_mds_map i am now mds" << whoami
messenger->send_message(new MOSDGetMap(0),
monmap->get_inst(mon));
}
-
}
// tell objecter my incarnation
return;
}
}
-
+
// RESOLVE
// is someone else newly resolving?
if (is_resolve() || is_rejoin() || is_active() || is_stopping()) {
- set<int> resolve;
+ set<int> oldresolve, resolve;
+ oldmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE);
mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
if (oldresolve != resolve) {
dout(10) << "resolve set is " << resolve << ", was " << oldresolve << dendl;
- for (set<int>::iterator p = resolve.begin(); p != resolve.end(); ++p) {
- if (*p == whoami) continue;
- if (oldresolve.count(*p)) continue;
- mdcache->send_resolve(*p); // now or later.
- }
+ for (set<int>::iterator p = resolve.begin(); p != resolve.end(); ++p)
+ if (*p != whoami &&
+ oldresolve.count(*p) == 0)
+ mdcache->send_resolve(*p); // now or later.
}
}
// is everybody finally rejoining?
if (is_rejoin() || is_active() || is_stopping()) {
// did we start?
- if (!wasrejoining && mdsmap->is_rejoining())
+ if (!oldmap->is_rejoining() && mdsmap->is_rejoining())
rejoin_joint_start();
// did we finish?
if (g_conf.mds_dump_cache_after_rejoin &&
- wasrejoining && !mdsmap->is_rejoining())
+ oldmap->is_rejoining() && !mdsmap->is_rejoining())
mdcache->dump_cache(); // for DEBUG only
}
-
+ if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE)
+ dout(1) << "cluster recovered." << dendl;
+
// did someone go active?
if (is_active() || is_stopping()) {
- set<int> active;
+ set<int> oldactive, active;
+ oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE);
mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
- for (set<int>::iterator p = active.begin(); p != active.end(); ++p) {
- if (*p == whoami) continue; // not me
- if (oldactive.count(*p)) continue; // newly so?
- handle_mds_recovery(*p);
- }
+ for (set<int>::iterator p = active.begin(); p != active.end(); ++p)
+ if (*p != whoami && // not me
+ oldactive.count(*p) == 0) // newly so?
+ handle_mds_recovery(*p);
}
+ // did someone fail or stop?
if (is_active() || is_stopping()) {
- // did anyone go down?
- set<int> failed;
+ // new failed?
+ set<int> oldfailed, failed;
+ oldmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED);
mdsmap->get_mds_set(failed, MDSMap::STATE_FAILED);
- for (set<int>::iterator p = failed.begin(); p != failed.end(); ++p) {
- if (oldfailed.count(*p)) continue; // newly so?
- mdcache->handle_mds_failure(*p);
- }
+ for (set<int>::iterator p = failed.begin(); p != failed.end(); ++p)
+ if (oldfailed.count(*p) == 0)
+ mdcache->handle_mds_failure(*p);
+
+ // or down then up?
+ // did their addr/inst change?
+ set<int> up;
+ mdsmap->get_up_mds_set(up);
+ for (set<int>::iterator p = up.begin(); p != up.end(); ++p)
+ if (oldmap->have_inst(*p) &&
+ oldmap->get_inst(*p) != mdsmap->get_inst(*p))
+ mdcache->handle_mds_failure(*p);
// did anyone stop?
- set<int> stopped;
+ set<int> oldstopped, stopped;
+ oldmap->get_mds_set(oldstopped, MDSMap::STATE_STOPPED);
mdsmap->get_mds_set(stopped, MDSMap::STATE_STOPPED);
- for (set<int>::iterator p = stopped.begin(); p != stopped.end(); ++p) {
- if (oldstopped.count(*p)) continue; // newly so?
- mdcache->migrator->handle_mds_failure_or_stop(*p);
- }
+ for (set<int>::iterator p = stopped.begin(); p != stopped.end(); ++p)
+ if (oldstopped.count(*p) == 0) // newly so?
+ mdcache->migrator->handle_mds_failure_or_stop(*p);
}
-
- // in set set changed?
- /*
- if (state >= MDSMap::STATE_ACTIVE && // only if i'm active+. otherwise they'll get map during reconnect.
- mdsmap->get_same_in_set_since() > last_client_mdsmap_bcast) {
- bcast_mds_map();
- }
- */
-
// just got mdsmap+osdmap?
if (hadepoch == 0 &&
mdsmap->get_epoch() > 0 &&
}
delete m;
+ delete oldmap;
}
void MDS::bcast_mds_map()
C_Gather *fin = new C_Gather(new C_MDS_CreateFinish(this));
+ CDir *rootdir = 0;
if (whoami == 0) {
dout(3) << "boot_create since i am also mds0, creating root inode and dir" << dendl;
assert(root);
// force empty root dir
- CDir *dir = root->get_dirfrag(frag_t());
- dir->mark_complete();
- dir->mark_dirty(dir->pre_dirty());
-
- // save it
- dir->commit(0, fin->new_sub());
+ rootdir = root->get_dirfrag(frag_t());
+ rootdir->mark_complete();
}
// create my stray dir
+ CDir *straydir;
{
dout(10) << "boot_create creating local stray dir" << dendl;
mdcache->open_local_stray();
CInode *stray = mdcache->get_stray();
- CDir *dir = stray->get_dirfrag(frag_t());
- dir->mark_complete();
- dir->mark_dirty(dir->pre_dirty());
- dir->commit(0, fin->new_sub());
+ straydir = stray->get_dirfrag(frag_t());
+ straydir->mark_complete();
}
// start with a fresh journal
dout(10) << "boot_create creating fresh journal" << dendl;
- mdlog->reset();
- mdlog->write_head(fin->new_sub());
+ mdlog->create(fin->new_sub());
// write our first subtreemap
- mdcache->log_subtree_map(fin->new_sub());
+ mdlog->start_new_segment(fin->new_sub());
+ // dirty, commit (root and) stray dir(s)
+ if (whoami == 0) {
+ rootdir->mark_dirty(rootdir->pre_dirty(), mdlog->get_current_segment());
+ rootdir->commit(0, fin->new_sub());
+ }
+ straydir->mark_dirty(straydir->pre_dirty(), mdlog->get_current_segment());
+ straydir->commit(0, fin->new_sub());
+
// fixme: fake out idalloc (reset, pretend loaded)
dout(10) << "boot_create creating fresh idalloc table" << dendl;
idalloc->reset();
if (is_replay()) {
dout(2) << "boot_start " << step << ": replaying mds log" << dendl;
mdlog->replay(new C_MDS_BootStart(this, 3));
+ break;
} else {
dout(2) << "boot_start " << step << ": positioning at end of old mds log" << dendl;
mdlog->append();
- mdcache->log_subtree_map(new C_MDS_BootStart(this, 3));
+ step++;
}
- break;
case 3:
if (is_replay()) {
dout(3) << "starting_done" << dendl;
assert(is_starting());
set_want_state(MDSMap::STATE_ACTIVE);
+
+ // start new segment
+ mdlog->start_new_segment(0);
}
dout(2) << "i am not alone, moving to state resolve" << dendl;
set_want_state(MDSMap::STATE_RESOLVE);
}
+
+ // start new segment
+ mdlog->start_new_segment(0);
}
void MDS::stopping_start()
{
dout(2) << "stopping_start" << dendl;
-
+
// start cache shutdown
mdcache->shutdown_start();
// terminate client sessions
server->terminate_sessions();
-
- // flush log
- mdlog->set_max_events(0);
- mdlog->trim(NULL);
}
+
void MDS::stopping_done()
{
dout(2) << "stopping_done" << dendl;
mdsmap->get_inst(from) != m->get_source_inst() ||
mdsmap->is_down(from)) {
// bogus mds?
- if (m->get_type() != MSG_MDS_MAP) {
+ if (m->get_type() == MSG_MDS_MAP) {
+ dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
+ << ", but it's an mdsmap, looking at it" << dendl;
+ } else if (m->get_type() == MSG_MDS_CACHEEXPIRE &&
+ mdsmap->get_inst(from) == m->get_source_inst()) {
+ dout(5) << "got " << *m << " from down mds " << m->get_source()
+ << ", but it's a cache_expire, looking at it" << dendl;
+ } else {
dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source()
<< ", dropping" << dendl;
delete m;
return;
- } else {
- dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source()
- << ", but it's an mdsmap, looking at it" << dendl;
}
}
}
void Migrator::export_empty_import(CDir *dir)
{
dout(7) << "export_empty_import " << *dir << dendl;
-
- if (dir->inode->is_auth()) return;
- if (!dir->is_auth()) return;
-
- if (dir->inode->is_freezing() || dir->inode->is_frozen()) return;
- if (dir->is_freezing() || dir->is_frozen()) return;
-
- if (dir->get_size() > 0) {
- dout(7) << "not actually empty" << dendl;
+ assert(dir->is_subtree_root());
+
+ if (dir->inode->is_auth()) {
+ dout(7) << " inode is auth" << dendl;
return;
}
-
- if (dir->inode->is_root()) {
- dout(7) << "root" << dendl;
+ if (!dir->is_auth()) {
+ dout(7) << " not auth" << dendl;
return;
}
-
- // is it really empty?
- if (!dir->is_complete()) {
- dout(7) << "not complete, fetching." << dendl;
- dir->fetch(new C_MDC_EmptyImport(this,dir));
+ if (dir->is_freezing() || dir->is_frozen()) {
+ dout(7) << " freezing or frozen" << dendl;
+ return;
+ }
+ if (dir->get_size() > 0) {
+ dout(7) << " not actually empty" << dendl;
+ return;
+ }
+ if (dir->inode->is_root()) {
+ dout(7) << " root" << dendl;
return;
}
int dest = dir->inode->authority().first;
-
- // comment this out ot wreak havoc?
//if (mds->is_shutting_down()) dest = 0; // this is more efficient.
- dout(7) << "really empty, exporting to " << dest << dendl;
+ dout(7) << " really empty, exporting to " << dest << dendl;
assert (dest != mds->get_nodeid());
- dout(-7) << "exporting to mds" << dest
+ dout(7) << "exporting to mds" << dest
<< " empty import " << *dir << dendl;
export_dir( dir, dest );
}
dir->auth_unpin();
export_state.erase(dir); // clean up
dir->state_clear(CDir::STATE_EXPORTING);
- dir->put(CDir::PIN_EXPORTING);
if (export_peer[dir] != who) // tell them.
mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR);
break;
dir->unfreeze_tree(); // cancel the freeze
export_state.erase(dir); // clean up
dir->state_clear(CDir::STATE_EXPORTING);
- dir->put(CDir::PIN_EXPORTING);
if (export_peer[dir] != who) // tell them.
mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR);
break;
cache->try_subtree_merge(dir);
export_state.erase(dir); // clean up
dir->state_clear(CDir::STATE_EXPORTING);
- dir->put(CDir::PIN_EXPORTING);
break;
case EXPORT_EXPORTING:
export_reverse(dir);
export_state.erase(dir); // clean up
dir->state_clear(CDir::STATE_EXPORTING);
- dir->put(CDir::PIN_EXPORTING);
break;
case EXPORT_LOGGINGFINISH:
export_peer[dir] = dest;
dir->state_set(CDir::STATE_EXPORTING);
- dir->get(CDir::PIN_EXPORTING);
// send ExportDirDiscover (ask target)
- mds->send_message_mds(new MExportDirDiscover(dir), export_peer[dir], MDS_PORT_MIGRATOR);
+ mds->send_message_mds(new MExportDirDiscover(dir), dest, MDS_PORT_MIGRATOR);
// start the freeze, but hold it up with an auth_pin.
dir->auth_pin();
- dir->freeze_tree(new C_MDC_ExportFreeze(this, dir));
+ dir->freeze_tree();
+ assert(dir->is_freezing_tree());
+ dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir));
}
export_warning_ack_waiting.erase(dir);
export_state[dir] = EXPORT_EXPORTING;
- assert(export_data.count(dir) == 0);
assert(dir->get_cum_auth_pins() == 0);
// set ambiguous auth
// fill export message with cache data
utime_t now = g_clock.now();
- C_Contexts *fin = new C_Contexts; // collect all the waiters
map<int,entity_inst_t> exported_client_map;
- int num_exported_inodes = encode_export_dir( export_data[dir],
- fin,
- dir, // base
+ bufferlist export_data;
+ int num_exported_inodes = encode_export_dir( export_data,
dir, // recur start point
- dest,
exported_client_map,
now );
bufferlist bl;
::_encode(exported_client_map, bl);
- export_data[dir].push_front(bl);
+ bl.claim_append(export_data);
+ export_data.claim(bl);
// send the export data!
MExportDir *req = new MExportDir(dir->dirfrag());
- req->set_dirstate(export_data[dir]);
+ req->take_dirstate(export_data);
// add bounds to message
set<CDir*> bounds;
++p)
req->add_export((*p)->dirfrag());
- //s end
+ // send
mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR);
- // queue up the finisher
- dir->add_waiter( CDir::WAIT_UNFREEZE, fin );
-
// stats
if (mds->logger) mds->logger->inc("ex");
if (mds->logger) mds->logger->inc("iex", num_exported_inodes);
* update our local state for this inode to export.
* encode relevant state to be sent over the wire.
* used by: encode_export_dir, file_rename (if foreign)
+ *
+ * FIXME: the separation between CInode.encode_export and these methods
+ * is pretty arbitrary and dumb.
*/
-void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth,
- map<int,entity_inst_t>& exported_client_map,
- utime_t now)
+void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state,
+ map<int,entity_inst_t>& exported_client_map)
+{
+ dout(7) << "encode_export_inode " << *in << dendl;
+ assert(!in->is_replica(mds->get_nodeid()));
+
+ ::_encode_simple(in->inode.ino, enc_state);
+ in->encode_export(enc_state);
+
+ // make note of clients named by exported capabilities
+ for (map<int, Capability>::iterator it = in->client_caps.begin();
+ it != in->client_caps.end();
+ it++)
+ exported_client_map[it->first] = mds->clientmap.get_inst(it->first);
+}
+
+void Migrator::finish_export_inode(CInode *in, utime_t now, list<Context*>& finished)
{
+ dout(12) << "finish_export_inode " << *in << dendl;
+
+ in->finish_export(now);
+
// tell (all) clients about migrating caps.. mark STALE
for (map<int, Capability>::iterator it = in->client_caps.begin();
it != in->client_caps.end();
it++) {
- dout(7) << "encode_export_inode " << *in << " telling client" << it->first << " stale caps" << dendl;
+ dout(7) << "finish_export_inode telling client" << it->first
+ << " stale caps on " << *in << dendl;
MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_STALE,
in->inode,
it->second.get_last_seq(),
it->second.pending(),
it->second.wanted());
entity_inst_t inst = mds->clientmap.get_inst(it->first);
- exported_client_map[it->first] = inst;
mds->send_message_client_maybe_open(m, inst);
}
+ in->clear_client_caps();
// relax locks?
if (!in->is_replicated())
in->replicate_relax_locks();
- // add inode
- assert(!in->is_replica(mds->get_nodeid()));
- CInodeExport istate(in, now);
- istate._encode( enc_state );
-
- // we're export this inode; fix inode state
- dout(7) << "encode_export_inode " << *in << dendl;
-
+ // clean
if (in->is_dirty()) in->mark_clean();
// clear/unpin cached_by (we're no longer the authority)
in->state_clear(CInode::STATE_AUTH);
in->replica_nonce = CInode::EXPORT_NONCE;
+ // waiters
+ in->take_waiting(CInode::WAIT_ANY, finished);
+
// *** other state too?
// move to end of LRU so we drop out of cache quickly!
if (in->get_parent_dn())
cache->lru.lru_bottouch(in->get_parent_dn());
-}
+}
-int Migrator::encode_export_dir(list<bufferlist>& dirstatelist,
- C_Contexts *fin,
- CDir *basedir,
+int Migrator::encode_export_dir(bufferlist& exportbl,
CDir *dir,
- int newauth,
map<int,entity_inst_t>& exported_client_map,
utime_t now)
{
assert(dir->get_projected_version() == dir->get_version());
// dir
- bufferlist enc_dir;
-
- CDirExport dstate(dir, now);
- dstate._encode( enc_dir );
-
- // release open_by
- dir->clear_replica_map();
-
- // mark
- assert(dir->is_auth());
- dir->state_clear(CDir::STATE_AUTH);
- dir->replica_nonce = CDir::NONCE_EXPORT;
-
- list<CDir*> subdirs;
-
- if (dir->is_dirty())
- dir->mark_clean();
-
- // discard most dir state
- dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things.
+ dirfrag_t df = dir->dirfrag();
+ ::_encode_simple(df, exportbl);
+ dir->encode_export(exportbl);
- // suck up all waiters
- list<Context*> waiting;
- dir->take_waiting(CDir::WAIT_ANY, waiting); // all dir waiters
- fin->take(waiting);
+ long nden = dir->items.size();
+ ::_encode_simple(nden, exportbl);
// dentries
+ list<CDir*> subdirs;
CDir::map_t::iterator it;
for (it = dir->begin(); it != dir->end(); it++) {
CDentry *dn = it->second;
// -- dentry
dout(7) << "encode_export_dir exporting " << *dn << dendl;
- // name
- ::_encode(it->first, enc_dir);
+ // dn name
+ ::_encode(it->first, exportbl);
// state
- it->second->encode_export_state(enc_dir);
+ dn->encode_export(exportbl);
// points to...
// null dentry?
if (dn->is_null()) {
- enc_dir.append("N", 1); // null dentry
+ exportbl.append("N", 1); // null dentry
continue;
}
if (dn->is_remote()) {
// remote link
- enc_dir.append("L", 1); // remote link
+ exportbl.append("L", 1); // remote link
inodeno_t ino = dn->get_remote_ino();
unsigned char d_type = dn->get_remote_d_type();
- ::_encode(ino, enc_dir);
- ::_encode(d_type, enc_dir);
+ ::_encode(ino, exportbl);
+ ::_encode(d_type, exportbl);
continue;
}
// primary link
// -- inode
- enc_dir.append("I", 1); // inode dentry
+ exportbl.append("I", 1); // inode dentry
- encode_export_inode(in, enc_dir, newauth, exported_client_map, now); // encode, and (update state for) export
+ encode_export_inode(in, exportbl, exported_client_map); // encode, and (update state for) export
// directory?
list<CDir*> dfs;
subdirs.push_back(dir); // it's ours, recurse (later)
}
}
-
- // waiters
- list<Context*> waiters;
- in->take_waiting(CInode::WAIT_ANY, waiters);
- fin->take(waiters);
}
- // add to dirstatelist
- bufferlist bl;
- dirstatelist.push_back( bl );
- dirstatelist.back().claim( enc_dir );
-
// subdirs
for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); it++)
- num_exported += encode_export_dir(dirstatelist, fin, basedir, *it, newauth,
- exported_client_map, now);
+ num_exported += encode_export_dir(exportbl, *it, exported_client_map, now);
return num_exported;
}
+void Migrator::finish_export_dir(CDir *dir, list<Context*>& finished, utime_t now)
+{
+ dout(10) << "finish_export_dir " << *dir << dendl;
+
+ // release open_by
+ dir->clear_replica_map();
+
+ // mark
+ assert(dir->is_auth());
+ dir->state_clear(CDir::STATE_AUTH);
+ dir->replica_nonce = CDir::NONCE_EXPORT;
+
+ if (dir->is_dirty())
+ dir->mark_clean();
+
+ // discard most dir state
+ dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things.
+
+ // suck up all waiters
+ dir->take_waiting(CDir::WAIT_ANY, finished); // all dir waiters
+
+ // pop
+ dir->finish_export(now);
+
+ // dentries
+ list<CDir*> subdirs;
+ CDir::map_t::iterator it;
+ for (it = dir->begin(); it != dir->end(); it++) {
+ CDentry *dn = it->second;
+ CInode *in = dn->get_inode();
+
+ // dentry
+ dn->finish_export();
+
+ // inode?
+ if (dn->is_primary()) {
+ finish_export_inode(in, now, finished);
+
+ // subdirs?
+ in->get_nested_dirfrags(subdirs);
+ }
+ }
+
+ // subdirs
+ for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); it++)
+ finish_export_dir(*it, finished, now);
+}
class C_MDS_ExportFinishLogged : public Context {
Migrator *migrator;
export_warning_ack_waiting.erase(dir);
export_state[dir] = EXPORT_LOGGINGFINISH;
- export_data.erase(dir);
set<CDir*> bounds;
cache->get_subtree_bounds(dir, bounds);
- // log completion
+ // log completion.
+ // include export bounds, to ensure they're in the journal.
EExport *le = new EExport(mds->mdlog, dir);
le->metablob.add_dir_context(dir);
le->metablob.add_dir( dir, false );
+
+
/*
 * this happens if the dest fails after i send the export data but before it is acked
* that is, we don't know they safely received and logged it, so we reverse our changes
dout(7) << "export_reverse " << *dir << dendl;
assert(export_state[dir] == EXPORT_EXPORTING);
- assert(export_data.count(dir));
set<CDir*> bounds;
cache->get_subtree_bounds(dir, bounds);
cache->adjust_subtree_auth(dir, mds->get_nodeid());
cache->try_subtree_merge(dir);
+ // remove exporting pins
+ list<CDir*> rq;
+ rq.push_back(dir);
+ while (!rq.empty()) {
+ CDir *dir = rq.front();
+ rq.pop_front();
+ dir->abort_export();
+ for (CDir::map_t::iterator p = dir->items.begin(); p != dir->items.end(); ++p) {
+ p->second->abort_export();
+ if (!p->second->is_primary()) continue;
+ CInode *in = p->second->get_inode();
+ in->abort_export();
+ if (in->is_dir())
+ in->get_nested_dirfrags(rq);
+ }
+ }
+
// unpin bounds
for (set<CDir*>::iterator p = bounds.begin();
p != bounds.end();
bd->state_clear(CDir::STATE_EXPORTBOUND);
}
- // re-import the metadata
- map<int,entity_inst_t> imported_client_map;
- int off = 0;
- ::_decode(imported_client_map, export_data[dir].front(), off);
- export_data[dir].pop_front();
-
- while (!export_data[dir].empty()) {
- decode_import_dir(export_data[dir].front(),
- export_peer[dir],
- dir, // import root
- 0,
- imported_client_map);
- export_data[dir].pop_front();
- }
-
// process delayed expires
cache->process_delayed_expire(dir);
// some clean up
- export_data.erase(dir);
export_warning_ack_waiting.erase(dir);
export_notify_ack_waiting.erase(dir);
dout(7) << "not sending MExportDirFinish, dest has failed" << dendl;
}
+ // finish export (adjust local cache state)
+ C_Contexts *fin = new C_Contexts;
+ finish_export_dir(dir, fin->contexts, g_clock.now());
+ dir->add_waiter(CDir::WAIT_UNFREEZE, fin);
+
// unfreeze
dout(7) << "export_finish unfreezing" << dendl;
dir->unfreeze_tree();
// remove from exporting list, clean up state
dir->state_clear(CDir::STATE_EXPORTING);
- dir->put(CDir::PIN_EXPORTING);
export_state.erase(dir);
export_peer.erase(dir);
export_notify_ack_waiting.erase(dir);
// add this crap to my cache
map<int,entity_inst_t> imported_client_map;
- int off = 0;
- ::_decode(imported_client_map, m->get_dirstate().front(), off);
- m->get_dirstate().pop_front();
+ bufferlist::iterator blp = m->get_dirstate().begin();
+ ::_decode_simple(imported_client_map, blp);
int num_imported_inodes = 0;
- while (!m->get_dirstate().empty()) {
+ while (!blp.end()) {
num_imported_inodes +=
- decode_import_dir(m->get_dirstate().front(),
+ decode_import_dir(blp,
oldauth,
dir, // import root
le,
- imported_client_map);
- m->get_dirstate().pop_front();
+ imported_client_map,
+ mds->mdlog->get_current_segment(),
+ import_updated_scatterlocks[dir]);
}
dout(10) << " " << m->get_bounds().size() << " imported bounds" << dendl;
import_peer.erase(dir->dirfrag());
import_bystanders.erase(dir);
import_bound_ls.erase(dir);
+ import_updated_scatterlocks.erase(dir);
// send pending import_maps?
mds->mdcache->maybe_send_pending_resolves();
// log finish
mds->mdlog->submit_entry(new EImportFinish(dir, true));
+ // clear updated scatterlocks
+ for (list<ScatterLock*>::iterator p = import_updated_scatterlocks[dir].begin();
+ p != import_updated_scatterlocks[dir].end();
+ ++p)
+ (*p)->clear_updated();
+
// remove pins
set<CDir*> bounds;
cache->get_subtree_bounds(dir, bounds);
import_peer.erase(dir->dirfrag());
import_bystanders.erase(dir);
import_bound_ls.erase(dir);
+ import_updated_scatterlocks.erase(dir);
// process delayed expires
cache->process_delayed_expire(dir);
}
-void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth,
- map<int,entity_inst_t>& imported_client_map)
+void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int oldauth,
+ map<int,entity_inst_t>& imported_client_map,
+ LogSegment *ls,
+ list<ScatterLock*>& updated_scatterlocks)
{
dout(15) << "decode_import_inode on " << *dn << dendl;
- CInodeExport istate;
- off = istate._decode(bl, off);
+ inodeno_t ino;
+ ::_decode_simple(ino, blp);
bool added = false;
- CInode *in = cache->get_inode(istate.get_ino());
+ CInode *in = cache->get_inode(ino);
if (!in) {
in = new CInode(mds->mdcache);
added = true;
// state after link -- or not! -sage
set<int> merged_client_caps;
- istate.update_inode(in, merged_client_caps);
+ in->decode_import(blp, merged_client_caps, ls);
// link before state -- or not! -sage
if (dn->inode != in) {
dout(10) << " had " << *in << dendl;
}
+ // clear if dirtyscattered, since we're going to journal this
+ // but not until we _actually_ finish the import...
+ if (in->dirlock.is_updated())
+ updated_scatterlocks.push_back(&in->dirlock);
+
+ // put in autoscatter list?
+ // this is conservative, but safe.
+ if (in->dirlock.get_state() == LOCK_SCATTER)
+ mds->locker->note_autoscattered(&in->dirlock);
// adjust replica list
//assert(!in->is_replica(oldauth)); // not true on failed export
}
-int Migrator::decode_import_dir(bufferlist& bl,
+int Migrator::decode_import_dir(bufferlist::iterator& blp,
int oldauth,
CDir *import_root,
EImportStart *le,
- map<int,entity_inst_t>& imported_client_map)
+ map<int,entity_inst_t>& imported_client_map,
+ LogSegment *ls,
+ list<ScatterLock*>& updated_scatterlocks)
{
- int off = 0;
-
// set up dir
- CDirExport dstate;
- off = dstate._decode(bl, off);
-
- CInode *diri = cache->get_inode(dstate.get_dirfrag().ino);
+ dirfrag_t df;
+ ::_decode_simple(df, blp);
+
+ CInode *diri = cache->get_inode(df.ino);
assert(diri);
- CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, dstate.get_dirfrag().frag);
+ CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag);
assert(dir);
dout(7) << "decode_import_dir " << *dir << dendl;
// assimilate state
- dstate.update_dir( dir );
+ dir->decode_import(blp);
// mark (may already be marked from get_or_open_dir() above)
if (!dir->is_auth())
dout(15) << "doing contents" << dendl;
// contents
- long nden = dstate.get_nden();
+ long nden;
+ ::_decode_simple(nden, blp);
for (; nden>0; nden--) {
num_imported++;
// dentry
string dname;
- ::_decode(dname, bl, off);
+ ::_decode_simple(dname, blp);
CDentry *dn = dir->lookup(dname);
if (!dn)
dn = dir->add_null_dentry(dname);
- // decode state
- dn->decode_import_state(bl, off, oldauth, mds->get_nodeid());
+ dn->decode_import(blp, ls);
+
+ dn->add_replica(oldauth, CDentry::EXPORT_NONCE);
+ if (dn->is_replica(mds->get_nodeid()))
+ dn->remove_replica(mds->get_nodeid());
+
dout(15) << "decode_import_dir got " << *dn << dendl;
// points to...
char icode;
- bl.copy(off, 1, &icode);
- off++;
+ ::_decode_simple(icode, blp);
if (icode == 'N') {
// null dentry
// remote link
inodeno_t ino;
unsigned char d_type;
- ::_decode(ino, bl, off);
- ::_decode(d_type, bl, off);
+ ::_decode_simple(ino, blp);
+ ::_decode_simple(d_type, blp);
if (dn->is_remote()) {
assert(dn->get_remote_ino() == ino);
} else {
}
else if (icode == 'I') {
// inode
- decode_import_inode(dn, bl, off, oldauth, imported_client_map);
+ decode_import_inode(dn, blp, oldauth, imported_client_map, ls, updated_scatterlocks);
}
// add dentry to journal entry
// export fun
map<CDir*,int> export_state;
map<CDir*,int> export_peer;
- map<CDir*,list<bufferlist> > export_data; // only during EXPORTING state
+ //map<CDir*,list<bufferlist> > export_data; // only during EXPORTING state
map<CDir*,set<int> > export_warning_ack_waiting;
map<CDir*,set<int> > export_notify_ack_waiting;
map<dirfrag_t,int> import_peer;
map<CDir*,set<int> > import_bystanders;
map<CDir*,list<dirfrag_t> > import_bound_ls;
-
+ map<CDir*,list<ScatterLock*> > import_updated_scatterlocks;
/*
// -- hashing madness --
void export_dir_nicely(CDir *dir, int dest);
void maybe_do_queued_export();
+ void clear_export_queue() {
+ export_queue.clear();
+ }
- void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth,
- map<int,entity_inst_t>& exported_client_map,
- utime_t now);
- int encode_export_dir(list<bufferlist>& dirstatelist,
- class C_Contexts *fin,
- CDir *basedir,
+ void encode_export_inode(CInode *in, bufferlist& enc_state,
+ map<int,entity_inst_t>& exported_client_map);
+ void finish_export_inode(CInode *in, utime_t now, list<Context*>& finished);
+ int encode_export_dir(bufferlist& exportbl,
CDir *dir,
- int newauth,
map<int,entity_inst_t>& exported_client_map,
utime_t now);
+ void finish_export_dir(CDir *dir, list<Context*>& finished, utime_t now);
void add_export_finish_waiter(CDir *dir, Context *c) {
export_finish_waiters[dir].push_back(c);
void handle_export_dir(MExportDir *m);
public:
- void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth,
- map<int,entity_inst_t>& imported_client_map);
- int decode_import_dir(bufferlist& bl,
+ void decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int oldauth,
+ map<int,entity_inst_t>& imported_client_map,
+ LogSegment *ls,
+ list<ScatterLock*>& updated_scatterlocks);
+ int decode_import_dir(bufferlist::iterator& blp,
int oldauth,
CDir *import_root,
EImportStart *le,
- map<int,entity_inst_t>& imported_client_map);
+ map<int,entity_inst_t>& imported_client_map,
+ LogSegment *ls,
+ list<ScatterLock*>& updated_scatterlocks);
public:
void import_reverse(CDir *dir);
class ScatterLock : public SimpleLock {
int num_wrlock;
bool updated;
+ utime_t last_scatter;
public:
+ xlist<ScatterLock*>::item xlistitem_autoscattered;
+
ScatterLock(MDSCacheObject *o, int t, int wo) :
SimpleLock(o, t, wo),
num_wrlock(0),
- updated(false) {}
+ updated(false),
+ xlistitem_autoscattered(this) {}
int get_replica_state() {
switch (state) {
if (updated) {
parent->put(MDSCacheObject::PIN_DIRTYSCATTERED);
updated = false;
+ parent->clear_dirty_scattered(type);
}
}
bool is_updated() { return updated; }
+ void set_last_scatter(utime_t t) { last_scatter = t; }
+ utime_t get_last_scatter() { return last_scatter; }
+
void replicate_relax() {
- //if (state == LOCK_SYNC && !is_rdlocked())
- //state = LOCK_SCATTER;
}
void export_twiddle() {
out << " x=" << get_xlocked_by();
if (is_wrlocked())
out << " wr=" << get_num_wrlocks();
+ if (updated)
+ out << " updated";
out << ")";
}
}
// we shouldn't be waiting on anyone.
- assert(mdr->waiting_on_slave.empty());
+ assert(mdr->more()->waiting_on_slave.empty());
switch (req->get_op()) {
SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
m->get_object_info());
MDRequest *mdr = mdcache->request_get(m->get_reqid());
- mdr->slaves.insert(from);
+ mdr->more()->slaves.insert(from);
dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
mdr->xlocks.insert(lock);
mdr->locks.insert(lock);
}
break;
- case MMDSSlaveRequest::OP_RENAMEGETINODEACK:
- {
- MDRequest *mdr = mdcache->request_get(m->get_reqid());
- handle_slave_rename_get_inode_ack(mdr, m);
- }
- break;
-
default:
assert(0);
}
handle_slave_rename_prep(mdr);
break;
- case MMDSSlaveRequest::OP_RENAMEGETINODE:
- handle_slave_rename_get_inode(mdr);
- break;
-
case MMDSSlaveRequest::OP_FINISH:
// finish off request.
mdcache->request_finish(mdr);
!(*p)->can_auth_pin()) {
// wait
dout(10) << " waiting for authpinnable on " << **p << dendl;
- (*p)->add_waiter(CDir::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr));
+ (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
mdr->drop_local_auth_pins();
return;
}
assert(object); // we pinned it
dout(10) << " remote has pinned " << *object << dendl;
if (!mdr->is_auth_pinned(object))
- mdr->auth_pins.insert(object);
+ mdr->remote_auth_pins.insert(object);
pinned.insert(object);
}
// removed auth pins?
- set<MDSCacheObject*>::iterator p = mdr->auth_pins.begin();
- while (p != mdr->auth_pins.end()) {
+ set<MDSCacheObject*>::iterator p = mdr->remote_auth_pins.begin();
+ while (p != mdr->remote_auth_pins.end()) {
if ((*p)->authority().first == from &&
pinned.count(*p) == 0) {
dout(10) << " remote has unpinned " << **p << dendl;
set<MDSCacheObject*>::iterator o = p;
++p;
- mdr->auth_pins.erase(o);
+ mdr->remote_auth_pins.erase(o);
} else {
++p;
}
}
// note slave
- mdr->slaves.insert(from);
+ mdr->more()->slaves.insert(from);
// clear from waiting list
- assert(mdr->waiting_on_slave.count(from));
- mdr->waiting_on_slave.erase(from);
+ assert(mdr->more()->waiting_on_slave.count(from));
+ mdr->more()->waiting_on_slave.erase(from);
// go again?
- if (mdr->waiting_on_slave.empty())
+ if (mdr->more()->waiting_on_slave.empty())
dispatch_client_request(mdr);
else
- dout(10) << "still waiting on slaves " << mdr->waiting_on_slave << dendl;
+ dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
if (want_auth) {
if (ref->is_frozen()) {
dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
- ref->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr));
+ ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
return 0;
}
mdr->auth_pin(ref);
// make sure we can auth_pin (or have already authpinned) dir
if (dir->is_frozen()) {
dout(7) << "waiting for !frozen/authpinnable on " << *dir << dendl;
- dir->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr));
+ dir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
return 0;
}
inode_t *pi = diri->project_inode();
if (dirpv) pi->version = dirpv;
pi->ctime = pi->mtime = mdr->now;
- blob->add_dir_context(diri->get_parent_dir());
+ blob->add_dir_context(diri->get_parent_dn()->get_dir());
blob->add_primary_dentry(diri->get_parent_dn(), true, 0, pi);
} else {
// journal the mtime change anyway.
dout(10) << "predirty_dn_diri (non-auth) ctime/mtime " << mdr->now << " on " << *diri << dendl;
blob->add_dirtied_inode_mtime(diri->ino(), mdr->now);
+ assert(mdr->ls);
+ mdr->ls->dirty_inode_mtimes.push_back(&diri->xlist_dirty_inode_mtime);
}
return dirpv;
/** dirty_dn_diri
 * follow-up with actual dirty of inode after journal entry commits.
 */
+// NOTE(review): signature changed — the old utime_t mtime argument is gone;
+// the MDRequest now supplies both the event time (mdr->now) and the log
+// segment (mdr->ls) that the dirtied inode state is attributed to.
-void Server::dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime)
+void Server::dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv)
{
CInode *diri = dn->dir->inode;
if (dirpv) {
// we journaled and predirtied.
assert(diri->is_auth() && !diri->is_root());
+ // pop the projected inode, pinning the resulting dirty inode in this
+ // request's log segment so journal trimming can track it.
- diri->pop_and_dirty_projected_inode();
- dout(10) << "dirty_dn_diri ctime/mtime " << mtime << " v " << diri->inode.version << " on " << *diri << dendl;
+ diri->pop_and_dirty_projected_inode(mdr->ls);
+ dout(10) << "dirty_dn_diri ctime/mtime " << mdr->now << " v " << diri->inode.version << " on " << *diri << dendl;
} else {
// dirlock scatterlock will propagate the update.
+ // non-journaled path: just bump ctime/mtime in place and flag the
+ // scatterlock as updated so the change propagates to the auth node.
- diri->inode.ctime = diri->inode.mtime = mtime;
+ diri->inode.ctime = diri->inode.mtime = mdr->now;
diri->dirlock.set_updated();
- dout(10) << "dirty_dn_diri (non-dirty) ctime/mtime " << mtime << " on " << *diri << dendl;
+ dout(10) << "dirty_dn_diri (non-dirty) ctime/mtime " << mdr->now << " on " << *diri << dendl;
}
}
assert(r == 0);
// apply
- in->pop_and_dirty_projected_inode();
+ in->pop_and_dirty_projected_inode(mdr->ls);
mds->balancer->hit_inode(mdr->now, in, META_POP_IWR);
pi->ctime = g_clock.real_now();
// log + wait
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "utime");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_dir_context(cur->get_parent_dir());
le->metablob.add_primary_dentry(cur->parent, true, 0, pi);
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur));
+ mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur));
}
pi->ctime = g_clock.real_now();
// log + wait
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "chmod");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_dir_context(cur->get_parent_dir());
le->metablob.add_primary_dentry(cur->parent, true, 0, pi);
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur));
+ mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur));
}
pi->ctime = g_clock.real_now();
// log + wait
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "chown");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_dir_context(cur->get_parent_dir());
}
// build dir contents
- list<InodeStat*> inls;
- list<string> dnls;
+ bufferlist dirbl;
+
+ DirStat::_encode(dirbl, dir, mds->get_nodeid());
int numfiles = 0;
for (CDir::map_t::iterator it = dir->begin();
}
assert(in);
- InodeStat *st;
- if (in) {
- dout(12) << "including inode " << *in << dendl;
-
- // add this item
- // note: InodeStat makes note of whether inode data is readable.
- st = new InodeStat(in, mds->get_nodeid());
- } else {
- assert(0);
- /*
- assert(dn->is_remote());
- dout(12) << "including inode-less (remote) dentry " << *dn << dendl;
- st = new InodeStat;
- st->mask = STAT_MASK_INO | STAT_MASK_TYPE;
- memset(&st->inode, 0, sizeof(st->inode));
- st->inode.ino = dn->get_remote_ino();
- st->inode.mode = DT_TO_MODE(dn->get_remote_d_type());
- */
- }
+
+ assert(in);
- dnls.push_back( it->first );
- inls.push_back(st);
- numfiles++;
+ dout(12) << "including inode " << *in << dendl;
+
+ // add this dentry + inodeinfo
+ ::_encode(it->first, dirbl);
+ InodeStat::_encode(dirbl, in);
// touch it
mdcache->lru.lru_touch(dn);
// yay, reply
MClientReply *reply = new MClientReply(req);
- reply->take_dir_items(dnls, inls, numfiles);
+ reply->take_dir_items(dirbl);
dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << dendl;
reply->set_result(0);
CDentry *dn;
CInode *newi;
version_t dirpv;
+ version_t newdirpv;
public:
- C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_) :
+ C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_, version_t newdirpv_=0) :
mds(m), mdr(r), dn(d), newi(ni),
- dirpv(dirpv_) {}
+ dirpv(dirpv_), newdirpv(newdirpv_) {}
void finish(int r) {
assert(r == 0);
dn->get_dir()->link_primary_inode(dn, newi);
// dirty inode, dn, dir
- newi->mark_dirty(newi->inode.version + 1);
+ newi->mark_dirty(newi->inode.version + 1, mdr->ls);
+
+ // mkdir?
+ if (newdirpv) {
+ CDir *dir = newi->get_dirfrag(frag_t());
+ assert(dir);
+ dir->mark_dirty(newdirpv, mdr->ls);
+ }
// dir inode's mtime
- mds->server->dirty_dn_diri(dn, dirpv, newi->inode.ctime);
+ mds->server->dirty_dn_diri(mdr, dn, dirpv);
// hit pop
mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR);
newi->inode.version = dn->pre_dirty() - 1;
// prepare finisher
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "mknod");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
le->metablob.add_primary_dentry(dn, true, newi, &newi->inode);
// log + wait
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv));
+ mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv));
}
// ...and that new dir is empty.
CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t());
newdir->mark_complete();
- newdir->mark_dirty(newdir->pre_dirty());
+ version_t newdirpv = newdir->pre_dirty();
//if (mds->logger) mds->logger->inc("mkdir");
// prepare finisher
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "mkdir");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
le->metablob.add_dir(newdir, true, true); // dirty AND complete
// log + wait
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv));
+ mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv, newdirpv));
/* old export heuristic. pbly need to reimplement this at some point.
if (
if (!dn) return;
mdr->now = g_clock.real_now();
+
CInode *newi = prepare_new_inode(mdr, dn->dir);
assert(newi);
newi->inode.version = dn->pre_dirty() - 1;
// prepare finisher
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "symlink");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
le->metablob.add_primary_dentry(dn, true, newi, &newi->inode);
// log + wait
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv));
+ mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv));
}
{
dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
+ mdr->ls = mdlog->get_current_segment();
+
// predirty NEW dentry
version_t dnpv = dn->pre_dirty();
version_t tipv = targeti->pre_dirty();
le->metablob.add_dir_context(targeti->get_parent_dir());
le->metablob.add_primary_dentry(targeti->parent, true, targeti, pi); // update old primary
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(new C_MDS_link_local_finish(mds, mdr, dn, targeti, dnpv, tipv, dirpv));
+ mdlog->submit_entry(le, new C_MDS_link_local_finish(mds, mdr, dn, targeti, dnpv, tipv, dirpv));
}
void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti,
// link and unlock the NEW dentry
dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode));
- dn->mark_dirty(dnpv);
+ dn->mark_dirty(dnpv, mdr->ls);
// target inode
- targeti->pop_and_dirty_projected_inode();
+ targeti->pop_and_dirty_projected_inode(mdr->ls);
// new dentry dir mtime
- dirty_dn_diri(dn, dirpv, mdr->now);
+ dirty_dn_diri(mdr, dn, dirpv);
// bump target popularity
mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR);
// 1. send LinkPrepare to dest (journal nlink++ prepare)
int linkauth = targeti->authority().first;
- if (mdr->witnessed.count(linkauth) == 0) {
+ if (mdr->more()->witnessed.count(linkauth) == 0) {
dout(10) << " targeti auth must prepare nlink++" << dendl;
MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREP);
req->now = mdr->now;
mds->send_message_mds(req, linkauth, MDS_PORT_SERVER);
- assert(mdr->waiting_on_slave.count(linkauth) == 0);
- mdr->waiting_on_slave.insert(linkauth);
+ assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
+ mdr->more()->waiting_on_slave.insert(linkauth);
return;
}
dout(10) << " targeti auth has prepared nlink++" << dendl;
dn->pre_dirty();
// add to event
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "link_remote");
le->metablob.add_client_req(mdr->reqid);
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime
mdr->committing = true;
// log + wait
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv));
+ mdlog->submit_entry(le, new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv));
}
void Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti,
// link the new dentry
dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode));
- dn->mark_dirty(dpv);
+ dn->mark_dirty(dpv, mdr->ls);
// dir inode's mtime
- dirty_dn_diri(dn, dirpv, mdr->now);
+ dirty_dn_diri(mdr, dn, dirpv);
// bump target popularity
mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR);
}
}
+ // journal it
+ mdr->ls = mdlog->get_current_segment();
+ ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE);
inode_t *pi = dn->inode->project_inode();
+ // rollback case
+ le->rollback.add_dir_context(targeti->get_parent_dir());
+ le->rollback.add_primary_dentry(dn, true, targeti, pi); // update old primary
+
// update journaled target inode
bool inc;
if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
dout(10) << " projected inode " << pi << " v " << pi->version << dendl;
- // journal it
- ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE);
- le->metablob.add_dir_context(targeti->get_parent_dir());
- le->metablob.add_primary_dentry(dn, true, targeti, pi); // update old primary
- mds->mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc));
+ // commit case
+ le->commit.add_dir_context(targeti->get_parent_dir());
+ le->commit.add_primary_dentry(dn, true, targeti, pi); // update old primary
+
+ mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc));
}
class C_MDS_SlaveLinkCommit : public Context {
version_t old_version = targeti->inode.version;
// update the target
- targeti->pop_and_dirty_projected_inode();
+ targeti->pop_and_dirty_projected_inode(mdr->ls);
// hit pop
mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR);
mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER);
// set up commit waiter
- mdr->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, old_ctime, old_version, inc);
+ mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, old_ctime, old_version, inc);
// done.
delete mdr->slave_request;
targeti->inode.nlink--;
}
- mds->mdlog->submit_entry(le);
+ mdlog->submit_entry(le);
}
int from = m->get_source().num();
// note slave
- mdr->slaves.insert(from);
+ mdr->more()->slaves.insert(from);
// witnessed!
- assert(mdr->witnessed.count(from) == 0);
- mdr->witnessed.insert(from);
+ assert(mdr->more()->witnessed.count(from) == 0);
+ mdr->more()->witnessed.insert(from);
// remove from waiting list
- assert(mdr->waiting_on_slave.count(from));
- mdr->waiting_on_slave.erase(from);
+ assert(mdr->more()->waiting_on_slave.count(from));
+ mdr->more()->waiting_on_slave.erase(from);
- assert(mdr->waiting_on_slave.empty());
+ assert(mdr->more()->waiting_on_slave.empty());
dispatch_client_request(mdr); // go again!
}
dout(10) << " straydn is " << *straydn << dendl;
assert(straydn->is_null());
- if (!mdr->dst_reanchor_atid &&
+ if (!mdr->more()->dst_reanchor_atid &&
dn->inode->is_anchored()) {
dout(10) << "reanchoring to stray " << *dn->inode << dendl;
vector<Anchor> trace;
straydn->make_anchor_trace(trace, dn->inode);
- mds->anchorclient->prepare_update(dn->inode->ino(), trace, &mdr->dst_reanchor_atid,
+ mds->anchorclient->prepare_update(dn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid,
new C_MDS_RetryRequest(mdcache, mdr));
return;
}
dout(10) << "_unlink_local " << *dn << dendl;
// ok, let's do it.
+ mdr->ls = mdlog->get_current_segment();
+
// prepare log entry
EUpdate *le = new EUpdate(mdlog, "unlink_local");
le->metablob.add_client_req(mdr->reqid);
le->metablob.add_dir_context(dn->get_dir());
le->metablob.add_null_dentry(dn, true);
- if (mdr->dst_reanchor_atid)
- le->metablob.add_anchor_transaction(mdr->dst_reanchor_atid);
+ if (mdr->more()->dst_reanchor_atid)
+ le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid);
// log + wait
journal_opens(); // journal pending opens, just in case
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(new C_MDS_unlink_local_finish(mds, mdr, dn, straydn,
- dirpv));
+ mdlog->submit_entry(le, new C_MDS_unlink_local_finish(mds, mdr, dn, straydn,
+ dirpv));
}
void Server::_unlink_local_finish(MDRequest *mdr,
}
// nlink--, dirty old dentry
- in->pop_and_dirty_projected_inode();
- dn->mark_dirty(dnpv);
+ in->pop_and_dirty_projected_inode(mdr->ls);
+ dn->mark_dirty(dnpv, mdr->ls);
// dir inode's mtime
- dirty_dn_diri(dn, dirpv, mdr->now);
+ dirty_dn_diri(mdr, dn, dirpv);
// share unlink news with replicas
for (map<int,int>::iterator it = dn->replicas_begin();
}
// commit anchor update?
- if (mdr->dst_reanchor_atid)
- mds->anchorclient->commit(mdr->dst_reanchor_atid);
+ if (mdr->more()->dst_reanchor_atid)
+ mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls);
// bump pop
//mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR);
// 1. send LinkPrepare to dest (journal nlink-- prepare)
int inauth = dn->inode->authority().first;
- if (mdr->witnessed.count(inauth) == 0) {
+ if (mdr->more()->witnessed.count(inauth) == 0) {
dout(10) << " inode auth must prepare nlink--" << dendl;
MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNLINKPREP);
req->now = mdr->now;
mds->send_message_mds(req, inauth, MDS_PORT_SERVER);
- assert(mdr->waiting_on_slave.count(inauth) == 0);
- mdr->waiting_on_slave.insert(inauth);
+ assert(mdr->more()->waiting_on_slave.count(inauth) == 0);
+ mdr->more()->waiting_on_slave.insert(inauth);
return;
}
dout(10) << " inode auth has prepared nlink--" << dendl;
// ok, let's do it.
// prepare log entry
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "unlink_remote");
le->metablob.add_client_req(mdr->reqid);
le->metablob.add_dir_context(dn->get_dir());
le->metablob.add_null_dentry(dn, true);
- if (mdr->dst_reanchor_atid)
- le->metablob.add_anchor_transaction(mdr->dst_reanchor_atid);
+ if (mdr->more()->dst_reanchor_atid)
+ le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid);
// finisher
C_MDS_unlink_remote_finish *fin = new C_MDS_unlink_remote_finish(mds, mdr, dn, dirpv);
mdr->committing = true;
// log + wait
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(fin);
+ mdlog->submit_entry(le, fin);
}
void Server::_unlink_remote_finish(MDRequest *mdr,
// unlink main dentry
dn->dir->unlink_inode(dn);
- dn->mark_dirty(dnpv); // dirty old dentry
+ dn->mark_dirty(dnpv, mdr->ls); // dirty old dentry
// dir inode's mtime
- dirty_dn_diri(dn, dirpv, mdr->now);
+ dirty_dn_diri(mdr, dn, dirpv);
// share unlink news with replicas
for (map<int,int>::iterator it = dn->replicas_begin();
}
// commit anchor update?
- if (mdr->dst_reanchor_atid)
- mds->anchorclient->commit(mdr->dst_reanchor_atid);
+ if (mdr->more()->dst_reanchor_atid)
+ mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls);
//mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR);
rdlocks.insert(&srctrace[i]->lock);
xlocks.insert(&srcdn->lock);
wrlocks.insert(&srcdn->dir->inode->dirlock);
- rdlocks.insert(&srcdn->dir->inode->dirfragtreelock); // rd lock on srci dirfragtree.
+ /*
+ * no, this causes problems if the dftlock is scattered...
+ * and what was i thinking anyway?
+ * rdlocks.insert(&srcdn->dir->inode->dirfragtreelock); // rd lock on srci dirfragtree.
+ */
// rdlock destdir path, xlock dest dentry
for (int i=0; i<(int)desttrace.size(); i++)
xlocks.insert(&destdn->lock);
wrlocks.insert(&destdn->dir->inode->dirlock);
+ // xlock versionlock on srci if remote?
+ // this ensures it gets safely remotely auth_pinned, avoiding deadlock;
+ // strictly speaking, having the slave node freeze the inode is
+ // otherwise sufficient for avoiding conflicts with inode locks, etc.
+ if (!srcdn->is_auth() && srcdn->is_primary())
+ xlocks.insert(&srcdn->inode->versionlock);
+
// xlock oldin (for nlink--)
if (oldin) xlocks.insert(&oldin->linklock);
++p) {
CDir *dir = srci->get_dirfrag(*p);
if (!dir) {
- dout(10) << " opening " << *dir << dendl;
+ dout(10) << " opening " << *p << " under " << *srci << dendl;
mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
return;
}
* (currently, it can ignore rename effects, because the resolve
* stage will sort them out.)
*/
- set<int> witnesses = mdr->extra_witnesses;
+ set<int> witnesses = mdr->more()->extra_witnesses;
if (srcdn->is_auth())
srcdn->list_replicas(witnesses);
else
witnesses.insert(srcdn->authority().first);
destdn->list_replicas(witnesses);
+ dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
+ // do srcdn auth last
+ int last = -1;
+ if (!srcdn->is_auth())
+ last = srcdn->authority().first;
+
for (set<int>::iterator p = witnesses.begin();
p != witnesses.end();
++p) {
- if (mdr->witnessed.count(*p)) {
+ if (*p == last) continue; // do it last!
+ if (mdr->more()->witnessed.count(*p)) {
dout(10) << " already witnessed by mds" << *p << dendl;
+ } else if (mdr->more()->waiting_on_slave.count(*p)) {
+ dout(10) << " already waiting on witness mds" << *p << dendl;
} else {
- dout(10) << " not yet witnessed by mds" << *p << ", sending prepare" << dendl;
- MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP);
- srcdn->make_path(req->srcdnpath);
- destdn->make_path(req->destdnpath);
- req->now = mdr->now;
-
- if (straydn) {
- CInodeDiscover *indis = straydn->dir->inode->replicate_to(*p);
- CDirDiscover *dirdis = straydn->dir->replicate_to(*p);
- CDentryDiscover *dndis = straydn->replicate_to(*p);
- indis->_encode(req->stray);
- dirdis->_encode(req->stray);
- dndis->_encode(req->stray);
- delete indis;
- delete dirdis;
- delete dndis;
- }
-
- mds->send_message_mds(req, *p, MDS_PORT_SERVER);
-
- assert(mdr->waiting_on_slave.count(*p) == 0);
- mdr->waiting_on_slave.insert(*p);
+ _rename_prepare_witness(mdr, *p, srcdn, destdn, straydn);
}
}
- if (!mdr->waiting_on_slave.empty())
+ if (!mdr->more()->waiting_on_slave.empty())
return; // we're waiting for a witness.
- // -- inode migration? --
- if (!srcdn->is_auth() &&
- srcdn->is_primary()) {
- if (mdr->inode_import.length() == 0) {
- // get inode
- int auth = srcdn->authority().first;
- dout(10) << " requesting inode export from srcdn auth mds" << auth << dendl;
- MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEGETINODE);
- srcdn->make_path(req->srcdnpath);
- mds->send_message_mds(req, auth, MDS_PORT_SERVER);
-
- assert(mdr->waiting_on_slave.count(auth) == 0);
- mdr->waiting_on_slave.insert(auth);
- return;
- } else {
- dout(10) << " already (just!) got inode export from srcdn auth" << dendl;
- /*int off = 0;
- mdcache->migrator->decode_import_inode(destdn, mdr->inode_import, off,
- srcdn->authority().first);
- srcdn->inode->force_auth.first = srcdn->authority().first;
- */
- }
+ if (last >= 0 &&
+ mdr->more()->witnessed.count(last) == 0 &&
+ mdr->more()->waiting_on_slave.count(last) == 0) {
+ dout(10) << " preparing last witness (srcdn auth)" << dendl;
+ _rename_prepare_witness(mdr, last, srcdn, destdn, straydn);
+ return;
}
// -- prepare anchor updates --
if (srcdn->is_primary() && srcdn->inode->is_anchored() &&
srcdn->dir != destdn->dir &&
- !mdr->src_reanchor_atid) {
+ !mdr->more()->src_reanchor_atid) {
dout(10) << "reanchoring src->dst " << *srcdn->inode << dendl;
vector<Anchor> trace;
destdn->make_anchor_trace(trace, srcdn->inode);
anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr));
- mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &mdr->src_reanchor_atid,
+ mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &mdr->more()->src_reanchor_atid,
anchorgather->new_sub());
}
if (destdn->is_primary() &&
destdn->inode->is_anchored() &&
- !mdr->dst_reanchor_atid) {
+ !mdr->more()->dst_reanchor_atid) {
dout(10) << "reanchoring dst->stray " << *destdn->inode << dendl;
assert(straydn);
if (!anchorgather)
anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr));
- mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &mdr->dst_reanchor_atid,
+ mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid,
anchorgather->new_sub());
}
}
// -- prepare journal entry --
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "rename");
le->metablob.add_client_req(mdr->reqid);
mdr->committing = true;
// log + wait
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(fin);
+ mdlog->submit_entry(le, fin);
}
_rename_apply(mdr, srcdn, destdn, straydn);
// commit anchor updates?
- if (mdr->src_reanchor_atid) mds->anchorclient->commit(mdr->src_reanchor_atid);
- if (mdr->dst_reanchor_atid) mds->anchorclient->commit(mdr->dst_reanchor_atid);
+ if (mdr->more()->src_reanchor_atid)
+ mds->anchorclient->commit(mdr->more()->src_reanchor_atid, mdr->ls);
+ if (mdr->more()->dst_reanchor_atid)
+ mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls);
// bump popularity
//if (srcdn->is_auth())
// helpers
+// _rename_prepare_witness: ask mds 'who' to witness (slave-prepare) this
+// rename.  Builds an OP_RENAMEPREP slave request carrying the src/dest
+// dentry paths, the request timestamp, a replica of the straydn
+// inode/dir/dentry chain (when a stray dentry is involved), and the set of
+// witnesses already collected.  Inserts 'who' into waiting_on_slave; the
+// caller stalls until the corresponding ack removes it.
+void Server::_rename_prepare_witness(MDRequest *mdr, int who, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
+{
+ dout(10) << "_rename_prepare_witness mds" << who << dendl;
+ MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP);
+ srcdn->make_path(req->srcdnpath);
+ destdn->make_path(req->destdnpath);
+ req->now = mdr->now;
+
+ if (straydn) {
+ // replicate the stray dentry chain to the witness; the discover
+ // objects are heap temporaries, encoded into req->stray then freed.
+ CInodeDiscover *indis = straydn->dir->inode->replicate_to(who);
+ CDirDiscover *dirdis = straydn->dir->replicate_to(who);
+ CDentryDiscover *dndis = straydn->replicate_to(who);
+ indis->_encode(req->stray);
+ dirdis->_encode(req->stray);
+ dndis->_encode(req->stray);
+ delete indis;
+ delete dirdis;
+ delete dndis;
+ }
+
+ // srcdn auth will verify our current witness list is sufficient
+ req->witnesses = mdr->more()->witnessed;
+
+ mds->send_message_mds(req, who, MDS_PORT_SERVER);
+
+ assert(mdr->more()->waiting_on_slave.count(who) == 0);
+ mdr->more()->waiting_on_slave.insert(who);
+}
+
+
void Server::_rename_prepare(MDRequest *mdr,
EMetaBlob *metablob,
CDentry *srcdn, CDentry *destdn, CDentry *straydn)
(srcdn->is_primary() || destdn->is_primary()));
if (mdr->is_master()) {
- mdr->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob);
+ mdr->more()->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob);
if (destdn->dir != srcdn->dir)
- mdr->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob);
+ mdr->more()->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob);
}
inode_t *ji = 0; // journaled inode getting nlink--
// destdn -> primary
metablob->add_dir_context(destdn->dir);
if (destdn->is_auth())
- ipv = mdr->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version);
+ ipv = mdr->more()->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version);
ji = metablob->add_primary_dentry(destdn, true, destdn->inode);
// do src dentry
metablob->add_dir_context(srcdn->dir);
if (srcdn->is_auth())
- mdr->pvmap[srcdn] = srcdn->pre_dirty();
+ mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
metablob->add_null_dentry(srcdn, true);
} else {
// link-- inode, move to stray dir.
metablob->add_dir_context(straydn->dir);
if (straydn->is_auth())
- ipv = mdr->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version);
+ ipv = mdr->more()->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version);
ji = metablob->add_primary_dentry(straydn, true, destdn->inode);
}
else if (destdn->is_remote()) {
// nlink-- targeti
metablob->add_dir_context(destdn->inode->get_parent_dir());
if (destdn->inode->is_auth())
- ipv = mdr->pvmap[destdn->inode] = destdn->inode->pre_dirty();
+ ipv = mdr->more()->pvmap[destdn->inode] = destdn->inode->pre_dirty();
ji = metablob->add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary
dout(10) << "remote targeti (nlink--) is " << *destdn->inode << dendl;
}
if (srcdn->is_auth())
siv = srcdn->inode->get_projected_version();
else
- siv = mdr->inode_import_v;
- mdr->pvmap[destdn] = destdn->pre_dirty(siv+1);
+ siv = mdr->more()->inode_import_v;
+ mdr->more()->pvmap[destdn] = destdn->pre_dirty(siv+1);
}
metablob->add_primary_dentry(destdn, true, srcdn->inode);
assert(srcdn->is_remote());
dout(10) << "src is a remote dentry" << dendl;
if (destdn->is_auth())
- mdr->pvmap[destdn] = destdn->pre_dirty();
+ mdr->more()->pvmap[destdn] = destdn->pre_dirty();
metablob->add_remote_dentry(destdn, true, srcdn->get_remote_ino());
}
// remove src dentry
metablob->add_dir_context(srcdn->dir);
if (srcdn->is_auth())
- mdr->pvmap[srcdn] = srcdn->pre_dirty();
+ mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
metablob->add_null_dentry(srcdn, true);
// new subtree?
}
// anchor updates?
- if (mdr->src_reanchor_atid)
- metablob->add_anchor_transaction(mdr->src_reanchor_atid);
- if (mdr->dst_reanchor_atid)
- metablob->add_anchor_transaction(mdr->dst_reanchor_atid);
+ if (mdr->more()->src_reanchor_atid)
+ metablob->add_anchor_transaction(mdr->more()->src_reanchor_atid);
+ if (mdr->more()->dst_reanchor_atid)
+ metablob->add_anchor_transaction(mdr->more()->dst_reanchor_atid);
}
void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
- dout(10) << " pvs " << mdr->pvmap << dendl;
+ dout(10) << " pvs " << mdr->more()->pvmap << dendl;
CInode *oldin = destdn->inode;
// dir mtimes
if (mdr->is_master()) {
- dirty_dn_diri(destdn, mdr->pvmap[destdn->dir->inode], mdr->now);
+ dirty_dn_diri(mdr, destdn, mdr->more()->pvmap[destdn->dir->inode]);
if (destdn->dir != srcdn->dir)
- dirty_dn_diri(srcdn, mdr->pvmap[srcdn->dir->inode], mdr->now);
+ dirty_dn_diri(mdr, srcdn, mdr->more()->pvmap[srcdn->dir->inode]);
}
if (linkmerge) {
destdn->inode->inode.nlink--;
destdn->inode->inode.ctime = mdr->now;
if (destdn->inode->is_auth())
- destdn->inode->mark_dirty(mdr->pvmap[destdn]);
+ destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
// unlink srcdn
srcdn->dir->unlink_inode(srcdn);
if (srcdn->is_auth())
- srcdn->mark_dirty(mdr->pvmap[srcdn]);
+ srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
} else {
dout(10) << "merging primary onto remote link" << dendl;
assert(srcdn->is_primary());
destdn->inode->inode.nlink--;
destdn->inode->inode.ctime = mdr->now;
if (destdn->inode->is_auth())
- destdn->inode->mark_dirty(mdr->pvmap[destdn]);
+ destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
// mark src dirty
if (srcdn->is_auth())
- srcdn->mark_dirty(mdr->pvmap[srcdn]);
+ srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
}
}
else {
oldin->inode.nlink--;
oldin->inode.ctime = mdr->now;
if (oldin->is_auth())
- oldin->pop_and_dirty_projected_inode();
+ oldin->pop_and_dirty_projected_inode(mdr->ls);
}
else if (oldin) {
// nlink-- remote. destdn was remote.
oldin->inode.nlink--;
oldin->inode.ctime = mdr->now;
if (oldin->is_auth())
- oldin->pop_and_dirty_projected_inode();
+ oldin->pop_and_dirty_projected_inode(mdr->ls);
}
CInode *in = srcdn->inode;
destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode));
destdn->link_remote(in);
if (destdn->is_auth())
- destdn->mark_dirty(mdr->pvmap[destdn]);
+ destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
} else {
// srcdn was primary.
srcdn->dir->unlink_inode(srcdn);
// srcdn inode import?
if (!srcdn->is_auth() && destdn->is_auth()) {
- assert(mdr->inode_import.length() > 0);
- int off = 0;
+ assert(mdr->more()->inode_import.length() > 0);
+ bufferlist::iterator blp = mdr->more()->inode_import.begin();
map<int,entity_inst_t> imported_client_map;
- ::_decode(imported_client_map, mdr->inode_import, off);
- mdcache->migrator->decode_import_inode(destdn, mdr->inode_import, off,
+ list<ScatterLock*> updated_scatterlocks; // we clear_updated explicitly below
+ ::_decode_simple(imported_client_map, blp);
+ mdcache->migrator->decode_import_inode(destdn, blp,
srcdn->authority().first,
- imported_client_map);
+ imported_client_map,
+ mdr->ls,
+ updated_scatterlocks);
+ destdn->inode->dirlock.clear_updated();
}
if (destdn->inode->is_auth())
- destdn->inode->mark_dirty(mdr->pvmap[destdn]);
+ destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
}
if (srcdn->is_auth())
- srcdn->mark_dirty(mdr->pvmap[srcdn]);
+ srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
}
// update subtree map?
mdr->now = mdr->slave_request->now;
+ // set up commit waiter (early, to clean up any freezing etc we do)
+ if (!mdr->more()->slave_commit)
+ mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
+
+ // am i srcdn auth?
+ if (srcdn->is_auth()) {
+ if (srcdn->is_primary() &&
+ !srcdn->inode->is_freezing_inode() &&
+ !srcdn->inode->is_frozen_inode()) {
+ // srci auth.
+ // set ambiguous auth.
+ srcdn->inode->state_set(CInode::STATE_AMBIGUOUSAUTH);
+
+ // freeze?
+ // we need this to
+ // - avoid conflicting lock state changes
+ // - avoid concurrent updates to the inode
+ // (this could also be accomplished with the versionlock)
+ int allowance = 1; // for the versionlock and possible linklock xlock (both are tied to mdr)
+ dout(10) << " freezing srci " << *srcdn->inode << " with allowance " << allowance << dendl;
+ if (!srcdn->inode->freeze_inode(allowance)) {
+ srcdn->inode->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
+ return;
+ }
+ }
+
+ // is witness list sufficient?
+ set<int> srcdnrep;
+ srcdn->list_replicas(srcdnrep);
+ for (set<int>::iterator p = srcdnrep.begin();
+ p != srcdnrep.end();
+ ++p) {
+ if (*p == mdr->slave_to_mds ||
+ mdr->slave_request->witnesses.count(*p)) continue;
+ dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
+ MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK);
+ reply->witnesses.swap(srcdnrep);
+ mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER);
+ delete mdr->slave_request;
+ mdr->slave_request = 0;
+ return;
+ }
+ dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
+ }
+
// journal it?
if (srcdn->is_auth() ||
- destdn->inode->is_auth() ||
+ (destdn->inode && destdn->inode->is_auth()) ||
srcdn->inode->is_any_caps()) {
// journal.
+ mdr->ls = mdlog->get_current_segment();
ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE);
- _rename_prepare(mdr, &le->metablob, srcdn, destdn, straydn);
- mds->mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn));
+
+ // rollback case
+ if (destdn->inode && destdn->inode->is_auth()) {
+ assert(destdn->is_remote());
+ le->rollback.add_dir_context(destdn->dir);
+ le->rollback.add_dentry(destdn, true);
+ }
+ if (srcdn->is_auth() ||
+ (srcdn->inode && srcdn->inode->is_auth())) {
+ le->rollback.add_dir_context(srcdn->dir);
+ le->rollback.add_dentry(srcdn, true);
+ }
+
+ // commit case
+ _rename_prepare(mdr, &le->commit, srcdn, destdn, straydn);
+
+ mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn));
} else {
// don't journal.
dout(10) << "not journaling, i'm not auth for anything, and srci isn't open" << dendl;
+
+ // prepare anyway; this may twiddle dir_auth
+ EMetaBlob blah;
+ _rename_prepare(mdr, &blah, srcdn, destdn, straydn);
_logged_slave_rename(mdr, srcdn, destdn, straydn);
}
}
{
dout(10) << "_logged_slave_rename " << *mdr << dendl;
- // ack
+ // prepare ack
MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK);
- if (srcdn->is_auth()) {
- // share the replica list, so that they can all witness the rename.
- srcdn->list_replicas(reply->srcdn_replicas);
+
+ // export srci?
+ if (srcdn->is_auth() && srcdn->is_primary()) {
+ list<Context*> finished;
+ map<int,entity_inst_t> exported_client_map;
+ bufferlist inodebl;
+ mdcache->migrator->encode_export_inode(srcdn->inode, inodebl,
+ exported_client_map);
+ mdcache->migrator->finish_export_inode(srcdn->inode, mdr->now, finished);
+ mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
+ ::_encode(exported_client_map, reply->inode_export);
+ reply->inode_export.claim_append(inodebl);
+ reply->inode_export_v = srcdn->inode->inode.version;
+
+ // remove mdr auth pin
+ mdr->auth_unpin(srcdn->inode);
+ assert(!srcdn->inode->is_auth_pinned());
+
+ dout(10) << " exported srci " << *srcdn->inode << dendl;
+ }
- // note srcdn, we'll get asked for inode momentarily
- mdr->srcdn = srcdn;
- }
+ // apply
+ _rename_apply(mdr, srcdn, destdn, straydn);
mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER);
- // set up commit waiter
- mdr->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
-
// bump popularity
//if (srcdn->is_auth())
//mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR);
- if (destdn->inode->is_auth())
+ if (destdn->inode && destdn->inode->is_auth())
mds->balancer->hit_inode(mdr->now, destdn->inode, META_POP_IWR);
// done.
{
dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
+ // unfreeze+singleauth inode
+ // hmm, do i really need to delay this?
+ if (srcdn->is_auth() && destdn->is_primary()) {
+ dout(10) << " unfreezing exported inode " << *destdn->inode << dendl;
+ list<Context*> finished;
+
+ // singleauth
+ assert(destdn->inode->state_test(CInode::STATE_AMBIGUOUSAUTH));
+ destdn->inode->state_clear(CInode::STATE_AMBIGUOUSAUTH);
+ destdn->inode->take_waiting(CInode::WAIT_SINGLEAUTH, finished);
+
+ // unfreeze
+ assert(destdn->inode->is_frozen_inode() ||
+ destdn->inode->is_freezing_inode());
+ destdn->inode->unfreeze_inode(finished);
+
+ mds->queue_waiters(finished);
+ }
+
+
ESlaveUpdate *le;
if (r == 0) {
- // commit
- _rename_apply(mdr, srcdn, destdn, straydn);
-
// write a commit to the journal
le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT);
+
} else {
// abort
le = new ESlaveUpdate(mdlog, "slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK);
- }
- mds->mdlog->submit_entry(le);
-}
-void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m)
-{
- dout(10) << "handle_slave_rename_prep_ack " << *mdr
- << " witnessed by " << m->get_source()
- << " " << *m << dendl;
- int from = m->get_source().num();
+ // -- rollback in memory --
- // note slave
- mdr->slaves.insert(from);
+ if (mdr->more()->was_link_merge) {
+ // link merge
+ CInode *in = destdn->inode;
+ in->inode.nlink++;
+ if (mdr->more()->destdn_was_remote_inode) {
+ destdn->dir->unlink_inode(destdn);
+ srcdn->dir->link_primary_inode(srcdn, in);
+ destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode));
+ } else {
+ srcdn->dir->link_remote_inode(srcdn, in->ino(), MODE_TO_DT(in->inode.mode));
+ }
+ } else {
+ // normal
- // witnessed!
- assert(mdr->witnessed.count(from) == 0);
- mdr->witnessed.insert(from);
+ // revert srcdn
+ if (destdn->is_remote()) {
+ srcdn->dir->link_remote_inode(srcdn, destdn->inode->ino(), MODE_TO_DT(destdn->inode->inode.mode));
+ destdn->dir->unlink_inode(destdn);
+ } else {
+ // renamed a primary
+ CInode *in = destdn->inode;
+ destdn->dir->unlink_inode(destdn);
+ srcdn->dir->link_primary_inode(srcdn, in);
+ }
+
+ // revert destdn
+ if (mdr->more()->destdn_was_remote_inode) {
+ destdn->dir->link_remote_inode(destdn,
+ mdr->more()->destdn_was_remote_inode->ino(),
+ MODE_TO_DT(mdr->more()->destdn_was_remote_inode->inode.mode));
+ mdr->more()->destdn_was_remote_inode->inode.nlink++;
+ } else if (straydn && straydn->inode) {
+ CInode *in = straydn->inode;
+ straydn->dir->unlink_inode(straydn);
+ destdn->dir->link_primary_inode(destdn, in);
+ straydn->dir->remove_dentry(straydn);
+ }
+ }
+ // FIXME: reverse srci export?
+
+ dout(-10) << " srcdn back to " << *srcdn << dendl;
+ dout(-10) << " srci back to " << *srcdn->inode << dendl;
+ dout(-10) << " destdn back to " << *destdn << dendl;
+ if (destdn->inode) dout(-10) << " desti back to " << *destdn->inode << dendl;
+
+ // *** WRITE ME ***
+ assert(0);
- // add extra witnesses?
- if (!m->srcdn_replicas.empty()) {
- dout(10) << " extra witnesses (srcdn replicas) are " << m->srcdn_replicas << dendl;
- mdr->extra_witnesses = m->srcdn_replicas;
- mdr->extra_witnesses.erase(mds->get_nodeid()); // not me!
}
- // remove from waiting list
- assert(mdr->waiting_on_slave.count(from));
- mdr->waiting_on_slave.erase(from);
+
- if (mdr->waiting_on_slave.empty())
- dispatch_client_request(mdr); // go again!
- else
- dout(10) << "still waiting on slaves " << mdr->waiting_on_slave << dendl;
+ mdlog->submit_entry(le);
}
-
-
-void Server::handle_slave_rename_get_inode(MDRequest *mdr)
+void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
{
- dout(10) << "handle_slave_rename_get_inode " << *mdr << dendl;
-
- assert(mdr->srcdn);
- assert(mdr->srcdn->is_auth());
- assert(mdr->srcdn->is_primary());
-
- // reply
- MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEGETINODEACK);
- dout(10) << " replying with inode export info " << *mdr->srcdn->inode << dendl;
-
- map<int,entity_inst_t> exported_client_map;
- bufferlist inodebl;
- mdcache->migrator->encode_export_inode(mdr->srcdn->inode, inodebl, mdr->slave_to_mds,
- exported_client_map,
- mdr->now);
- ::_encode(exported_client_map, reply->inode_export);
- reply->inode_export.claim_append(inodebl);
-
- reply->inode_export_v = mdr->srcdn->inode->inode.version;
+ dout(10) << "handle_slave_rename_prep_ack " << *mdr
+ << " witnessed by " << ack->get_source()
+ << " " << *ack << dendl;
+ int from = ack->get_source().num();
- mdr->inode_import = reply->inode_export; // keep a copy locally, in case we have to rollback
-
- mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER);
+ // note slave
+ mdr->more()->slaves.insert(from);
- // clean up.
- delete mdr->slave_request;
- mdr->slave_request = 0;
-}
+ // witnessed? or add extra witnesses?
+ assert(mdr->more()->witnessed.count(from) == 0);
+ if (ack->witnesses.empty()) {
+ mdr->more()->witnessed.insert(from);
+ } else {
+ dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
+ mdr->more()->extra_witnesses.swap(ack->witnesses);
+ mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
+ }
-void Server::handle_slave_rename_get_inode_ack(MDRequest *mdr, MMDSSlaveRequest *m)
-{
- dout(10) << "handle_slave_rename_get_inode_ack " << *mdr
- << " " << *m << dendl;
- int from = m->get_source().num();
+ // srci import?
+ if (ack->inode_export.length()) {
+ dout(10) << " got srci import" << dendl;
+ mdr->more()->inode_import.claim(ack->inode_export);
+ mdr->more()->inode_import_v = ack->inode_export_v;
+ }
- assert(m->inode_export.length());
- dout(10) << " got inode export, saving in " << *mdr << dendl;
- mdr->inode_import.claim(m->inode_export);
- mdr->inode_import_v = m->inode_export_v;
+ // remove from waiting list
+ assert(mdr->more()->waiting_on_slave.count(from));
+ mdr->more()->waiting_on_slave.erase(from);
- assert(mdr->waiting_on_slave.count(from));
- mdr->waiting_on_slave.erase(from);
-
- if (mdr->waiting_on_slave.empty())
+ if (mdr->more()->waiting_on_slave.empty())
dispatch_client_request(mdr); // go again!
else
- dout(10) << "still waiting on slaves " << mdr->waiting_on_slave << dendl;
+ dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
-
// ===================================
// TRUNCATE, FSYNC
in->inode.size = size;
in->inode.ctime = ctime;
in->inode.mtime = ctime;
- in->mark_dirty(pv);
+ in->mark_dirty(pv, mdr->ls);
// reply
mds->server->reply_request(mdr, 0);
assert(r == 0);
// purge
- mds->mdcache->purge_inode(&in->inode, size);
- mds->mdcache->wait_for_purge(in->inode.ino, size,
+ mds->mdcache->purge_inode(in, size, in->inode.size, mdr->ls);
+ mds->mdcache->wait_for_purge(in, size,
new C_MDS_truncate_purged(mds, mdr, in, pv, size, ctime));
}
};
pdv, req->args.truncate.length, ctime);
// log + wait
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "truncate");
le->metablob.add_client_req(mdr->reqid);
le->metablob.add_dir_context(cur->get_parent_dir());
- le->metablob.add_inode_truncate(cur->inode, req->args.truncate.length);
+ le->metablob.add_inode_truncate(cur->ino(), req->args.truncate.length, cur->inode.size);
inode_t *pi = le->metablob.add_dentry(cur->parent, true);
pi->mtime = ctime;
pi->ctime = ctime;
pi->version = pdv;
pi->size = req->args.truncate.length;
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(fin);
+
+ mdlog->submit_entry(le, fin);
}
EOpen *le = 0;
// check queued inodes
+ LogSegment *ls = mdlog->get_current_segment();
for (set<CInode*>::iterator p = journal_open_queue.begin();
p != journal_open_queue.end();
++p) {
- (*p)->put(CInode::PIN_BATCHOPENJOURNAL);
- if ((*p)->is_any_caps()) {
+ CInode *in = *p;
+ in->put(CInode::PIN_BATCHOPENJOURNAL);
+ if (in->is_any_caps()) {
if (!le) le = new EOpen(mdlog);
- le->add_inode(*p);
- (*p)->last_open_journaled = mds->mdlog->get_write_pos();
+ le->add_inode(in);
+ in->last_open_journaled = mds->mdlog->get_write_pos();
+ ls->open_files.push_back(&in->xlist_open_file);
}
}
journal_open_queue.clear();
if (le) {
// journal
- mds->mdlog->submit_entry(le);
+ mdlog->submit_entry(le);
// add waiters to journal entry
for (list<Context*>::iterator p = journal_open_waiters.begin();
in->inode.size = 0;
in->inode.ctime = ctime;
in->inode.mtime = ctime;
- in->mark_dirty(pv);
+ in->mark_dirty(pv, mdr->ls);
// do the open
mds->server->_do_open(mdr, in);
mds->balancer->hit_inode(mdr->now, in, META_POP_IWR);
// purge also...
- mds->mdcache->purge_inode(&in->inode, 0);
- mds->mdcache->wait_for_purge(in->inode.ino, 0,
+ mds->mdcache->purge_inode(in, 0, in->inode.size, mdr->ls);
+ mds->mdcache->wait_for_purge(in, 0,
new C_MDS_open_truncate_purged(mds, mdr, in, pv, ctime));
}
};
pdv, ctime);
// log + wait
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "open_truncate");
le->metablob.add_client_req(mdr->reqid);
le->metablob.add_dir_context(cur->get_parent_dir());
- le->metablob.add_inode_truncate(cur->inode, 0);
+ le->metablob.add_inode_truncate(cur->ino(), 0, cur->inode.size);
inode_t *pi = le->metablob.add_dentry(cur->parent, true);
pi->mtime = ctime;
pi->ctime = ctime;
pi->version = pdv;
pi->size = 0;
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(fin);
+ mdlog->submit_entry(le, fin);
}
dn->get_dir()->link_primary_inode(dn, newi);
// dirty inode, dn, dir
- newi->mark_dirty(pv);
+ newi->mark_dirty(pv, mdr->ls);
// downgrade xlock to rdlock
//mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr);
// prepare finisher
C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in);
+ mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "openc");
le->metablob.add_client_req(req->get_reqid());
le->metablob.add_allocated_ino(in->ino(), mds->idalloc->get_version());
le->metablob.add_primary_dentry(dn, true, in, &in->inode);
// log + wait
- mdlog->submit_entry(le);
- mdlog->wait_for_sync(fin);
+ mdlog->submit_entry(le, fin);
/*
FIXME. this needs to be rewritten when the write capability stuff starts
CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr);
version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob);
- void dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime);
+ void dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv);
// requests on existing inodes.
CDentry *srcdn, CDentry *destdn, CDentry *straydn);
// helpers
+ void _rename_prepare_witness(MDRequest *mdr, int who,
+ CDentry *srcdn, CDentry *destdn, CDentry *straydn);
void _rename_prepare(MDRequest *mdr,
EMetaBlob *metablob,
CDentry *srcdn, CDentry *destdn, CDentry *straydn);
void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m);
void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
- void handle_slave_rename_get_inode(MDRequest *mdr);
- void handle_slave_rename_get_inode_ack(MDRequest *mdr, MMDSSlaveRequest *m);
};
// encode/decode
void _encode(bufferlist& bl) {
- ::_encode(state, bl);
- ::_encode(gather_set, bl);
+ ::_encode_simple(state, bl);
+ ::_encode_simple(gather_set, bl);
}
- void _decode(bufferlist& bl, int& off) {
- ::_decode(state, bl, off);
- ::_decode(gather_set, bl, off);
+ void _decode(bufferlist::iterator& p) {
+ ::_decode_simple(state, p);
+ ::_decode_simple(gather_set, p);
}
if (reqmds >= 0) out << " by mds" << reqmds;
}
- bool has_expired(MDS *mds);
- void expire(MDS *mds, Context *c);
- void replay(MDS *mds);
-
+ void update_segment();
+ void replay(MDS *mds);
};
#endif
if (atid) out << " atid " << atid;
}
- bool has_expired(MDS *mds);
- void expire(MDS *mds, Context *c);
void replay(MDS *mds);
};
#include "../CDir.h"
#include "../CDentry.h"
+#include "include/triple.h"
+
class MDS;
class MDLog;
+class LogSegment;
/*
* a bunch of metadata in the journal
::_encode(dn, bl);
::_encode(dnv, bl);
::_encode(inode, bl);
- ::_encode(dirfragtree, bl);
+ dirfragtree._encode(bl);
if (inode.is_symlink())
::_encode(symlink, bl);
::_encode(dirty, bl);
::_decode(dn, bl, off);
::_decode(dnv, bl, off);
::_decode(inode, bl, off);
- ::_decode(dirfragtree, bl, off);
+ dirfragtree._decode(bl, off);
if (inode.is_symlink())
::_decode(symlink, bl, off);
::_decode(dirty, bl, off);
list<nullbit> dnull;
public:
- dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { }
+ dirlump() : dirv(0), state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { }
bool is_complete() { return state & STATE_COMPLETE; }
void mark_complete() { state |= STATE_COMPLETE; }
version_t alloc_tablev;
// inodes i've destroyed.
- list< pair<inode_t,off_t> > truncated_inodes;
+ list< triple<inodeno_t,off_t,off_t> > truncated_inodes;
// idempotent op(s)
list<metareqid_t> client_reqs;
off_t last_subtree_map;
off_t my_offset;
- EMetaBlob() : last_subtree_map(0), my_offset(0) { }
+ // for replay, in certain cases
+ LogSegment *_segment;
+
+ EMetaBlob() : last_subtree_map(0), my_offset(0), _segment(0) { }
EMetaBlob(MDLog *mdl); // defined in journal.cc
void print(ostream& out) {
alloc_tablev = tablev;
}
- void add_inode_truncate(const inode_t& inode, off_t newsize) {
- truncated_inodes.push_back(pair<inode_t,off_t>(inode, newsize));
+ void add_inode_truncate(inodeno_t ino, off_t newsize, off_t oldsize) {
+ truncated_inodes.push_back(triple<inodeno_t,off_t,off_t>(ino, newsize, oldsize));
}
void add_null_dentry(CDentry *dn, bool dirty) {
bool has_expired(MDS *mds);
void expire(MDS *mds, Context *c);
- void replay(MDS *mds);
+ void update_segment(LogSegment *ls);
+ void replay(MDS *mds, LogSegment *ls=0);
};
inline ostream& operator<<(ostream& out, const EMetaBlob& t) {
metablob._decode(bl, off);
}
- bool has_expired(MDS *mds);
- void expire(MDS *mds, Context *c);
+ void update_segment();
void replay(MDS *mds);
};
class EPurgeFinish : public LogEvent {
protected:
inodeno_t ino;
- off_t newsize;
+ off_t newsize, oldsize;
public:
- EPurgeFinish(inodeno_t i, off_t s) :
+ EPurgeFinish(inodeno_t i, off_t ns, off_t os) :
LogEvent(EVENT_PURGEFINISH),
- ino(i), newsize(s) { }
+ ino(i), newsize(ns), oldsize(os) { }
EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { }
void print(ostream& out) {
- out << "purgefinish " << ino << " to " << newsize;
+ // report the size transition (old -> new), not just the target size
+ out << "purgefinish " << ino << " " << oldsize << " -> " << newsize;
}
virtual void encode_payload(bufferlist& bl) {
bl.append((char*)&ino, sizeof(ino));
bl.append((char*)&newsize, sizeof(newsize));
+ bl.append((char*)&oldsize, sizeof(oldsize));
}
void decode_payload(bufferlist& bl, int& off) {
- bl.copy(off, sizeof(ino), (char*)&ino);
- off += sizeof(ino);
- bl.copy(off, sizeof(newsize), (char*)&newsize);
- off += sizeof(newsize);
+ ::_decode(ino, bl, off);
+ ::_decode(newsize, bl, off);
+ ::_decode(oldsize, bl, off);
}
bool has_expired(MDS *mds);
void expire(MDS *mds, Context *c);
+ void update_segment();
void replay(MDS *mds);
-
};
#endif
bool has_expired(MDS *mds);
void expire(MDS *mds, Context *c);
- void replay(MDS *mds);
-
+ void update_segment();
+ void replay(MDS *mds);
};
#endif
const static int OP_COMMIT = 2;
const static int OP_ROLLBACK = 3;
- EMetaBlob metablob;
+ /*
+ * we journal a rollback metablob that contains the unmodified metadata
+ * too, because we may be updating previously dirty metadata, which
+ * will allow old log segments to be trimmed. if we end up rolling back,
+ * those updates could be lost.. so we re-journal the unmodified metadata,
+ * and replay will apply _either_ commit or rollback.
+ */
+ EMetaBlob commit, rollback;
string type;
metareqid_t reqid;
int master;
ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE) { }
ESlaveUpdate(MDLog *mdlog, const char *s, metareqid_t ri, int mastermds, int o) :
- LogEvent(EVENT_SLAVEUPDATE), metablob(mdlog),
+ LogEvent(EVENT_SLAVEUPDATE), commit(mdlog), rollback(mdlog),
type(s),
reqid(ri),
master(mastermds),
out << " " << op;
out << " " << reqid;
out << " for mds" << master;
- out << metablob;
+ out << commit << " " << rollback;
}
void encode_payload(bufferlist& bl) {
::_encode(reqid, bl);
::_encode(master, bl);
::_encode(op, bl);
- metablob._encode(bl);
+ commit._encode(bl);
+ rollback._encode(bl);
}
void decode_payload(bufferlist& bl, int& off) {
::_decode(type, bl, off);
::_decode(reqid, bl, off);
::_decode(master, bl, off);
::_decode(op, bl, off);
- metablob._decode(bl, off);
+ commit._decode(bl, off);
+ rollback._decode(bl, off);
}
bool has_expired(MDS *mds);
::_decode(subtrees, bl, off);
}
- bool has_expired(MDS *mds);
- void expire(MDS *mds, Context *c);
+ //bool has_expired(MDS *mds);
+ //void expire(MDS *mds, Context *c);
void replay(MDS *mds);
};
metablob._decode(bl, off);
}
- bool has_expired(MDS *mds);
- void expire(MDS *mds, Context *c);
+ void update_segment();
void replay(MDS *mds);
};
#include "events/EAnchor.h"
#include "events/EAnchorClient.h"
+#include "LogSegment.h"
+
#include "MDS.h"
#include "MDLog.h"
#include "MDCache.h"
#include "AnchorTable.h"
#include "AnchorClient.h"
#include "IdAllocator.h"
+#include "Locker.h"
+
#include "config.h"
+#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log || l <= g_conf.debug_mds_log_expire) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
+#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log || l <= g_conf.debug_mds_log_expire) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
+
+
+// -----------------------
+// LogSegment
+
+// Context that re-invokes LogSegment::try_to_expire on a segment whose
+// previous expire attempt had to wait on outstanding work.
+// NOTE(review): the C_Gather returned by the retried try_to_expire call is
+// ignored here -- presumably its sub-contexts re-drive expiry; confirm
+// against the MDLog expire path.
+class C_MDL_RetryExpireSegment : public Context {
+public:
+ MDS *mds;       // MDS this segment belongs to (borrowed, not owned)
+ LogSegment *ls; // segment to retry expiring
+ C_MDL_RetryExpireSegment(MDS *m, LogSegment *l) : mds(m), ls(l) {}
+ void finish(int r) {
+ ls->try_to_expire(mds);
+ }
+};
+
+// Try to expire (retire) this log segment: initiate a flush/stabilize for
+// every piece of state the segment still pins (dirty dirfrags/dentries/
+// inodes, scattered dirlock mtimes, rejournaled open files, slave updates,
+// idalloc/clientmap/anchortable versions, and unacked anchor transactions).
+// Returns 0 if nothing is outstanding (segment can be trimmed now), else a
+// C_Gather whose sub-contexts complete as each item is flushed.  The caller
+// is responsible for retrying once the gather fires (see
+// C_MDL_RetryExpireSegment).
+C_Gather *LogSegment::try_to_expire(MDS *mds)
+{
+ C_Gather *gather = 0;
+
+ set<CDir*> commit;
+
+ dout(6) << "LogSegment(" << offset << ").try_to_expire" << dendl;
+
+ // commit dirs
+ // collect every dirfrag that holds dirty state journaled in this segment;
+ // using a set de-duplicates dirs reached via multiple dentries/inodes.
+ for (xlist<CDir*>::iterator p = dirty_dirfrags.begin(); !p.end(); ++p)
+ commit.insert(*p);
+ for (xlist<CDentry*>::iterator p = dirty_dentries.begin(); !p.end(); ++p)
+ commit.insert((*p)->get_dir());
+ for (xlist<CInode*>::iterator p = dirty_inodes.begin(); !p.end(); ++p)
+ commit.insert((*p)->get_parent_dn()->get_dir());
+
+ if (!commit.empty()) {
+ if (!gather) gather = new C_Gather;
+
+ for (set<CDir*>::iterator p = commit.begin();
+ p != commit.end();
+ ++p) {
+ CDir *dir = *p;
+ if (dir->can_auth_pin()) {
+ dout(15) << "try_to_expire committing " << *dir << dendl;
+ dir->commit(0, gather->new_sub());
+ } else {
+ // frozen/freezing dir: wait for unfreeze, then the retry will commit it
+ dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl;
+ dir->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub());
+ }
+ }
+ }
+
+ // dirty non-auth mtimes
+ // mtime updates live in the scattered dirlock; get the lock back to a
+ // stable/unscattered state so the auth journals the final mtime.
+ for (xlist<CInode*>::iterator p = dirty_inode_mtimes.begin(); !p.end(); ++p) {
+ CInode *in = *p;
+ dout(10) << "try_to_expire waiting for dirlock mtime flush on " << *in << dendl;
+ if (!gather) gather = new C_Gather;
+
+ if (in->is_ambiguous_auth()) {
+ dout(10) << " waiting for single auth on " << *in << dendl;
+ in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, gather->new_sub());
+ } else if (in->is_auth()) {
+ dout(10) << " i'm auth, unscattering dirlock on " << *in << dendl;
+ assert(in->is_replicated()); // hrm!
+ mds->locker->scatter_lock(&in->dirlock);
+ in->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub());
+ } else {
+ dout(10) << " i'm a replica, requesting dirlock unscatter of " << *in << dendl;
+ mds->locker->scatter_try_unscatter(&in->dirlock, gather->new_sub());
+ }
+ //(*p)->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub());
+ }
+
+ // open files
+ // re-journal EOpen records for still-capped inodes so a newer segment
+ // carries them and this one can be dropped.
+ if (!open_files.empty()) {
+ assert(!mds->mdlog->is_capped()); // hmm FIXME
+ for (xlist<CInode*>::iterator p = open_files.begin(); !p.end(); ++p) {
+ dout(20) << "try_to_expire requeueing open file " << **p << dendl;
+ mds->server->queue_journal_open(*p);
+ }
+ if (!gather) gather = new C_Gather;
+ mds->server->add_journal_open_waiter(gather->new_sub());
+ mds->server->maybe_journal_opens();
+ dout(10) << "try_to_expire waiting for open files to rejournal" << dendl;
+ }
+
+ // slave updates
+ // each unresolved slave update parks a waiter that fires on commit/rollback
+ for (xlist<MDSlaveUpdate*>::iterator p = slave_updates.begin(); !p.end(); ++p) {
+ MDSlaveUpdate *su = *p;
+ dout(10) << "try_to_expire waiting on slave update " << su << dendl;
+ assert(su->waiter == 0);
+ if (!gather) gather = new C_Gather;
+ su->waiter = gather->new_sub();
+ }
+
+ // idalloc
+ // segment needs the inode allocation table committed at least to allocv
+ if (allocv > mds->idalloc->get_committed_version()) {
+ dout(10) << "try_to_expire saving idalloc table, need " << allocv
+ << ", committed is " << mds->idalloc->get_committed_version()
+ << " (" << mds->idalloc->get_committing_version() << ")"
+ << dendl;
+ if (!gather) gather = new C_Gather;
+ mds->idalloc->save(gather->new_sub(), allocv);
+ }
+
+ // clientmap
+ if (clientmapv > mds->clientmap.get_committed()) {
+ dout(10) << "try_to_expire saving clientmap, need " << clientmapv
+ << ", committed is " << mds->clientmap.get_committed()
+ << " (" << mds->clientmap.get_committing() << ")"
+ << dendl;
+ if (!gather) gather = new C_Gather;
+ mds->clientmap.save(gather->new_sub(), clientmapv);
+ }
+
+ // pending commit atids
+ // anchor transactions journaled here must be acked before we can trim
+ for (hash_set<version_t>::iterator p = pending_commit_atids.begin();
+ p != pending_commit_atids.end();
+ ++p) {
+ if (!gather) gather = new C_Gather;
+ assert(!mds->anchorclient->has_committed(*p));
+ dout(10) << "try_to_expire anchor transaction " << *p
+ << " pending commit (not yet acked), waiting" << dendl;
+ mds->anchorclient->wait_for_ack(*p, gather->new_sub());
+ }
+
+ // anchortable
+ if (anchortablev > mds->anchortable->get_committed_version()) {
+ dout(10) << "try_to_expire waiting for anchor table to save, need " << anchortablev << dendl;
+ if (!gather) gather = new C_Gather;
+ mds->anchortable->save(gather->new_sub());
+ }
+
+ // FIXME client requests...?
+ // audit handling of anchor transactions?
+
+ if (gather) {
+ dout(6) << "LogSegment(" << offset << ").try_to_expire waiting" << dendl;
+ } else {
+ dout(6) << "LogSegment(" << offset << ").try_to_expire success" << dendl;
+ }
+ return gather;
+}
+
+
+
+#undef dout
+#undef derr
#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
// EMetaBlob
EMetaBlob::EMetaBlob(MDLog *mdlog) :
- last_subtree_map(mdlog->get_last_subtree_map_offset()),
+ last_subtree_map(mdlog->get_last_segment_offset()),
my_offset(mdlog->get_write_pos())
{
}
*/
bool EMetaBlob::has_expired(MDS *mds)
{
+/*
// examine dirv's for my lumps
for (map<dirfrag_t,dirlump>::iterator lp = lump_map.begin();
lp != lump_map.end();
}
+ */
return true; // all dirlumps expired, etc.
}
void EMetaBlob::expire(MDS *mds, Context *c)
{
+/*
map<CDir*,version_t> commit; // dir -> version needed
list<CDir*> waitfor_export;
list<CDir*> waitfor_import;
else
// pbly about to export|split|merge.
// just wait for it to unfreeze, then retry
- p->first->add_waiter(CDir::WAIT_AUTHPINNABLE, gather->new_sub());
+ p->first->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub());
}
for (list<CDir*>::iterator p = waitfor_export.begin();
p != waitfor_export.end();
dout(10) << "my gather finsher is " << gather << " with " << gather->get_num() << dendl;
+*/
+}
+
+// Record, on the segment this blob was journaled in, the table versions the
+// segment will need flushed before it can expire.  Most categories are
+// tracked elsewhere (noted below); only the idalloc version is set here.
+void EMetaBlob::update_segment(LogSegment *ls)
+{
+ // atids?
+ //for (list<version_t>::iterator p = atids.begin(); p != atids.end(); ++p)
+ // ls->pending_commit_atids[*p] = ls;
+ // -> handled directly by AnchorClient
+
+ // dirty inode mtimes
+ // -> handled directly by Server.cc, replay()
+
+ // alloc table update?
+ // if this blob allocated inos, the segment pins the alloc table at this version
+ if (!allocated_inos.empty())
+ ls->allocv = alloc_tablev;
+
+ // truncated inodes
+ // -> handled directly by Server.cc
+
+ // client requests
+ // note the newest request per client
+ //if (!client_reqs.empty())
+ // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
}
-void EMetaBlob::replay(MDS *mds)
+void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
{
dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << dendl;
+ if (!logseg) logseg = _segment;
+ assert(logseg);
+
// walk through my dirs (in order!)
for (list<dirfrag_t>::iterator lp = lump_order.begin();
lp != lump_order.end();
}
dir->set_version( lump.dirv );
if (lump.is_dirty())
- dir->_mark_dirty();
+ dir->_mark_dirty(logseg);
if (lump.is_complete())
dir->mark_complete();
if (!dn) {
dn = dir->add_null_dentry(p->dn);
dn->set_version(p->dnv);
- if (p->dirty) dn->_mark_dirty();
+ if (p->dirty) dn->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay added " << *dn << dendl;
} else {
dn->set_version(p->dnv);
- if (p->dirty) dn->_mark_dirty();
+ if (p->dirty) dn->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay had " << *dn << dendl;
}
//assert(0); // hrm! fallout from sloppy unlink? or? hmmm FIXME investigate further
}
dir->link_primary_inode(dn, in);
- if (p->dirty) in->_mark_dirty();
+ if (p->dirty) in->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay added " << *in << dendl;
} else {
if (dn->get_inode() != in && in->get_parent_dn()) {
in->inode = p->inode;
in->dirfragtree = p->dirfragtree;
if (in->inode.is_symlink()) in->symlink = p->symlink;
- if (p->dirty) in->_mark_dirty();
+ if (p->dirty) in->_mark_dirty(logseg);
if (dn->get_inode() != in) {
dir->link_primary_inode(dn, in);
dout(10) << "EMetaBlob.replay linked " << *in << dendl;
if (!dn) {
dn = dir->add_remote_dentry(p->dn, p->ino, p->d_type);
dn->set_version(p->dnv);
- if (p->dirty) dn->_mark_dirty();
+ if (p->dirty) dn->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay added " << *dn << dendl;
} else {
if (!dn->is_null()) {
}
dn->set_remote(p->ino, p->d_type);
dn->set_version(p->dnv);
- if (p->dirty) dn->_mark_dirty();
+ if (p->dirty) dn->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay had " << *dn << dendl;
}
}
if (!dn) {
dn = dir->add_null_dentry(p->dn);
dn->set_version(p->dnv);
- if (p->dirty) dn->_mark_dirty();
+ if (p->dirty) dn->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay added " << *dn << dendl;
} else {
if (!dn->is_null()) {
dir->unlink_inode(dn);
}
dn->set_version(p->dnv);
- if (p->dirty) dn->_mark_dirty();
+ if (p->dirty) dn->_mark_dirty(logseg);
dout(10) << "EMetaBlob.replay had " << *dn << dendl;
}
}
p != atids.end();
++p) {
dout(10) << "EMetaBlob.replay noting anchor transaction " << *p << dendl;
- mds->anchorclient->got_journaled_agree(*p);
+ mds->anchorclient->got_journaled_agree(*p, logseg);
}
// dirtied inode mtimes
CInode *in = mds->mdcache->get_inode(p->first);
dout(10) << "EMetaBlob.replay setting dirlock updated flag on " << *in << dendl;
in->dirlock.set_updated();
+ logseg->dirty_inode_mtimes.push_back(&in->xlist_dirty_inode_mtime);
}
// allocated_inos
inodeno_t ino = mds->idalloc->alloc_id();
assert(ino == *p); // this should match.
-
- assert(alloc_tablev == mds->idalloc->get_version());
}
+ assert(alloc_tablev == mds->idalloc->get_version());
}
}
// truncated inodes
- for (list< pair<inode_t,off_t> >::iterator p = truncated_inodes.begin();
+ for (list< triple<inodeno_t,off_t,off_t> >::iterator p = truncated_inodes.begin();
p != truncated_inodes.end();
++p) {
- dout(10) << "EMetaBlob.replay will purge truncated inode " << p->first.ino
- << " to " << p->second << dendl;
- mds->mdcache->add_recovered_purge(p->first, p->second);
+ CInode *in = mds->mdcache->get_inode(p->first);
+ assert(in);
+ dout(10) << "EMetaBlob.replay will purge truncated "
+ << p->third << " -> " << p->second
+ << " on " << *in << dendl;
+ mds->mdcache->add_recovered_purge(in, p->second, p->third, logseg);
}
// client requests
p != client_reqs.end();
++p)
mds->clientmap.add_completed_request(*p);
+
+
+ // update segment
+ update_segment(logseg);
}
// -----------------------
// ESession
-bool ESession::has_expired(MDS *mds)
-{
- if (mds->clientmap.get_committed() >= cmapv) {
- dout(10) << "ESession.has_expired newer clientmap " << mds->clientmap.get_committed()
- << " >= " << cmapv << " has committed" << dendl;
- return true;
- } else if (mds->clientmap.get_committing() >= cmapv) {
- dout(10) << "ESession.has_expired newer clientmap " << mds->clientmap.get_committing()
- << " >= " << cmapv << " is still committing" << dendl;
- return false;
- } else {
- dout(10) << "ESession.has_expired clientmap " << mds->clientmap.get_version()
- << " > " << cmapv << ", need to save" << dendl;
- return false;
- }
-}
-void ESession::expire(MDS *mds, Context *c)
-{
- dout(10) << "ESession.expire saving clientmap" << dendl;
- mds->clientmap.save(c, cmapv);
+// note on the owning segment the clientmap version that must be committed
+// before this segment can expire (replaces the old has_expired/expire pair)
+void ESession::update_segment()
+{
+ _segment->clientmapv = cmapv;
}
void ESession::replay(MDS *mds)
// -----------------------
// EAnchor
-bool EAnchor::has_expired(MDS *mds)
+void EAnchor::update_segment()
{
- version_t cv = mds->anchortable->get_committed_version();
- if (cv < version) {
- dout(10) << "EAnchor.has_expired v " << version << " > " << cv
- << ", still dirty" << dendl;
- return false; // still dirty
- } else {
- dout(10) << "EAnchor.has_expired v " << version << " <= " << cv
- << ", already flushed" << dendl;
- return true; // already flushed
- }
-}
-
-void EAnchor::expire(MDS *mds, Context *c)
-{
- dout(10) << "EAnchor.expire saving anchor table" << dendl;
- mds->anchortable->save(c);
+ _segment->anchortablev = version;
}
void EAnchor::replay(MDS *mds)
// EAnchorClient
-bool EAnchorClient::has_expired(MDS *mds)
-{
- return true;
-}
-
-void EAnchorClient::expire(MDS *mds, Context *c)
-{
- assert(0);
-}
-
void EAnchorClient::replay(MDS *mds)
{
dout(10) << " EAnchorClient.replay op " << op << " atid " << atid << dendl;
// -----------------------
// EUpdate
-bool EUpdate::has_expired(MDS *mds)
-{
- return metablob.has_expired(mds);
-}
-
-void EUpdate::expire(MDS *mds, Context *c)
+void EUpdate::update_segment()
{
- metablob.expire(mds, c);
+ metablob.update_segment(_segment);
}
void EUpdate::replay(MDS *mds)
{
- metablob.replay(mds);
+ metablob.replay(mds, _segment);
}
// ------------------------
// EOpen
-bool EOpen::has_expired(MDS *mds)
-{
- for (list<inodeno_t>::iterator p = inos.begin(); p != inos.end(); ++p) {
- CInode *in = mds->mdcache->get_inode(*p);
- if (in &&
- in->is_any_caps() &&
- !(in->last_open_journaled > get_start_off() ||
- in->last_open_journaled == 0)) {
- dout(10) << "EOpen.has_expired still refer to caps on " << *in << dendl;
- return false;
- }
- }
- return true;
-}
-
-void EOpen::expire(MDS *mds, Context *c)
+void EOpen::update_segment()
{
- dout(10) << "EOpen.expire " << dendl;
-
- if (mds->mdlog->is_capped()) {
- dout(0) << "uh oh, log is capped, but i have unexpired opens." << dendl;
- assert(0);
- }
-
- for (list<inodeno_t>::iterator p = inos.begin(); p != inos.end(); ++p) {
- CInode *in = mds->mdcache->get_inode(*p);
- if (!in) continue;
- if (!in->is_any_caps()) continue;
-
- dout(10) << "EOpen.expire " << in->ino()
- << " last_open_journaled " << in->last_open_journaled << dendl;
-
- mds->server->queue_journal_open(in);
- }
- mds->server->add_journal_open_waiter(c);
- mds->server->maybe_journal_opens();
+ // ??
}
void EOpen::replay(MDS *mds)
{
dout(10) << "EOpen.replay " << dendl;
- metablob.replay(mds);
+ metablob.replay(mds, _segment);
}
// -----------------------
// ESlaveUpdate
-bool ESlaveUpdate::has_expired(MDS *mds)
-{
- switch (op) {
- case ESlaveUpdate::OP_PREPARE:
- if (mds->mdcache->ambiguous_slave_updates.count(reqid) == 0) {
- dout(10) << "ESlaveUpdate.has_expired prepare " << reqid << " for mds" << master
- << ": haven't yet seen commit|rollback" << dendl;
- return false;
- }
- else if (mds->mdcache->ambiguous_slave_updates[reqid]) {
- dout(10) << "ESlaveUpdate.has_expired prepare " << reqid << " for mds" << master
- << ": committed, checking metablob" << dendl;
- bool exp = metablob.has_expired(mds);
- if (exp)
- mds->mdcache->ambiguous_slave_updates.erase(reqid);
- return exp;
- }
- else {
- dout(10) << "ESlaveUpdate.has_expired prepare " << reqid << " for mds" << master
- << ": aborted" << dendl;
- mds->mdcache->ambiguous_slave_updates.erase(reqid);
- return true;
- }
-
- case ESlaveUpdate::OP_COMMIT:
- case ESlaveUpdate::OP_ROLLBACK:
- if (mds->mdcache->waiting_for_slave_update_commit.count(reqid)) {
- dout(10) << "ESlaveUpdate.has_expired "
- << ((op == ESlaveUpdate::OP_COMMIT) ? "commit ":"rollback ")
- << reqid << " for mds" << master
- << ": noting commit, kicking prepare waiter" << dendl;
- mds->mdcache->ambiguous_slave_updates[reqid] = (op == ESlaveUpdate::OP_COMMIT);
- mds->mdcache->waiting_for_slave_update_commit[reqid]->finish(0);
- delete mds->mdcache->waiting_for_slave_update_commit[reqid];
- mds->mdcache->waiting_for_slave_update_commit.erase(reqid);
- } else {
- dout(10) << "ESlaveUpdate.has_expired "
- << ((op == ESlaveUpdate::OP_COMMIT) ? "commit ":"rollback ")
- << reqid << " for mds" << master
- << ": no prepare waiter, ignoring" << dendl;
- }
- return true;
-
- default:
- assert(0);
- return false;
- }
-}
-
-void ESlaveUpdate::expire(MDS *mds, Context *c)
-{
- assert(op == ESlaveUpdate::OP_PREPARE);
-
- if (mds->mdcache->ambiguous_slave_updates.count(reqid) == 0) {
- // wait
- dout(10) << "ESlaveUpdate.expire prepare " << reqid << " for mds" << master
- << ": waiting for commit|rollback" << dendl;
- mds->mdcache->waiting_for_slave_update_commit[reqid] = c;
- } else {
- // we committed.. expire the metablob
- assert(mds->mdcache->ambiguous_slave_updates[reqid] == true);
- dout(10) << "ESlaveUpdate.expire prepare " << reqid << " for mds" << master
- << ": waiting for metablob to expire" << dendl;
- metablob.expire(mds, c);
- }
-}
-
void ESlaveUpdate::replay(MDS *mds)
{
switch (op) {
case ESlaveUpdate::OP_PREPARE:
// FIXME: horribly inefficient copy; EMetaBlob needs a swap() or something
dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds" << master
- << ": saving blob for later commit" << dendl;
+ << ": saving blobs for later commit" << dendl;
assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid) == 0);
- mds->mdcache->uncommitted_slave_updates[master][reqid] = metablob;
+ commit._segment = _segment; // may need this later
+ rollback._segment = _segment; // may need this later
+ mds->mdcache->uncommitted_slave_updates[master][reqid] =
+ MDSlaveUpdate(commit, rollback, _segment->slave_updates);
break;
case ESlaveUpdate::OP_COMMIT:
if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) {
dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master
- << ": applying previously saved blob" << dendl;
- mds->mdcache->uncommitted_slave_updates[master][reqid].replay(mds);
+ << ": applying commit blob" << dendl;
+ mds->mdcache->uncommitted_slave_updates[master][reqid].commit.replay(mds, _segment);
mds->mdcache->uncommitted_slave_updates[master].erase(reqid);
} else {
dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master
- << ": ignoring, no previously saved blob" << dendl;
+ << ": ignoring, no previously saved blobs" << dendl;
}
break;
case ESlaveUpdate::OP_ROLLBACK:
if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) {
dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master
- << ": discarding previously saved blob" << dendl;
+ << ": applying rollback blob" << dendl;
assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid));
+ mds->mdcache->uncommitted_slave_updates[master][reqid].rollback.replay(mds, _segment);
mds->mdcache->uncommitted_slave_updates[master].erase(reqid);
} else {
dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master
- << ": ignoring, no previously saved blob" << dendl;
+ << ": ignoring, no previously saved blobs" << dendl;
}
break;
// -----------------------
// ESubtreeMap
-bool ESubtreeMap::has_expired(MDS *mds)
-{
- assert(!mds->mdlog->subtree_maps.empty());
- set<off_t>::iterator p = mds->mdlog->subtree_maps.begin();
- off_t first = *p;
- if (get_start_off() != first) {
- dout(10) << "ESubtreeMap.has_expired -- not the oldest segment" << dendl;
- return false;
- }
-
- // i am the oldest.
-
- // capped and last event?
- if (mds->mdlog->is_capped() &&
- mds->mdlog->subtree_maps.size() == 1 &&
- (mds->mdlog->trimming.empty() ||
- (mds->mdlog->trimming.size() == 1 &&
- mds->mdlog->trimming.begin()->second == this))) {
- dout(10) << "ESubtreeMap.has_expired -- capped and last one" << dendl;
- return true;
- }
-
- p++;
- if (p == mds->mdlog->subtree_maps.end()) {
- dout(10) << "ESubtreeMap.has_expired -- only segment" << dendl;
- return false;
- }
- off_t next = *p;
-
- if (mds->mdlog->get_read_pos() < next) {
- dout(10) << "ESubtreeMap.has_expired -- haven't read this segment, read pos "
- << mds->mdlog->get_read_pos() << " < next map at " << next
- << dendl;
- return false;
- }
-
- map<off_t,LogEvent*>::iterator trimp = mds->mdlog->trimming.begin();
- assert(trimp->first == get_start_off());
- trimp++;
- if (trimp != mds->mdlog->trimming.end() &&
- trimp->first < next) {
- dout(10) << "ESubtreeMap.has_expired -- segment still trimming at " << trimp->first << dendl;
- return false;
- }
-
- dout(10) << "ESubtreeMap.has_expired -- segment is empty" << dendl;
- return true;
-}
-
-void ESubtreeMap::expire(MDS *mds, Context *c)
-{
- dout(10) << "ESubtreeMap.has_expire -- waiting for a newer map to be written (or for shutdown)" << dendl;
- mds->mdlog->subtree_map_expire_waiters[get_start_off()].push_back(c);
-}
-
void ESubtreeMap::replay(MDS *mds)
{
- // note location
- mds->mdlog->subtree_maps.insert(get_start_off());
-
+ // suck up the subtree map?
if (mds->mdcache->is_subtrees()) {
dout(10) << "ESubtreeMap.replay -- ignoring, already have import map" << dendl;
- } else {
- dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;
-
- // first, stick the spanning tree in my cache
- //metablob.print(cout);
- metablob.replay(mds);
-
- // restore import/export maps
- for (map<dirfrag_t, list<dirfrag_t> >::iterator p = subtrees.begin();
- p != subtrees.end();
- ++p) {
- CDir *dir = mds->mdcache->get_dirfrag(p->first);
- mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid());
- }
+ return;
}
+
+ dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;
+
+ // first, stick the spanning tree in my cache
+ //metablob.print(cout);
+ metablob.replay(mds, _segment);
+
+ // restore import/export maps
+ for (map<dirfrag_t, list<dirfrag_t> >::iterator p = subtrees.begin();
+ p != subtrees.end();
+ ++p) {
+ CDir *dir = mds->mdcache->get_dirfrag(p->first);
+ mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid());
+ }
+
mds->mdcache->show_subtrees();
}
// -----------------------
// EFragment
-bool EFragment::has_expired(MDS *mds)
-{
- return metablob.has_expired(mds);
-}
-
-void EFragment::expire(MDS *mds, Context *c)
-{
- metablob.expire(mds, c);
-}
-
void EFragment::replay(MDS *mds)
{
dout(10) << "EFragment.replay " << ino << " " << basefrag << " by " << bits << dendl;
list<Context*> waiters;
mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters);
- metablob.replay(mds);
+ metablob.replay(mds, _segment);
}
assert(0);
}
+void EPurgeFinish::update_segment()
+{
+ // ** update purge lists?
+}
+
void EPurgeFinish::replay(MDS *mds)
{
- dout(10) << "EPurgeFinish.replay " << ino << " to " << newsize << dendl;
- mds->mdcache->remove_recovered_purge(ino, newsize);
+ dout(10) << "EPurgeFinish.replay " << ino << " " << oldsize << " -> " << newsize << dendl;
+ CInode *in = mds->mdcache->get_inode(ino);
+ assert(in);
+ mds->mdcache->remove_recovered_purge(in, newsize, oldsize);
}
void EExport::replay(MDS *mds)
{
dout(10) << "EExport.replay " << base << dendl;
- metablob.replay(mds);
+ metablob.replay(mds, _segment);
CDir *dir = mds->mdcache->get_dirfrag(base);
assert(dir);
// -----------------------
// EImportStart
-bool EImportStart::has_expired(MDS *mds)
-{
- return metablob.has_expired(mds);
-}
-
-void EImportStart::expire(MDS *mds, Context *c)
-{
- dout(10) << "EImportStart.expire " << base << dendl;
- metablob.expire(mds, c);
-}
-
void EImportStart::replay(MDS *mds)
{
dout(10) << "EImportStart.replay " << base << dendl;
- metablob.replay(mds);
+ metablob.replay(mds, _segment);
// put in ambiguous import list
mds->mdcache->add_ambiguous_import(base, bounds);
#include <cassert>
#include "include/frag.h"
+#include "include/xlist.h"
#define MDS_REF_SET // define me for improved debug output, sanity checking
struct metareqid_t {
+ uint64_t tid;
int32_t client;
- tid_t tid;
- metareqid_t() : client(-1), tid(0) {}
- metareqid_t(int c, tid_t t) : client(c), tid(t) {}
+ int32_t _pad;
+ metareqid_t() : tid(0), client(-1), _pad(0) {}
+ metareqid_t(int c, tid_t t) : tid(t), client(c), _pad(0) {}
};
inline ostream& operator<<(ostream& out, const metareqid_t& r) {
struct dirfrag_t {
inodeno_t ino;
frag_t frag;
+ uint32_t _pad;
- dirfrag_t() { }
- dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
+ dirfrag_t() : ino(0), _pad(0) { }
+ dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f), _pad(0) { }
};
inline ostream& operator<<(ostream& out, const dirfrag_t df) {
return l.ino == r.ino && l.frag == r.frag;
}
+namespace __gnu_cxx {
+ template<> struct hash<dirfrag_t> {
+ size_t operator()(const dirfrag_t &df) const {
+ static rjhash<uint64_t> H;
+ static rjhash<uint32_t> I;
+ return H(df.ino) ^ I(df.frag);
+ }
+ };
+}
+
+
// ================================================================
dirfrag_t dirfrag;
string dname;
+ MDSCacheObjectInfo() : ino(0) {}
+
void _encode(bufferlist& bl) const {
::_encode(ino, bl);
::_encode(dirfrag, bl);
::_decode(dirfrag, bl, off);
::_decode(dname, bl, off);
}
+ void _decode(bufferlist::iterator& p) {
+ ::_decode_simple(ino, p);
+ ::_decode_simple(dirfrag, p);
+ ::_decode_simple(dname, p);
+ }
};
const static int PIN_LOCK = -1002;
const static int PIN_REQUEST = -1003;
const static int PIN_WAITER = 1004;
- const static int PIN_DIRTYSCATTERED = 1005;
+ const static int PIN_DIRTYSCATTERED = 1005; // make this neg if we start using multiple scatterlocks?
static const int PIN_AUTHPIN = 1006;
+ static const int PIN_PTRWAITER = -1007;
+ const static int PIN_TEMPEXPORTING = 1008; // temp pin between encode_ and finish_export
const char *generic_pin_name(int p) {
switch (p) {
case PIN_WAITER: return "waiter";
case PIN_DIRTYSCATTERED: return "dirtyscattered";
case PIN_AUTHPIN: return "authpin";
+ case PIN_PTRWAITER: return "ptrwaiter";
+ case PIN_TEMPEXPORTING: return "tempexporting";
default: assert(0); return 0;
}
}
// -- wait --
const static int WAIT_SINGLEAUTH = (1<<30);
- const static int WAIT_AUTHPINNABLE = (1<<29);
- const static int WAIT_UNFREEZE = WAIT_AUTHPINNABLE;
+ const static int WAIT_UNFREEZE = (1<<29); // pka AUTHPINNABLE
// ============================================
// cons
public:
MDSCacheObject() :
- state(0),
+ state(0),
ref(0),
replica_nonce(0) {}
virtual ~MDSCacheObject() {}
unsigned state; // state bits
public:
- unsigned get_state() { return state; }
+ unsigned get_state() const { return state; }
+ unsigned state_test(unsigned mask) const { return (state & mask); }
void state_clear(unsigned mask) { state &= ~mask; }
void state_set(unsigned mask) { state |= mask; }
- unsigned state_test(unsigned mask) { return state & mask; }
void state_reset(unsigned s) { state = s; }
- bool is_auth() { return state_test(STATE_AUTH); }
- bool is_dirty() { return state_test(STATE_DIRTY); }
- bool is_clean() { return !is_dirty(); }
- bool is_rejoining() { return state_test(STATE_REJOINING); }
+ bool is_auth() const { return state_test(STATE_AUTH); }
+ bool is_dirty() const { return state_test(STATE_DIRTY); }
+ bool is_clean() const { return !is_dirty(); }
+ bool is_rejoining() const { return state_test(STATE_REJOINING); }
// --------------------------------------------
// authority
virtual void add_lock_waiter(int type, int mask, Context *c) { assert(0); }
virtual bool is_lock_waiting(int type, int mask) { assert(0); return false; }
+ virtual void clear_dirty_scattered(int type) { assert(0); }
// ---------------------------------------------
// ordering
class MClientMount : public Message {
public:
entity_addr_t addr;
- int instance; // on this node
+ int32_t instance; // on this node
MClientMount() : Message(MSG_CLIENT_MOUNT) { }
MClientMount(entity_addr_t a, int i = 0) :
#define __MCLIENTREPLY_H
#include "include/types.h"
-
+#include "include/encodable.h"
#include "MClientRequest.h"
#include "msg/Message.h"
*
*/
-class InodeStat {
- public:
+struct DirStat {
+ // mds distribution hints
+ frag_t frag;
+ int auth;
+ set<int> dist;
+ bool is_rep;
+
+ DirStat() {}
+ DirStat(bufferlist::iterator& p) {
+ _decode(p);
+ }
+
+ void _decode(bufferlist::iterator& p) {
+ ::_decode_simple(frag, p);
+ ::_decode_simple(auth, p);
+ ::_decode_simple(dist, p);
+ ::_decode_simple(is_rep, p);
+ }
+
+ static void _encode(bufferlist& bl, CDir *dir, int whoami) {
+ frag_t frag = dir->get_frag();
+ int auth;
+ set<int> dist;
+ bool is_rep;
+
+ auth = dir->get_dir_auth().first;
+ if (dir->is_auth())
+ dir->get_dist_spec(dist, whoami);
+ is_rep = dir->is_rep();
+
+ ::_encode_simple(frag, bl);
+ ::_encode_simple(auth, bl);
+ ::_encode_simple(dist, bl);
+ ::_encode_simple(is_rep, bl);
+ }
+};
+
+struct InodeStat {
inode_t inode;
string symlink; // symlink content (if symlink)
fragtree_t dirfragtree;
uint32_t mask;
- // mds distribution hints
- map<frag_t,int> dirfrag_auth;
- map<frag_t,set<int> > dirfrag_dist;
- set<frag_t> dirfrag_rep;
-
public:
InodeStat() {}
- InodeStat(CInode *in, int whoami) :
- inode(in->inode),
- mask(STAT_MASK_INO|STAT_MASK_TYPE|STAT_MASK_BASE)
- {
+ InodeStat(bufferlist::iterator& p) {
+ _decode(p);
+ }
+
+ void _decode(bufferlist::iterator &p) {
+ ::_decode_simple(mask, p);
+ ::_decode_simple(inode, p);
+ ::_decode_simple(symlink, p);
+ dirfragtree._decode(p);
+ }
+
+ static void _encode(bufferlist &bl, CInode *in) {
+ int mask = STAT_MASK_INO|STAT_MASK_TYPE|STAT_MASK_BASE;
+
// mask
if (in->authlock.can_rdlock(0)) mask |= STAT_MASK_AUTH;
if (in->linklock.can_rdlock(0)) mask |= STAT_MASK_LINK;
if (in->filelock.can_rdlock(0)) mask |= STAT_MASK_FILE;
-
- // symlink content?
- if (in->is_symlink())
- symlink = in->symlink;
-
- // dirfragtree
- dirfragtree = in->dirfragtree;
-
- // dirfrag info
- list<CDir*> ls;
- in->get_dirfrags(ls);
- for (list<CDir*>::iterator p = ls.begin();
- p != ls.end();
- ++p) {
- CDir *dir = *p;
- dirfrag_auth[dir->dirfrag().frag] = dir->get_dir_auth().first;
- if (dir->is_auth())
- dir->get_dist_spec(dirfrag_dist[dir->dirfrag().frag], whoami);
- if (dir->is_rep())
- dirfrag_rep.insert(dir->dirfrag().frag);
- }
- }
-
- void _encode(bufferlist &bl) {
- ::_encode(mask, bl);
- ::_encode(inode, bl);
- ::_encode(dirfrag_auth, bl);
- ::_encode(dirfrag_dist, bl);
- ::_encode(dirfrag_rep, bl);
- ::_encode(symlink, bl);
- dirfragtree._encode(bl);
+
+ ::_encode_simple(mask, bl);
+ ::_encode_simple(in->inode, bl);
+ ::_encode_simple(in->symlink, bl);
+ in->dirfragtree._encode(bl);
}
- void _decode(bufferlist &bl, int& off) {
- ::_decode(mask, bl, off);
- ::_decode(inode, bl, off);
- ::_decode(dirfrag_auth, bl, off);
- ::_decode(dirfrag_dist, bl, off);
- ::_decode(dirfrag_rep, bl, off);
- ::_decode(symlink, bl, off);
- dirfragtree._decode(bl, off);
- }
};
class MClientReply : public Message {
// reply data
- struct {
+ struct st_ {
long tid;
int op;
int result; // error code
unsigned char file_caps; // for open
long file_caps_seq;
uint64_t file_data_version; // for client buffercache consistency
-
- int _num_trace_in;
- int _dir_size;
} st;
string path;
+
list<InodeStat*> trace_in;
+ list<DirStat*> trace_dir;
list<string> trace_dn;
+ bufferlist trace_bl;
- list<string> dir_dn;
+ DirStat *dir_dir;
list<InodeStat*> dir_in;
+ list<string> dir_dn;
+ bufferlist dir_bl;
public:
long get_tid() { return st.tid; }
inodeno_t get_ino() { return trace_in.back()->inode.ino; }
const inode_t& get_inode() { return trace_in.back()->inode; }
- const list<InodeStat*>& get_trace_in() { return trace_in; }
- const list<string>& get_trace_dn() { return trace_dn; }
-
- const list<InodeStat*>& get_dir_in() { return dir_in; }
- const list<string>& get_dir_dn() { return dir_dn; }
-
unsigned char get_file_caps() { return st.file_caps; }
long get_file_caps_seq() { return st.file_caps_seq; }
uint64_t get_file_data_version() { return st.file_data_version; }
void set_file_caps_seq(long s) { st.file_caps_seq = s; }
void set_file_data_version(uint64_t v) { st.file_data_version = v; }
- MClientReply() {};
+ MClientReply() : dir_dir(0) {};
MClientReply(MClientRequest *req, int result = 0) :
- Message(MSG_CLIENT_REPLY) {
+ Message(MSG_CLIENT_REPLY), dir_dir(0) {
memset(&st, 0, sizeof(st));
this->st.tid = req->get_tid();
this->st.op = req->get_op();
this->path = req->get_path();
this->st.result = result;
-
- st._dir_size = 0;
- st._num_trace_in = 0;
}
virtual ~MClientReply() {
list<InodeStat*>::iterator it;
// serialization
virtual void decode_payload() {
- int off = 0;
- payload.copy(off, sizeof(st), (char*)&st);
- off += sizeof(st);
-
- _decode(path, payload, off);
-
- for (int i=0; i<st._num_trace_in; ++i) {
- if (i) {
- string ref_dn;
- ::_decode(ref_dn, payload, off);
- trace_dn.push_back(ref_dn);
- }
- InodeStat *ci = new InodeStat;
- ci->_decode(payload, off);
- trace_in.push_back(ci);
- }
-
- // dir contents
- ::_decode(dir_dn, payload, off);
- for (int i=0; i<st._dir_size; ++i) {
- InodeStat *ci = new InodeStat;
- ci->_decode(payload, off);
- dir_in.push_back(ci);
- }
+ bufferlist::iterator p = payload.begin();
+ ::_decode_simple(st, p);
+ ::_decode_simple(path, p);
+ ::_decode_simple(trace_bl, p);
+ ::_decode_simple(dir_bl, p);
+ assert(p.end());
}
virtual void encode_payload() {
- payload.append((char*)&st, sizeof(st));
- _encode(path, payload);
-
- // trace
- list<string>::iterator pdn = trace_dn.begin();
- list<InodeStat*>::iterator pin;
- for (pin = trace_in.begin();
- pin != trace_in.end();
- ++pin) {
- if (pin != trace_in.begin()) {
- ::_encode(*pdn, payload);
- ++pdn;
- }
- (*pin)->_encode(payload);
- }
-
- // dir contents
- ::_encode(dir_dn, payload);
- for (pin = dir_in.begin();
- pin != dir_in.end();
- ++pin)
- (*pin)->_encode(payload);
+ ::_encode_simple(st, payload);
+ ::_encode_simple(path, payload);
+ ::_encode_simple(trace_bl, payload);
+ ::_encode_simple(dir_bl, payload);
}
- // builders
- /*
- void add_dir_item(string& dn, InodeStat *in) {
- dir_dn.push_back(dn);
- dir_in.push_back(in);
- ++st._dir_size;
- }*/
- void take_dir_items(list<string>& dnls,
- list<InodeStat*>& inls,
- int num) {
- dir_dn.swap(dnls);
- dir_in.swap(inls);
- st._dir_size = num;
+
+ // dir contents
+ void take_dir_items(bufferlist& bl) {
+ dir_bl.claim(bl);
}
- /*
- void copy_dir_items(const list<InodeStat*>& inls,
- const list<string>& dnls) {
- list<string>::const_iterator pdn = dnls.begin();
- list<InodeStat*>::const_iterator pin = inls.begin();
- while (pin != inls.end()) {
- // copy!
- InodeStat *i = new InodeStat;
- *i = **pin;
- dir_in.push_back(i);
- dir_dn.push_back(*pdn);
- ++pin;
- ++pdn;
- ++st._dir_size;
+ void _decode_dir() {
+ bufferlist::iterator p = dir_bl.begin();
+ dir_dir = new DirStat(p);
+ while (!p.end()) {
+ string dn;
+ ::_decode_simple(dn, p);
+ dir_dn.push_back(dn);
+ dir_in.push_back(new InodeStat(p));
}
}
- */
+ const list<InodeStat*>& get_dir_in() {
+ if (dir_in.empty() && dir_bl.length()) _decode_dir();
+ return dir_in;
+ }
+ const list<string>& get_dir_dn() {
+ if (dir_dn.empty() && dir_bl.length()) _decode_dir();
+ return dir_dn;
+ }
+ const DirStat* get_dir_dir() {
+ return dir_dir;
+ }
+
+
+ // trace
void set_trace_dist(CInode *in, int whoami) {
- st._num_trace_in = 0;
+ // inode, dentry, dir, ..., inode
while (in) {
- // add this inode to trace, along with referring dentry name
- if (in->get_parent_dn())
- trace_dn.push_front(in->get_parent_dn()->get_name());
- trace_in.push_front(new InodeStat(in, whoami));
- ++st._num_trace_in;
-
- in = in->get_parent_inode();
+ InodeStat::_encode(trace_bl, in);
+ CDentry *dn = in->get_parent_dn();
+ if (!dn) break;
+ ::_encode_simple(in->get_parent_dn()->get_name(), trace_bl);
+ DirStat::_encode(trace_bl, dn->get_dir(), whoami);
+ in = dn->get_dir()->get_inode();
}
}
+ void _decode_trace() {
+ bufferlist::iterator p = trace_bl.begin();
+ while (!p.end()) {
+ // inode
+ trace_in.push_front(new InodeStat(p));
+ if (!p.end()) {
+ // dentry
+ string ref_dn;
+ ::_decode_simple(ref_dn, p);
+ trace_dn.push_front(ref_dn);
+
+ // dir
+ trace_dir.push_front(new DirStat(p));
+ }
+ }
+ }
+
+ const list<InodeStat*>& get_trace_in() {
+ if (trace_in.empty() && trace_bl.length()) _decode_trace();
+ return trace_in;
+ }
+ const list<DirStat*>& get_trace_dir() {
+ if (trace_in.empty() && trace_bl.length()) _decode_trace();
+ return trace_dir;
+ }
+ const list<string>& get_trace_dn() {
+ if (trace_in.empty() && trace_bl.length()) _decode_trace();
+ return trace_dn;
+ }
+
};
class MClientRequestForward : public Message {
tid_t tid;
- int dest_mds;
- int num_fwd;
+ int32_t dest_mds;
+ int32_t num_fwd;
public:
MClientRequestForward() : Message(MSG_CLIENT_REQUEST_FORWARD) {}
int get_asker() { return asker; }
inodeno_t get_base_ino() { return base_ino; }
frag_t get_base_dir_frag() { return base_dir_frag; }
+
filepath& get_want() { return want; }
inodeno_t get_want_ino() { return want_ino; }
- const string& get_dentry(int n) { return want[n]; }
+ const string& get_dentry(int n) { return want[n]; }
bool wants_base_dir() { return want_base_dir; }
bool wants_xlocked() { return want_xlocked; }
-
+
void set_base_dir_frag(frag_t f) { base_dir_frag = f; }
MDiscover() { }
*/
class MDiscoverReply : public Message {
- inodeno_t base_ino;
- bool no_base_dir; // no base dir (but IS dentry+inode)
- bool no_base_dentry; // no base dentry (but IS inode)
- bool flag_error_dn;
+ // info about original request
+ inodeno_t base_ino;
+ frag_t base_dir_frag;
+ bool wanted_base_dir;
+ bool wanted_xlocked;
+ inodeno_t wanted_ino;
+
+ // and the response
+ bool flag_error_dn;
bool flag_error_ino;
- bool flag_error_dir;
- string error_dentry; // dentry that was not found (to trigger waiters on asker)
+ bool flag_error_dir;
+ bool no_base_dir; // no base dir (but IS dentry+inode)
+ bool no_base_dentry; // no base dentry (but IS inode)
+ string error_dentry; // dentry that was not found (to trigger waiters on asker)
+
int dir_auth_hint;
- bool wanted_xlocks_hint;
-
+
vector<CDirDiscover*> dirs; // not inode-aligned if no_base_dir = true.
vector<CDentryDiscover*> dentries; // not inode-aligned if no_base_dentry = true
vector<CInodeDiscover*> inodes;
- string path;
public:
// accessors
inodeno_t get_base_ino() { return base_ino; }
+ frag_t get_base_dir_frag() { return base_dir_frag; }
+ bool get_wanted_base_dir() { return wanted_base_dir; }
+ bool get_wanted_xlocked() { return wanted_xlocked; }
+ inodeno_t get_wanted_ino() { return wanted_ino; }
+
int get_num_inodes() { return inodes.size(); }
int get_num_dentries() { return dentries.size(); }
int get_num_dirs() { return dirs.size(); }
bool has_base_dentry() { return !no_base_dentry && dentries.size(); }
bool has_base_inode() { return no_base_dir && no_base_dentry; }
- const string& get_path() { return path; }
-
- // bool is_flag_forward() { return flag_forward; }
bool is_flag_error_dn() { return flag_error_dn; }
bool is_flag_error_ino() { return flag_error_ino; }
bool is_flag_error_dir() { return flag_error_dir; }
string& get_error_dentry() { return error_dentry; }
+
int get_dir_auth_hint() { return dir_auth_hint; }
- bool get_wanted_xlocks_hint() { return wanted_xlocks_hint; }
- void set_wanted_xlocks_hint(bool w) { wanted_xlocks_hint = w; }
// these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set.
CInodeDiscover& get_inode(int n) { return *(inodes[n]); }
// cons
MDiscoverReply() {}
- MDiscoverReply(inodeno_t base_ino) :
- Message(MSG_MDS_DISCOVERREPLY) {
- this->base_ino = base_ino;
- flag_error_dn = false;
- flag_error_dir = false;
- no_base_dir = no_base_dentry = false;
- dir_auth_hint = CDIR_AUTH_UNKNOWN;
+ MDiscoverReply(MDiscover *dis) :
+ Message(MSG_MDS_DISCOVERREPLY),
+ base_ino(dis->get_base_ino()),
+ base_dir_frag(dis->get_base_dir_frag()),
+ wanted_base_dir(dis->wants_base_dir()),
+ wanted_xlocked(dis->wants_xlocked()),
+ wanted_ino(dis->get_want_ino()),
+ flag_error_dn(false),
+ flag_error_ino(false),
+ flag_error_dir(false),
+ no_base_dir(false), no_base_dentry(false),
+ dir_auth_hint(CDIR_AUTH_UNKNOWN) {
+ }
+ MDiscoverReply(dirfrag_t df) :
+ Message(MSG_MDS_DISCOVERREPLY),
+ base_ino(df.ino),
+ base_dir_frag(df.frag),
+ wanted_base_dir(false),
+ wanted_xlocked(false),
+ wanted_ino(inodeno_t()),
+ flag_error_dn(false),
+ flag_error_ino(false),
+ flag_error_dir(false),
+ no_base_dir(false), no_base_dentry(false),
+ dir_auth_hint(CDIR_AUTH_UNKNOWN) {
}
~MDiscoverReply() {
for (vector<CDirDiscover*>::iterator it = dirs.begin();
bool is_empty() {
return dirs.empty() && dentries.empty() && inodes.empty() &&
!flag_error_dn &&
+ !flag_error_ino &&
!flag_error_dir &&
dir_auth_hint == CDIR_AUTH_UNKNOWN;
}
void add_dentry(CDentryDiscover* ddis) {
if (dentries.empty() && dirs.empty()) no_base_dir = true;
dentries.push_back(ddis);
- if (path.length()) path += "/";
- path += ddis->get_dname();
}
void add_inode(CInodeDiscover* din) {
dirs.push_back( dir );
}
+
// void set_flag_forward() { flag_forward = true; }
void set_flag_error_dn(const string& dn) {
flag_error_dn = true;
virtual void decode_payload() {
int off = 0;
::_decode(base_ino, payload, off);
- ::_decode(no_base_dir, payload, off);
- ::_decode(no_base_dentry, payload, off);
+ ::_decode(base_dir_frag, payload, off);
+ ::_decode(wanted_base_dir, payload, off);
+ ::_decode(wanted_xlocked, payload, off);
::_decode(flag_error_dn, payload, off);
::_decode(flag_error_ino, payload, off);
::_decode(flag_error_dir, payload, off);
+ ::_decode(no_base_dir, payload, off);
+ ::_decode(no_base_dentry, payload, off);
::_decode(error_dentry, payload, off);
::_decode(dir_auth_hint, payload, off);
- ::_decode(wanted_xlocks_hint, payload, off);
// dirs
int n;
}
void encode_payload() {
::_encode(base_ino, payload);
- ::_encode(no_base_dir, payload);
- ::_encode(no_base_dentry, payload);
+ ::_encode(base_dir_frag, payload);
+ ::_encode(wanted_base_dir, payload);
+ ::_encode(wanted_xlocked, payload);
::_encode(flag_error_dn, payload);
::_encode(flag_error_ino, payload);
::_encode(flag_error_dir, payload);
+ ::_encode(no_base_dir, payload);
+ ::_encode(no_base_dentry, payload);
::_encode(error_dentry, payload);
::_encode(dir_auth_hint, payload);
- ::_encode(wanted_xlocks_hint, payload);
// dirs
int n = dirs.size();
class MExportDir : public Message {
dirfrag_t dirfrag;
- list<bufferlist> dirstate; // a bl for reach dir
- list<dirfrag_t> bounds;
+ bufferlist dirstate;
+ list<dirfrag_t> bounds;
public:
MExportDir() {}
}
dirfrag_t get_dirfrag() { return dirfrag; }
- list<bufferlist>& get_dirstate() { return dirstate; }
+ bufferlist& get_dirstate() { return dirstate; }
list<dirfrag_t>& get_bounds() { return bounds; }
- void add_dir(bufferlist& dir) {
- dirstate.push_back(dir);
- }
- void set_dirstate(const list<bufferlist>& ls) {
- dirstate = ls;
+ void take_dirstate(bufferlist& bl) {
+ dirstate.claim(bl);
}
void add_export(dirfrag_t df) {
bounds.push_back(df);
}
virtual void decode_payload() {
- int off = 0;
- payload.copy(off, sizeof(dirfrag), (char*)&dirfrag);
- off += sizeof(dirfrag);
- ::_decode(path, payload, off);
+ bufferlist::iterator p = payload.begin();
+ ::_decode_simple(dirfrag, p);
+ ::_decode_simple(path, p);
}
virtual void encode_payload() {
- payload.append((char*)&dirfrag, sizeof(dirfrag));
- ::_encode(path, payload);
+ ::_encode_simple(dirfrag, payload);
+ ::_encode_simple(path, payload);
}
};
#define LOCK_AC_LOCKACK 3
#define LOCK_AC_REQSCATTER 7
+#define LOCK_AC_REQUNSCATTER 8
#define LOCK_AC_FOR_REPLICA(a) ((a) < 0)
#define LOCK_AC_FOR_AUTH(a) ((a) > 0)
case LOCK_AC_MIXEDACK: return "mixedack";
case LOCK_AC_LOCKACK: return "lockack";
case LOCK_AC_REQSCATTER: return "reqscatter";
+ case LOCK_AC_REQUNSCATTER: return "requnscatter";
default: assert(0); return 0;
}
}
class MLock : public Message {
- int asker; // who is initiating this request
- int action; // action type
+ int32_t action; // action type
+ int32_t asker; // who is initiating this request
metareqid_t reqid; // for remote lock requests
char lock_type; // lock object type
MDSCacheObjectInfo &get_object_info() { return object_info; }
MLock() {}
- MLock(int action, int asker) :
- Message(MSG_MDS_LOCK) {
- this->action = action;
- this->asker = asker;
- }
- MLock(SimpleLock *lock, int action, int asker) :
- Message(MSG_MDS_LOCK) {
- this->lock_type = lock->get_type();
+ MLock(int ac, int as) :
+ Message(MSG_MDS_LOCK),
+ action(ac), asker(as),
+ lock_type(0) { }
+ MLock(SimpleLock *lock, int ac, int as) :
+ Message(MSG_MDS_LOCK),
+ action(ac), asker(as),
+ lock_type(lock->get_type()) {
lock->get_parent()->set_object_info(object_info);
- this->action = action;
- this->asker = asker;
}
- MLock(SimpleLock *lock, int action, int asker, bufferlist& bl) :
- Message(MSG_MDS_LOCK) {
- this->lock_type = lock->get_type();
+ MLock(SimpleLock *lock, int ac, int as, bufferlist& bl) :
+ Message(MSG_MDS_LOCK),
+ action(ac), asker(as), lock_type(lock->get_type()) {
lock->get_parent()->set_object_info(object_info);
- this->action = action;
- this->asker = asker;
data.claim(bl);
}
virtual char *get_type_name() { return "ILock"; }
inode_full(const inode_t& i, const string& s, const fragtree_t& f) :
inode(i), symlink(s), dirfragtree(f) {}
- void _decode(bufferlist& bl, int& off) {
- ::_decode(inode, bl, off);
- ::_decode(symlink, bl, off);
- ::_decode(dirfragtree, bl, off);
+ void _decode(bufferlist::iterator& p) {
+ ::_decode_simple(inode, p);
+ ::_decode_simple(symlink, p);
+ dirfragtree._decode(p);
}
void _encode(bufferlist& bl) const {
::_encode(inode, bl);
::_encode(symlink, bl);
- ::_encode(dirfragtree, bl);
+ dirfragtree._encode(bl);
}
};
::_encode(xlocked_dentries, payload);
}
void decode_payload() {
- int off = 0;
- ::_decode(op, payload, off);
- ::_decode(strong_inodes, payload, off);
- ::_decode_complex(full_inodes, payload, off);
- ::_decode(authpinned_inodes, payload, off);
- ::_decode(xlocked_inodes, payload, off);
- ::_decode(cap_export_bl, payload, off);
+ bufferlist::iterator p = payload.begin();
+ ::_decode_simple(op, p);
+ ::_decode_simple(strong_inodes, p);
+ ::_decode_complex(full_inodes, p);
+ ::_decode_simple(authpinned_inodes, p);
+ ::_decode_simple(xlocked_inodes, p);
+ ::_decode_simple(cap_export_bl, p);
if (cap_export_bl.length()) {
- int off = 0;
- ::_decode(cap_exports, cap_export_bl, off);
- ::_decode(cap_export_paths, cap_export_bl, off);
+ bufferlist::iterator q = cap_export_bl.begin();
+ ::_decode_simple(cap_exports, q);
+ ::_decode_simple(cap_export_paths, q);
}
- ::_decode(strong_dirfrags, payload, off);
- ::_decode(weak, payload, off);
- ::_decode(weak_inodes, payload, off);
- ::_decode(strong_dentries, payload, off);
- ::_decode(authpinned_dentries, payload, off);
- ::_decode(xlocked_dentries, payload, off);
+ ::_decode_simple(strong_dirfrags, p);
+ ::_decode_simple(weak, p);
+ ::_decode_simple(weak_inodes, p);
+ ::_decode_simple(strong_dentries, p);
+ ::_decode_simple(authpinned_dentries, p);
+ ::_decode_simple(xlocked_dentries, p);
}
};
static const int OP_RENAMEPREP = 7;
static const int OP_RENAMEPREPACK = -7;
- static const int OP_RENAMEGETINODE = 8;
- static const int OP_RENAMEGETINODEACK = -8;
-
static const int OP_FINISH = 17;
static const int OP_ABORT = 20; // used for recovery only
case OP_RENAMEPREP: return "rename_prep";
case OP_RENAMEPREPACK: return "rename_prep_ack";
- case OP_RENAMEGETINODE: return "rename_get_inode";
- case OP_RENAMEGETINODEACK: return "rename_get_inode_ack";
case OP_FINISH: return "finish"; // commit
case OP_ABORT: return "abort";
// for rename prep
string srcdnpath;
string destdnpath;
- set<int> srcdn_replicas;
+ set<int> witnesses;
bufferlist inode_export;
version_t inode_export_v;
+ bufferlist srci_replica;
utime_t now;
bufferlist stray; // stray dir + dentry
::_encode_complex(authpins, payload);
::_encode(srcdnpath, payload);
::_encode(destdnpath, payload);
- ::_encode(srcdn_replicas, payload);
+ ::_encode(witnesses, payload);
::_encode(now, payload);
::_encode(inode_export, payload);
::_encode(inode_export_v, payload);
+ ::_encode(srci_replica, payload);
::_encode(stray, payload);
}
void decode_payload() {
- int off = 0;
- ::_decode(reqid, payload, off);
- ::_decode(op, payload, off);
- ::_decode(lock_type, payload, off);
- object_info._decode(payload, off);
- ::_decode_complex(authpins, payload, off);
- ::_decode(srcdnpath, payload, off);
- ::_decode(destdnpath, payload, off);
- ::_decode(srcdn_replicas, payload, off);
- ::_decode(now, payload, off);
- ::_decode(inode_export, payload, off);
- ::_decode(inode_export_v, payload, off);
- ::_decode(stray, payload, off);
+ bufferlist::iterator p = payload.begin();
+ ::_decode_simple(reqid, p);
+ ::_decode_simple(op, p);
+ ::_decode_simple(lock_type, p);
+ object_info._decode(p);
+ ::_decode_complex(authpins, p);
+ ::_decode_simple(srcdnpath, p);
+ ::_decode_simple(destdnpath, p);
+ ::_decode_simple(witnesses, p);
+ ::_decode_simple(now, p);
+ ::_decode_simple(inode_export, p);
+ ::_decode_simple(inode_export_v, p);
+ ::_decode_simple(srci_replica, p);
+ ::_decode_simple(stray, p);
}
char *get_type_name() { return "slave_request"; }
#include "config.h"
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client "
-#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client "
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client v" << client_map.version << " "
+#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client v" << client_map.version << " "
inc._decode(bl, off);
client_map.apply_incremental(inc);
- dout(1) << "v" << client_map.version << ": "
- << client_map.client_addr.size() << " clients (+"
+ dout(1) << client_map.client_addr.size() << " clients (+"
<< inc.mount.size() << " -" << inc.unmount.size() << ")"
<< dendl;
class MonMap {
public:
- epoch_t epoch; // what epoch/version of the monmap
- int num_mon;
+ epoch_t epoch; // what epoch/version of the monmap
+ int32_t num_mon;
vector<entity_inst_t> mon_inst;
int last_mon; // last mon i talked to
}
void encode(bufferlist& blist) {
- blist.append((char*)&epoch, sizeof(epoch));
- blist.append((char*)&num_mon, sizeof(num_mon));
-
- _encode(mon_inst, blist);
+ ::_encode(epoch, blist);
+ ::_encode(num_mon, blist);
+ ::_encode(mon_inst, blist);
}
void decode(bufferlist& blist) {
int off = 0;
- blist.copy(off, sizeof(epoch), (char*)&epoch);
- off += sizeof(epoch);
- blist.copy(off, sizeof(num_mon), (char*)&num_mon);
- off += sizeof(num_mon);
-
- _decode(mon_inst, blist, off);
+ ::_decode(epoch, blist, off);
+ ::_decode(num_mon, blist, off);
+ ::_decode(mon_inst, blist, off);
}
// read from/write to a file
char tfn[200];
sprintf(tfn, "%s.new", fn);
- int fd = ::open(tfn, O_WRONLY|O_CREAT);
+ int fd = ::open(tfn, O_WRONLY|O_CREAT, 0644);
assert(fd > 0);
- ::fchmod(fd, 0644);
::write(fd, vs, strlen(vs));
::close(fd);
::rename(tfn, fn);
char tfn[200];
sprintf(tfn, "%s.new", fn);
- int fd = ::open(tfn, O_WRONLY|O_CREAT);
+ int fd = ::open(tfn, O_WRONLY|O_CREAT, 0644);
assert(fd);
- // chmod
- ::fchmod(fd, 0644);
-
// write data
for (list<bufferptr>::const_iterator it = bl.buffers().begin();
it != bl.buffers().end();
#include "config.h"
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(e" << osdmap.get_epoch() << ") "
-#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(e" << osdmap.get_epoch() << ") "
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd e" << osdmap.get_epoch() << " "
+#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd e" << osdmap.get_epoch() << " "
// FAKING
::_decode(pg_stat_updates, bl, off);
::_decode(osd_stat_updates, bl, off);
}
+
+ Incremental() : version(0) {}
};
void apply_incremental(Incremental& inc) {
dout(10) << "accepter.start" << dendl;
char hostname[100];
+ memset(hostname, 0, 100);
gethostname(hostname, 100);
dout(2) << "accepter.start my hostname is " << hostname << dendl;
// set a harmless handle for SIGUSR1 (we'll use it to stop the accepter)
struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
sa.sa_handler = noop_signal_handler;
sa.sa_flags = 0;
+ sigemptyset(&sa.sa_mask);
sigaction(SIGUSR1, &sa, NULL);
// start thread
* EntityMessenger
*/
-Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) :
- Messenger(myaddr),
- stop(false),
- dispatch_thread(this)
-{
-}
-Rank::EntityMessenger::~EntityMessenger()
-{
- // join dispatch thread
- if (dispatch_thread.is_started())
- dispatch_thread.join();
-}
-
void Rank::EntityMessenger::dispatch_entry()
{
lock.Lock();
while (!stop) {
- if (!dispatch_queue.empty()) {
+ if (!dispatch_queue.empty() || !prio_dispatch_queue.empty()) {
list<Message*> ls;
- ls.swap(dispatch_queue);
- qlen = 0;
+ if (!prio_dispatch_queue.empty()) {
+ ls.swap(prio_dispatch_queue);
+ pqlen = 0;
+ } else {
+ if (0) {
+ ls.swap(dispatch_queue);
+ qlen = 0;
+ } else {
+ // limit how much low-prio stuff we grab, to avoid starving high-prio messages!
+ ls.push_back(dispatch_queue.front());
+ dispatch_queue.pop_front();
+ qlen--;
+ }
+ }
lock.Unlock();
{
Mutex lock;
Cond cond;
list<Message*> dispatch_queue;
+ list<Message*> prio_dispatch_queue;
bool stop;
- int qlen;
+ int qlen, pqlen;
class DispatchThread : public Thread {
EntityMessenger *m;
m->set_recv_stamp(g_clock.now());
lock.Lock();
- dispatch_queue.push_back(m);
- qlen++;
- cond.Signal();
- lock.Unlock();
- }
- void queue_messages(list<Message*> ls) {
- lock.Lock();
- qlen += ls.size();
- dispatch_queue.splice(dispatch_queue.end(), ls);
+ if (m->get_source().is_mon()) {
+ prio_dispatch_queue.push_back(m);
+ pqlen++;
+ } else {
+ qlen++;
+ dispatch_queue.push_back(m);
+ }
cond.Signal();
lock.Unlock();
}
public:
- EntityMessenger(entity_name_t myaddr);
- ~EntityMessenger();
+ EntityMessenger(entity_name_t myaddr) :
+ Messenger(myaddr),
+ stop(false),
+ qlen(0), pqlen(0),
+ dispatch_thread(this) { }
+ ~EntityMessenger() {
+ // join dispatch thread
+ if (dispatch_thread.is_started())
+ dispatch_thread.join();
+ }
void ready();
bool is_stopped() { return stop; }
const entity_addr_t &get_myaddr();
- int get_dispatch_queue_len() { return qlen; }
+ int get_dispatch_queue_len() { return qlen + pqlen; }
void reset_myname(entity_name_t m);
*/
struct entity_addr_t {
struct ceph_entity_addr v;
+ uint32_t _pad;
- entity_addr_t() {
+ entity_addr_t() : _pad(0) {
v.port = v.nonce = 0;
v.ipq[0] = v.ipq[1] = v.ipq[2] = v.ipq[3] = 0;
}
#define intabs(x) ((x) >= 0 ? (x):(-(x)))
+#include <mpi.h>
+
#include <sys/stat.h>
#include <iostream>
#include <string>
/*
* start up NewMessenger via MPI.
*/
-#include <mpi.h>
pair<int,int> mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap)
{
bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch)
{
bool shared = false;
+ dout(20) << "_share_map_incoming " << inst << " " << epoch << dendl;
+ assert(osd_lock.is_locked());
// does client have old map?
if (inst.name.is_client()) {
// does peer have old map?
if (inst.name.is_osd()) {
// remember
- if (peer_map_epoch[inst.name] < epoch)
+ if (peer_map_epoch[inst.name] < epoch) {
+ dout(20) << "peer " << inst.name << " has " << epoch << dendl;
peer_map_epoch[inst.name] = epoch;
+ }
// older?
if (peer_map_epoch[inst.name] < osdmap->get_epoch()) {
int64_t num_blocks; // in 4k blocks
int64_t num_objects;
- pg_stat_t() : state(0), size(0), num_blocks(0), num_objects(0) {}
+ pg_stat_t() : reported(0), state(0), size(0), num_blocks(0), num_objects(0) {}
};
void Objecter::maybe_request_map()
{
utime_t now;
-
- if (last_epoch_requested <= osdmap->get_epoch() ||
- (now = g_clock.now()) - last_epoch_requested_stamp > g_conf.objecter_map_request_interval) {
- dout(10) << "maybe_request_map requesting next osd map" << dendl;
- last_epoch_requested_stamp = now;
- last_epoch_requested = osdmap->get_epoch()+1;
- messenger->send_message(new MOSDGetMap(osdmap->get_epoch(), last_epoch_requested),
- monmap->get_inst(monmap->pick_mon()));
- }
+ if (!osdmap) goto yes;
+ if (last_epoch_requested <= osdmap->get_epoch()) goto yes;
+ now = g_clock.now();
+ if (now - last_epoch_requested_stamp > g_conf.objecter_map_request_interval) goto yes;
+ return;
+
+ yes:
+ dout(10) << "maybe_request_map requesting next osd map" << dendl;
+ last_epoch_requested_stamp = now;
+ last_epoch_requested = osdmap->get_epoch()+1;
+ messenger->send_message(new MOSDGetMap(osdmap->get_epoch(), last_epoch_requested),
+ monmap->get_inst(monmap->pick_mon()));
}
messenger(m), monmap(mm), osdmap(om),
last_tid(0), client_inc(-1),
num_unacked(0), num_uncommitted(0),
+ last_epoch_requested(0),
client_lock(l), timer(l)
{ }
~Objecter() { }
obj:*
obj:*
}
+
+# gethostbyname
+{
+ gethostbyname on issdm
+ Memcheck:Param
+ socketcall.sendto(msg)
+ fun:send
+ fun:get_mapping
+ fun:__nscd_get_map_ref
+ fun:nscd_gethst_r
+ fun:__nscd_gethostbyname_r
+ fun:gethostbyname_r@@GLIBC_2.2.5
+ fun:gethostbyname
+ fun:_ZN4Rank8Accepter5startEv
+ fun:_ZN4Rank10start_rankEv
+ fun:main
+}
+
+# gethostbyname (second variant: NSS lookup via dlopen, observed on foil)
+
+{
+ gethostbyname on foil
+ Memcheck:Addr8
+ obj:/lib/ld-2.6.1.so
+ obj:/lib/ld-2.6.1.so
+ obj:/lib/ld-2.6.1.so
+ obj:/lib/ld-2.6.1.so
+ obj:/lib/ld-2.6.1.so
+ obj:/lib/ld-2.6.1.so
+ obj:/lib/ld-2.6.1.so
+ obj:/lib/libc-2.6.1.so
+ obj:/lib/ld-2.6.1.so
+ fun:__libc_dlopen_mode
+ fun:__nss_lookup_function
+ obj:/lib/libc-2.6.1.so
+}
+