// uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 25) - 1;
//#endif
free.insert(start, end);
+
+ projected_free.m = free.m;
}
// Reserve one inode number against the projected free set.
// If id is zero, the first id in projected_free is picked; otherwise the
// caller-supplied id is used. The id is erased from projected_free and
// projected_version is bumped. `free` and `version` are not touched here;
// apply_alloc_id() updates those. Returns the reserved id.
-inodeno_t InoTable::alloc_id(inodeno_t id)
+inodeno_t InoTable::project_alloc_id(inodeno_t id)
{
assert(is_active());
-
- // pick one
if (!id)
- id = free.start();
- free.erase(id);
- dout(10) << "alloc id " << id << dendl;
-
- version++;
-
+ id = projected_free.start();
+ projected_free.erase(id);
+ dout(10) << "project_alloc_id " << id << dendl;
+ ++projected_version;
return id;
}
-
// Apply a single allocation to the real table: erase id from `free` and
// bump `version`. projected_free / projected_version are not modified here.
-void InoTable::alloc_ids(vector<inodeno_t>& ids)
+void InoTable::apply_alloc_id(inodeno_t id)
{
- assert(is_active());
- dout(10) << "alloc_ids " << ids << dendl;
- for (vector<inodeno_t>::iterator p = ids.begin(); p != ids.end(); p++)
- free.erase(*p);
- version++;
+ dout(10) << "apply_alloc_id " << id << dendl;
+ free.erase(id);
+ ++version;
}
// Reserve `want` inode numbers against the projected free set.
// Each iteration takes the first id in projected_free, erases it, and appends
// it to `ids` (existing contents of `ids` are kept). projected_version is
// bumped once for the whole batch, regardless of `want`.
-void InoTable::alloc_ids(deque<inodeno_t>& ids, int want)
+void InoTable::project_alloc_ids(deque<inodeno_t>& ids, int want)
{
assert(is_active());
for (int i=0; i<want; i++) {
- inodeno_t id = free.start();
- free.erase(id);
- dout(10) << "alloc_ids " << id << dendl;
+ inodeno_t id = projected_free.start();
+ projected_free.erase(id);
ids.push_back(id);
}
- version++;
+ dout(10) << "project_alloc_ids " << ids << dendl;
+ ++projected_version;
}
// Apply a batch allocation to the real table: erase every id in `ids` from
// `free`, then bump `version` once for the whole batch.
+void InoTable::apply_alloc_ids(deque<inodeno_t>& ids)
+{
+ dout(10) << "apply_alloc_ids " << ids << dendl;
+ for (deque<inodeno_t>::iterator p = ids.begin();
+ p != ids.end();
+ p++)
+ free.erase(*p);
+ ++version;
+}
+
// Project the release of a batch of inode numbers: insert every id in `ids`
// into projected_free and bump projected_version once. `free` and `version`
// are left alone; apply_release_ids() updates those.
-void InoTable::release_ids(deque<inodeno_t>& ids)
+void InoTable::project_release_ids(deque<inodeno_t>& ids)
{
- assert(is_active());
- dout(10) << "release_ids " << ids << dendl;
+ dout(10) << "project_release_ids " << ids << dendl;
+ for (deque<inodeno_t>::iterator p = ids.begin(); p != ids.end(); p++)
+ projected_free.insert(*p);
+ ++projected_version;
+}
// Apply a batch release to the real table: insert every id in `ids` back into
// `free`, then bump `version` once for the whole batch.
+void InoTable::apply_release_ids(deque<inodeno_t>& ids)
+{
+ dout(10) << "apply_release_ids " << ids << dendl;
for (deque<inodeno_t>::iterator p = ids.begin(); p != ids.end(); p++)
free.insert(*p);
- version++;
+ ++version;
}
+//
+
// Replay a single allocation: erase id from both `free` and projected_free,
// bump `version`, and set projected_version to the new `version` so the two
// counters end up equal.
+void InoTable::replay_alloc_id(inodeno_t id)
+{
+ dout(10) << "replay_alloc_id " << id << dendl;
+ free.erase(id);
+ projected_free.erase(id);
+ projected_version = ++version;
+}
// Replay a batch allocation: erase every id in `ids` from both `free` and
// projected_free, then bump `version` once and set projected_version to the
// new `version`.
+void InoTable::replay_alloc_ids(deque<inodeno_t>& ids)
+{
+ dout(10) << "replay_alloc_ids " << ids << dendl;
+ for (deque<inodeno_t>::iterator p = ids.begin(); p != ids.end(); p++) {
+ free.erase(*p);
+ projected_free.erase(*p);
+ }
+ projected_version = ++version;
+}
// Replay a batch release: insert every id in `ids` into both `free` and
// projected_free, then bump `version` once and set projected_version to the
// new `version`.
+void InoTable::replay_release_ids(deque<inodeno_t>& ids)
+{
+ dout(10) << "replay_release_ids " << ids << dendl;
+ for (deque<inodeno_t>::iterator p = ids.begin(); p != ids.end(); p++) {
+ free.insert(*p);
+ projected_free.insert(*p);
+ }
+ projected_version = ++version;
+}
+
class InoTable : public MDSTable {
interval_set<inodeno_t> free; // unused ids
+ interval_set<inodeno_t> projected_free;
public:
InoTable(MDS *m) : MDSTable(m, "inotable") { }
- // alloc or reclaim ids
- inodeno_t alloc_id(inodeno_t id=0);
- void alloc_ids(vector<inodeno_t>& inos);
- void alloc_ids(deque<inodeno_t>& inos, int want);
- void release_ids(vector<inodeno_t>& inos);
- void release_ids(deque<inodeno_t>& inos);
+ inodeno_t project_alloc_id(inodeno_t id=0);
+ void apply_alloc_id(inodeno_t id);
+
+ void project_alloc_ids(deque<inodeno_t>& inos, int want);
+ void apply_alloc_ids(deque<inodeno_t>& inos);
+
+ void project_release_ids(deque<inodeno_t>& inos);
+ void apply_release_ids(deque<inodeno_t>& inos);
+
+ void replay_alloc_id(inodeno_t ino);
+ void replay_alloc_ids(deque<inodeno_t>& inos);
+ void replay_release_ids(deque<inodeno_t>& inos);
void init_inode();
void reset_state();
}
void decode_state(bufferlist::iterator& bl) {
::decode(free.m, bl);
+ projected_free.m = free.m;
}
};
CInode *ref_snapdiri;
snapid_t ref_snapid;
- inodeno_t alloc_ino;
+ inodeno_t alloc_ino, used_prealloc_ino;
deque<inodeno_t> prealloc_inos;
// -- i am a slave request
dout(10) << "load_2 got " << bl.length() << " bytes" << dendl;
bufferlist::iterator p = bl.begin();
::decode(version, p);
- committed_version = version;
+ projected_version = committed_version = version;
dout(10) << "load_2 loaded v" << version << dendl;
decode_state(p);
}
//static const int STATE_COMMITTING = 3;
int state;
- version_t version, committing_version, committed_version;
+ version_t version, committing_version, committed_version, projected_version;
map<version_t, list<Context*> > waitfor_save;
mds(m), table_name(n),
ino(0),
state(STATE_UNDEF),
- version(0), committing_version(0), committed_version(0) {}
+ version(0), committing_version(0), committed_version(0), projected_version(0) {}
virtual ~MDSTable() {}
version_t get_version() { return version; }
version_t get_committed_version() { return committed_version; }
version_t get_committing_version() { return committing_version; }
+ version_t get_projected_version() { return projected_version; }
+
+ //version_t project_version() { return ++projected_version; }
+ //version_t inc_version() { return ++version; }
// load/save from disk (hack)
bool is_undef() { return state == STATE_UNDEF; }
Session *session;
bool open;
version_t cmapv;
+ deque<inodeno_t> inos;
+ version_t inotablev;
public:
C_MDS_session_finish(MDS *m, Session *se, bool s, version_t mv) :
- mds(m), session(se), open(s), cmapv(mv) { }
+ mds(m), session(se), open(s), cmapv(mv), inotablev(0) { }
+ C_MDS_session_finish(MDS *m, Session *se, bool s, version_t mv, deque<inodeno_t>& i, version_t iv) :
+ mds(m), session(se), open(s), cmapv(mv), inos(i), inotablev(iv) { }
void finish(int r) {
assert(r == 0);
- mds->server->_session_logged(session, open, cmapv);
+ mds->server->_session_logged(session, open, cmapv, inos, inotablev);
}
};
void Server::handle_client_session(MClientSession *m)
{
- version_t pv, piv;
+ version_t pv, piv = 0;
Session *session = mds->sessionmap.get_session(m->get_source());
dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
}
mds->sessionmap.set_state(session, Session::STATE_CLOSING);
pv = ++mds->sessionmap.projected;
- if (session->inos.size()) {
- mds->inotable->release_ids(session->inos);
- piv = mds->inotable->get_version();
+ if (session->prealloc_inos.size()) {
+ assert(session->projected_inos == 0);
+ mds->inotable->project_release_ids(session->prealloc_inos);
+ piv = mds->inotable->get_projected_version();
} else
piv = 0;
- mdlog->submit_entry(new ESession(m->get_source_inst(), false, pv, session->inos, piv),
- new C_MDS_session_finish(mds, session, false, pv));
+ mdlog->submit_entry(new ESession(m->get_source_inst(), false, pv, session->prealloc_inos, piv),
+ new C_MDS_session_finish(mds, session, false, pv, session->prealloc_inos, piv));
break;
default:
}
}
-void Server::_session_logged(Session *session, bool open, version_t pv)
+void Server::_session_logged(Session *session, bool open, version_t pv, deque<inodeno_t>& inos, version_t piv)
{
dout(10) << "_session_logged " << session->inst << " " << (open ? "open":"close")
<< " " << pv << dendl;
dout(20) << " killing client lease of " << *p << dendl;
p->remove_client_lease(r, r->mask, mds->locker);
}
+
+ if (piv) {
+ mds->inotable->apply_release_ids(inos);
+ assert(mds->inotable->get_version() == piv);
+ }
+
if (session->is_closing())
mds->messenger->send_message(new MClientSession(CEPH_SESSION_CLOSE), session->inst);
else if (session->is_stale_closing())
}
// give any preallocated inos to the session
- Session *session = mdr->session;
- for (deque<inodeno_t>::iterator p = mdr->prealloc_inos.begin();
- p != mdr->prealloc_inos.end();
- p++)
- session->inos.push_back(*p);
+ apply_allocated_inos(mdr);
// clean up request, drop locks, etc.
// do this before replying, so that we can issue leases
+ Session *session = mdr->session;
entity_inst_t client_inst = req->get_orig_source_inst();
mdcache->request_finish(mdr);
mdr = 0;
CInode *in = new CInode(mdcache);
// assign ino
- int want = 1 + g_conf.mds_client_prealloc_inos - mdr->session->projected_inos;
- mds->inotable->alloc_ids(mdr->prealloc_inos, want);
- if (mdr->session->inos.size()) {
- in->inode.ino = mdr->session->take_ino(); // previously preallocated
+ if (mdr->session->prealloc_inos.size()) {
+ mdr->used_prealloc_ino =
+ in->inode.ino = mdr->session->take_ino(); // prealloc -> used
+ mds->sessionmap.projected++;
+ dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino << dendl;
} else {
- in->inode.ino =
- mdr->alloc_ino = mdr->prealloc_inos.front(); // ok, take one we just allocated.
- mdr->prealloc_inos.pop_front();
+ mdr->alloc_ino =
+ in->inode.ino = mds->inotable->project_alloc_id();
+ dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
+ }
+
+ int want = g_conf.mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
+ if (want > 0) {
+ mds->inotable->project_alloc_ids(mdr->prealloc_inos, want);
+ assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
+ mdr->session->projected_inos += mdr->prealloc_inos.size();
+ mds->sessionmap.projected++;
+ dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
}
- mdr->session->projected_inos += mdr->prealloc_inos.size();
in->inode.version = 1;
in->inode.nlink = 1; // FIXME
return in;
}
// Copy the request's ino-allocation state into the journal blob via
// EMetaBlob::set_ino_alloc(): the allocated ino, the consumed preallocated ino,
// the newly preallocated inos, the originating client's name, and the current
// *projected* sessionmap and inotable versions.
// NOTE(review): both versions are passed unconditionally, even when the request
// changed only one of the two tables — confirm replay tolerates that.
+void Server::journal_allocated_inos(MDRequest *mdr, EMetaBlob *blob)
+{
+ blob->set_ino_alloc(mdr->alloc_ino,
+ mdr->used_prealloc_ino,
+ mdr->prealloc_inos,
+ mdr->client_request->get_orig_source(),
+ mds->sessionmap.projected,
+ mds->inotable->get_projected_version());
+}
+
// Apply the ino allocations recorded on `mdr` to the inotable and to the
// request's session. Three independent cases, each bumping the relevant
// version counter(s):
//  - alloc_ino set: apply it to the inotable.
//  - prealloc_inos non-empty: append them to the session's prealloc_inos,
//    subtract their count from session->projected_inos, bump the sessionmap
//    version, and apply them to the inotable.
//  - used_prealloc_ino set: it must be at the front of session->used_inos;
//    pop it and bump the sessionmap version.
+void Server::apply_allocated_inos(MDRequest *mdr)
+{
+ Session *session = mdr->session;
+ dout(10) << "apply_allocated_inos " << mdr->alloc_ino
+ << " / " << mdr->prealloc_inos
+ << " / " << mdr->used_prealloc_ino << dendl;
+
+ if (mdr->alloc_ino) {
+ mds->inotable->apply_alloc_id(mdr->alloc_ino);
+ }
+ if (mdr->prealloc_inos.size()) {
+ for (deque<inodeno_t>::iterator p = mdr->prealloc_inos.begin();
+ p != mdr->prealloc_inos.end();
+ p++)
+ session->prealloc_inos.push_back(*p);
+ session->projected_inos -= mdr->prealloc_inos.size();
+ mds->sessionmap.version++;
+ mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
+ }
+ if (mdr->used_prealloc_ino) {
+ assert(session->used_inos.front() == mdr->used_prealloc_ino);
+ session->used_inos.pop_front();
+ mds->sessionmap.version++;
+ }
+}
+
CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector<CDentry*> &trace, filepath refpath)
};
-void Server::note_allocated_inos(MDRequest *mdr, EMetaBlob *blob)
-{
- if (mdr->alloc_ino)
- blob->add_allocated_ino(mdr->alloc_ino, mds->inotable->get_version());
- if (mdr->prealloc_inos.size())
- for (deque<inodeno_t>::iterator p = mdr->prealloc_inos.begin();
- p != mdr->prealloc_inos.end();
- p++)
- blob->add_allocated_ino(*p, mds->inotable->get_version());
-}
-
void Server::handle_client_mknod(MDRequest *mdr)
{
MClientRequest *req = mdr->client_request;
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "mknod");
le->metablob.add_client_req(req->get_reqid());
- note_allocated_inos(mdr, &le->metablob);
+ journal_allocated_inos(mdr, &le->metablob);
mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
le->metablob.add_primary_dentry(dn, true, newi);
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "mkdir");
le->metablob.add_client_req(req->get_reqid());
- note_allocated_inos(mdr, &le->metablob);
+ journal_allocated_inos(mdr, &le->metablob);
mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
le->metablob.add_primary_dentry(dn, true, newi, &newi->inode);
le->metablob.add_dir(newdir, true, true, true); // dirty AND complete AND new
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "symlink");
le->metablob.add_client_req(req->get_reqid());
- note_allocated_inos(mdr, &le->metablob);
+ journal_allocated_inos(mdr, &le->metablob);
mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
le->metablob.add_primary_dentry(dn, true, newi);
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "openc");
le->metablob.add_client_req(req->get_reqid());
- note_allocated_inos(mdr, &le->metablob);
+ journal_allocated_inos(mdr, &le->metablob);
mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
le->metablob.add_primary_dentry(dn, true, in);
set<int> client_reconnect_gather; // clients i need a reconnect msg from.
void handle_client_session(class MClientSession *m);
- void _session_logged(Session *session, bool open, version_t pv);
+ void _session_logged(Session *session, bool open, version_t pv, deque<inodeno_t>& inos,version_t piv);
version_t prepare_force_open_sessions(map<__u32,entity_inst_t> &cm);
void finish_force_open_sessions(map<__u32,entity_inst_t> &cm);
void terminate_sessions();
CDir *traverse_to_auth_dir(MDRequest *mdr, vector<CDentry*> &trace, filepath refpath);
CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false);
CInode* prepare_new_inode(MDRequest *mdr, CDir *dir);
- void note_allocated_inos(MDRequest *mdr, EMetaBlob *blob);
+ void journal_allocated_inos(MDRequest *mdr, EMetaBlob *blob);
+ void apply_allocated_inos(MDRequest *mdr);
CInode* rdlock_path_pin_ref(MDRequest *mdr, bool want_auth, bool rdlock_dft=false);
CDentry* rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist);
entity_inst_t inst;
xlist<Session*>::item session_list_item;
- deque<inodeno_t> inos;
- int projected_inos;
+ int projected_inos; // journaling prealloc, will be added to prealloc_inos
+ deque<inodeno_t> prealloc_inos; // preallocated, ready to use.
+ deque<inodeno_t> used_inos; // journaling use
// Take the oldest preallocated ino for use: pop the front of prealloc_inos,
// append it to used_inos, and return it. Asserts that prealloc_inos is
// non-empty. projected_inos is not adjusted here.
inodeno_t take_ino() {
- assert(!inos.empty());
- inodeno_t i = inos.front();
- inos.pop_front();
- projected_inos--;
+ assert(!prealloc_inos.empty());
+ inodeno_t i = prealloc_inos.front();
+ prealloc_inos.pop_front();
+ used_inos.push_back(i);
return i;
}
// Number of preallocated inos the session has or will have: those already in
// prealloc_inos plus the projected_inos count still being journaled.
+ int get_num_projected_prealloc_inos() {
+ return prealloc_inos.size() + projected_inos;
+ }
int get_client() { return inst.name.num(); }
::encode(inst, bl);
::encode(cap_push_seq, bl);
::encode(completed_requests, bl);
+ ::encode(prealloc_inos, bl); // hacky, see below.
+ ::encode(used_inos, bl);
}
// Decode session state in the same field order encode() writes it.
// After decoding, any used_inos are put back at the front of prealloc_inos and
// used_inos is cleared, so a decoded session never carries in-flight "used" inos.
// NOTE(review): presumably because a used ino whose journal entry never
// completed should become available again — confirm.
void decode(bufferlist::iterator& p) {
::decode(inst, p);
::decode(cap_push_seq, p);
::decode(completed_requests, p);
+ ::decode(prealloc_inos, p);
+ ::decode(used_inos, p);
+ prealloc_inos.insert(prealloc_inos.begin(), used_inos.begin(), used_inos.end()); // HACK
+ used_inos.clear();
}
};
WRITE_CLASS_ENCODER(Session)
list<pair<__u8,version_t> > table_tids; // tableclient transactions
- // ino's i've allocated
- vector<inodeno_t> allocated_inos;
- version_t inotablev;
+ // ino (pre)allocation. may involve both inotable AND session state.
+ version_t inotablev, sessionmapv;
+ inodeno_t allocated_ino; // inotable
+ deque<inodeno_t> preallocated_inos; // inotable + session
+ inodeno_t used_preallocated_ino; // session
+ entity_name_t client_name; // session
// inodes i've truncated
list< triple<inodeno_t,uint64_t,uint64_t> > truncated_inodes;
::encode(lump_order, bl);
::encode(lump_map, bl);
::encode(table_tids, bl);
- ::encode(allocated_inos, bl);
- if (!allocated_inos.empty())
- ::encode(inotablev, bl);
+ ::encode(allocated_ino, bl);
+ ::encode(used_preallocated_ino, bl);
+ ::encode(preallocated_inos, bl);
+ ::encode(client_name, bl);
+ ::encode(inotablev, bl);
+ ::encode(sessionmapv, bl);
::encode(truncated_inodes, bl);
::encode(destroyed_inodes, bl);
::encode(client_reqs, bl);
::decode(lump_order, bl);
::decode(lump_map, bl);
::decode(table_tids, bl);
- ::decode(allocated_inos, bl);
- if (!allocated_inos.empty())
- ::decode(inotablev, bl);
+ ::decode(allocated_ino, bl);
+ ::decode(used_preallocated_ino, bl);
+ ::decode(preallocated_inos, bl);
+ ::decode(client_name, bl);
+ ::decode(inotablev, bl);
+ ::decode(sessionmapv, bl);
::decode(truncated_inodes, bl);
::decode(destroyed_inodes, bl);
::decode(client_reqs, bl);
// for replay, in certain cases
LogSegment *_segment;
- EMetaBlob() : last_subtree_map(0), my_offset(0), _segment(0) { }
+ // Default-construct an empty blob with no ino allocation recorded.
+ // Every ino-allocation scalar is zeroed: replay and update_segment branch on
+ // inotablev / sessionmapv / used_preallocated_ino being non-zero, so leaving
+ // any of them uninitialized would make those branches read garbage when
+ // set_ino_alloc() was never called.
+ // NOTE(review): the EMetaBlob(MDLog*) constructor defined in journal.cc
+ // likely needs the same initializers -- confirm.
+ EMetaBlob() : inotablev(0), sessionmapv(0),
+ allocated_ino(0), used_preallocated_ino(0),
+ last_subtree_map(0), my_offset(0), _segment(0) { }
EMetaBlob(MDLog *mdl); // defined in journal.cc
void print(ostream& out) {
void add_table_transaction(int table, version_t tid) {
table_tids.push_back(pair<__u8, version_t>(table, tid));
- }
+ }
// Record the ino-allocation state to journal with this blob.
//   alloc         -> allocated_ino
//   used_prealloc -> used_preallocated_ino
//   prealloc      -> preallocated_inos (copied)
//   client        -> client_name
//   sv            -> sessionmapv
//   iv            -> inotablev
// All six fields are overwritten on every call.
- void add_allocated_ino(inodeno_t ino, version_t tablev) {
- allocated_inos.push_back(ino);
- inotablev = tablev;
+ void set_ino_alloc(inodeno_t alloc,
+ inodeno_t used_prealloc,
+ deque<inodeno_t>& prealloc,
+ entity_name_t client,
+ version_t sv, version_t iv) {
+ allocated_ino = alloc;
+ used_preallocated_ino = used_prealloc;
+ preallocated_inos = prealloc;
+ client_name = client;
+ sessionmapv = sv;
+ inotablev = iv;
}
void add_inode_truncate(inodeno_t ino, uint64_t newsize, uint64_t oldsize) {
out << " " << lump_order.front() << ", " << lump_map.size() << " dirs";
if (!table_tids.empty())
out << " table_tids=" << table_tids;
- if (!allocated_inos.empty())
- out << " inos=" << allocated_inos << " v" << inotablev;
+ if (allocated_ino || preallocated_inos.size()) {
+ if (allocated_ino)
+ out << " alloc_ino=" << allocated_ino;
+ if (preallocated_inos.size())
+ out << " prealloc_ino=" << preallocated_inos;
+ if (used_preallocated_ino)
+ out << " used_prealloc_ino=" << used_preallocated_ino;
+ out << " v" << inotablev;
+ }
out << "]";
}
// -> handled directly by Server.cc, replay()
// alloc table update?
- if (!allocated_inos.empty())
+ if (inotablev)
ls->inotablev = inotablev;
+ if (sessionmapv)
+ ls->sessionmapv = sessionmapv;
// truncated inodes
// -> handled directly by Server.cc
}
// allocated_inos
- if (!allocated_inos.empty()) {
+ if (inotablev) {
if (mds->inotable->get_version() >= inotablev) {
dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
<< " <= table " << mds->inotable->get_version() << dendl;
} else {
- dout(10) << " EMetaBlob.replay inotable " << allocated_inos << " tablev " << inotablev
- << " - 1 == table " << mds->inotable->get_version() << dendl;
- mds->inotable->alloc_ids(allocated_inos);
+ dout(10) << " EMetaBlob.replay inotable v " << inotablev
+ << " - 1 == table " << mds->inotable->get_version()
+ << " allocated+used " << allocated_ino
+ << " prealloc " << preallocated_inos
+ << dendl;
+ if (allocated_ino)
+ mds->inotable->replay_alloc_id(allocated_ino);
+ if (preallocated_inos.size())
+ mds->inotable->replay_alloc_ids(preallocated_inos);
assert(inotablev == mds->inotable->get_version());
}
}
+ // Replay the session-side ino bookkeeping journaled with this blob.
+ // Skipped when no sessionmap version was recorded, or when the in-memory
+ // sessionmap is already at or past that version.
+ if (sessionmapv) {
+ if (mds->sessionmap.version >= sessionmapv) {
+ dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
+ << " <= table " << mds->sessionmap.version << dendl;
+ } else {
+ dout(10) << " EMetaBlob.replay sessionmap v" << sessionmapv
+ << " -1 == table " << mds->sessionmap.version
+ << " prealloc " << preallocated_inos
+ << " used " << used_preallocated_ino
+ << dendl;
+ Session *session = mds->sessionmap.get_session(client_name);
+ assert(session);
+ if (used_preallocated_ino) {
+ // The consumed ino must be the session's next preallocated one.
+ inodeno_t i = session->take_ino();
+ assert(i == used_preallocated_ino);
+ session->used_inos.clear();
+ // Pre-increment so projected ends up equal to the NEW version, matching
+ // InoTable::replay_*; the post-increment form left projected one behind.
+ mds->sessionmap.projected = ++mds->sessionmap.version;
+ }
+ if (preallocated_inos.size()) {
+ session->prealloc_inos.insert(session->prealloc_inos.end(),
+ preallocated_inos.begin(),
+ preallocated_inos.end());
+ mds->sessionmap.projected = ++mds->sessionmap.version;
+ }
+ assert(sessionmapv == mds->sessionmap.version);
+ }
+ }
// truncated inodes
for (list< triple<inodeno_t,uint64_t,uint64_t> >::iterator p = truncated_inodes.begin();
dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
<< " < " << inotablev << " " << (open ? "add":"remove") << dendl;
assert(!open); // for now
- mds->inotable->release_ids(inos);
+ mds->inotable->replay_release_ids(inos);
assert(mds->inotable->get_version() == inotablev);
}
}