From 58227ec7dbd17d34382a1271b1dcf3c48f484232 Mon Sep 17 00:00:00 2001 From: sage Date: Fri, 18 Nov 2005 00:26:50 +0000 Subject: [PATCH] *** empty log message *** git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@508 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/TODO | 14 +- ceph/client/Buffercache.cc | 138 ++++++++-------- ceph/client/Buffercache.h | 98 ++++++------ ceph/client/Client.cc | 268 ++++++++++++++++++++------------ ceph/client/Client.h | 45 ++++-- ceph/config.h | 2 +- ceph/include/types.h | 2 +- ceph/mds/CInode.cc | 4 +- ceph/mds/CInode.h | 20 ++- ceph/mds/Capability.h | 11 ++ ceph/mds/Lock.h | 72 ++++----- ceph/mds/MDCache.cc | 72 +++++---- ceph/mds/MDS.cc | 9 +- ceph/messages/MClientFileCaps.h | 51 ++++-- 14 files changed, 481 insertions(+), 325 deletions(-) diff --git a/ceph/TODO b/ceph/TODO index 05247ff9b6b38..eb0dfcf14167d 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -1,6 +1,16 @@ +client +- some heuristic behavior to consolidate caps to inode auth +- client will re-tx anything it needed to say upon rx of new mds notification + +mds +/- fold imported caps into existing caps +/ - new auth sends notify +/- old auth _drops_ stray Caps msgs + + + +??? - open -> export -> caps update race potential... -- caps from multiple mds's! -- import/export of caps totally busted.. (merging??) diff --git a/ceph/client/Buffercache.cc b/ceph/client/Buffercache.cc index 8a94d66fe1e4c..aa1e72ade4e04 100644 --- a/ceph/client/Buffercache.cc +++ b/ceph/client/Buffercache.cc @@ -19,7 +19,7 @@ Bufferhead::Bufferhead(Inode *inode, Buffercache *bc) : // buffers are allocated later } -Bufferhead::Bufferhead(Inode *inode, long long off, Buffercache *bc) : +Bufferhead::Bufferhead(Inode *inode, off_t off, Buffercache *bc) : ref(0), miss_len(0), dirty_since(0), visited(false) { dout(10) << "bc: new bufferhead ino: " << inode->ino() << " offset: " << off << endl; this->inode = inode; @@ -59,21 +59,21 @@ Bufferhead::~Bufferhead() } } -void Bufferhead::set_offset(long long offset) +void Bufferhead::set_offset(off_t offset) { this->offset = offset; assert(!fc->buffer_map.count(offset)); // fail loudly if offset already exists! fc->insert(offset, this); } -void Bufferhead::alloc_buffers(unsigned long long size) +void Bufferhead::alloc_buffers(off_t size) { dout(10) << "bc: allocating buffers size: " << size << endl; assert(size > 0); while (size > 0) { if (size <= (unsigned)g_conf.client_bcache_alloc_maxsize) { - unsigned long long k = g_conf.client_bcache_alloc_minsize; - unsigned long long asize = size - size % k + (size % k > 0) * k; + off_t k = g_conf.client_bcache_alloc_minsize; + off_t asize = size - size % k + (size % k > 0) * k; buffer *b = new buffer(asize); b->set_length(size); bl.push_back(b); @@ -92,7 +92,7 @@ void Bufferhead::alloc_buffers(unsigned long long size) assert(bl.length() == size); } -void Bufferhead::miss_start(unsigned long long miss_len) +void Bufferhead::miss_start(off_t miss_len) { assert(state == BUFHD_STATE_CLEAN); get(); @@ -195,7 +195,7 @@ void Bufferhead::claim_append(Bufferhead *other) other->bl.clear(); } -void Bufferhead::splice(long long rel_off, unsigned long long length, Bufferhead *claim_by) +void Bufferhead::splice(off_t rel_off, off_t length, Bufferhead *claim_by) { dout(10) << "bc: Bufferhead::splice rel_off: " << rel_off << " length: " << length << " claim_by: " << claim_by << endl; assert(length <= this->length()); @@ -280,7 +280,7 @@ bool Dirtybuffers::exist(Bufferhead* bh) } -void Dirtybuffers::get_expired(time_t ttl, unsigned long long left_dirty, set& to_flush) +void Dirtybuffers::get_expired(time_t ttl, off_t left_dirty, set& to_flush) { dout(6) << "bc: get_expired ttl: " << ttl << " left_dirty: " << left_dirty << endl; if (_dbufs.empty() || left_dirty >= bc->get_dirty_size()) { @@ -288,8 +288,8 @@ void Dirtybuffers::get_expired(time_t ttl, unsigned long long left_dirty, set > > consolidation_map; - unsigned long long cleaned = 0; + map > > consolidation_map; + off_t cleaned = 0; if (cleaned < bc->get_dirty_size() - left_dirty) { time_t now = time(NULL); for (multimap::iterator it = _dbufs.begin(); @@ -307,15 +307,15 @@ void Dirtybuffers::get_expired(time_t ttl, unsigned long long left_dirty, setbuffer_map.empty()); } assert(fc->buffer_map.count(it->second->offset)); - list offlist; - unsigned long long length = fc->consolidation_opp( + list offlist; + off_t length = fc->consolidation_opp( now - ttl, bc->get_dirty_size() - left_dirty - cleaned, it->second->offset, offlist); if (offlist.size() > 1) { offlist.sort(); - long long start_off = offlist.front(); + off_t start_off = offlist.front(); offlist.pop_front(); consolidation_map[it->second->inode][start_off] = offlist; to_flush.insert(fc->buffer_map[start_off]); @@ -336,15 +336,15 @@ void Dirtybuffers::get_expired(time_t ttl, unsigned long long left_dirty, set::iterator, bool> rvalue; - rvalue = buffer_map.insert(pair (offset, bh)); + pair::iterator, bool> rvalue; + rvalue = buffer_map.insert(pair (offset, bh)); // The following is just to get the pieces for the last two assertions - map::iterator next_buf = buffer_map.upper_bound(offset); + map::iterator next_buf = buffer_map.upper_bound(offset); - map::iterator prev_buf = rvalue.first; + map::iterator prev_buf = rvalue.first; if (prev_buf != buffer_map.begin()) { prev_buf--; } else { @@ -358,7 +358,7 @@ void Filecache::insert(long long offset, Bufferhead* bh) (unsigned)prev_buf->first + prev_buf->second->length() <= (unsigned)offset); } -void Filecache::splice(long long offset, unsigned long long size) +void Filecache::splice(off_t offset, off_t size) { // insert Bufferhead at offset with size. only works if all overlapping // buffers are clean. Creates at most two new bufferheads at (offset, size) @@ -371,7 +371,7 @@ void Filecache::splice(long long offset, unsigned long long size) // FIXME: does not work with sparse files #if 0 dout(1) << "bc: before align offset: " << offset << " size: " << size << endl; - unsigned long long align = g_conf.client_bcache_align; + off_t align = g_conf.client_bcache_align; while (inode->inode.layout.stripe_size % align) align >>= 1; offset = offset / align * align; size = (size / align + (size % align > 0)) * align; @@ -379,9 +379,9 @@ void Filecache::splice(long long offset, unsigned long long size) #endif // get current buffer - map::iterator curbuf = get_buf(offset); + map::iterator curbuf = get_buf(offset); assert(curbuf != buffer_map.end()); - unsigned long long orig_len = curbuf->second->length(); + off_t orig_len = curbuf->second->length(); // insert new buffer leaving front part to original buffer if (curbuf->second->state == BUFHD_STATE_CLEAN && curbuf->first < offset ) { @@ -432,9 +432,9 @@ void Filecache::splice(long long offset, unsigned long long size) } -map::iterator Filecache::get_buf(long long off) +map::iterator Filecache::get_buf(off_t off) { - map::iterator curbuf = buffer_map.lower_bound(off); + map::iterator curbuf = buffer_map.lower_bound(off); if (curbuf == buffer_map.end() || curbuf->first > off) { if (curbuf == buffer_map.begin()) { return buffer_map.end(); @@ -451,7 +451,7 @@ map::iterator Filecache::get_buf(long long off) } } -map::iterator Filecache::overlap(unsigned long long len, long long off) +map::iterator Filecache::overlap(off_t len, off_t off) { // returns iterator to buffer overlapping specified extent or end() if no overlap exists dout(7) << "bc: overlap " << len << " " << off << endl; @@ -459,7 +459,7 @@ map::iterator Filecache::overlap(unsigned long long len, if (buffer_map.empty()) return buffer_map.end(); // find first buffer with offset >= off - map::iterator it = buffer_map.lower_bound(off); + map::iterator it = buffer_map.lower_bound(off); // Found buffer with exact offset if (it != buffer_map.end() && it->first == off) { @@ -490,18 +490,18 @@ map::iterator Filecache::overlap(unsigned long long len, return buffer_map.end(); } -map::iterator -Filecache::map_existing(unsigned long long len, - long long start_off, - map& hits, - map& rx, - map& tx, - map& holes) +map::iterator +Filecache::map_existing(off_t len, + off_t start_off, + map& hits, + map& rx, + map& tx, + map& holes) { dout(7) << "bc: map_existing len: " << len << " off: " << start_off << endl; - long long need_off = start_off; - long long actual_off = start_off; - map::iterator existing, rvalue = overlap(len, start_off); + off_t need_off = start_off; + off_t actual_off = start_off; + map::iterator existing, rvalue = overlap(len, start_off); for (existing = rvalue; existing != buffer_map.end() && (unsigned)existing->first < (unsigned)start_off + len; existing++) { @@ -511,7 +511,7 @@ Filecache::map_existing(unsigned long long len, if (actual_off > need_off) { assert(buffer_map.count(need_off) == 0); - holes[need_off] = (unsigned long long) (actual_off - need_off); + holes[need_off] = actual_off - need_off; dout(6) << "bc: map: hole " << need_off << " " << holes[need_off] << endl; need_off = actual_off; } @@ -542,27 +542,27 @@ Filecache::map_existing(unsigned long long len, // no buffers or no buffers at tail if ((unsigned)need_off < (unsigned)start_off + len) { - holes[need_off] = (unsigned long long) (start_off + len - need_off); + holes[need_off] = start_off + len - need_off; dout(6) << "bc: map: last hole " << need_off << " " << holes[need_off] << endl; assert(buffer_map.count(need_off) == 0); } return rvalue; } -unsigned long long -Filecache::consolidation_opp(time_t max_dirty_since, unsigned long long clean_goal, - long long offset, list& offlist) +off_t +Filecache::consolidation_opp(time_t max_dirty_since, off_t clean_goal, + off_t offset, list& offlist) { dout(6) << "bc: consolidation_opp max_dirty_since: " << max_dirty_since << " clean_goal: " << clean_goal << " offset: " << offset << endl; - unsigned long long length = 0; - map::iterator cur, orig = buffer_map.find(offset); + off_t length = 0; + map::iterator cur, orig = buffer_map.find(offset); assert(orig != buffer_map.end()); length += orig->second->length(); offlist.push_back(offset); // search left cur = orig; - long long need_off = offset; + off_t need_off = offset; while (cur != buffer_map.begin()) { cur--; if (cur->second->state != BUFHD_STATE_DIRTY || @@ -597,17 +597,17 @@ Filecache::consolidation_opp(time_t max_dirty_since, unsigned long long clean_go void Filecache::get_dirty(set& to_flush) { dout(6) << "bc: fc.get_dirty" << endl; - map > > consolidation_map; + map > > consolidation_map; for (set::iterator it = dirty_buffers.begin(); it != dirty_buffers.end(); it++) { if (!(*it)->visited) { (*it)->visited = true; - list offlist; + list offlist; consolidation_opp( time(NULL), bc->get_dirty_size(), (*it)->offset, offlist); if (offlist.size() > 1) { offlist.sort(); - long long start_off = offlist.front(); + off_t start_off = offlist.front(); consolidation_map[inode][start_off] = offlist; to_flush.insert(buffer_map[start_off]); } else { @@ -626,7 +626,7 @@ void Filecache::simplify() { dout(7) << "bc: simplify" << endl; list removed; - map::iterator start, next; + map::iterator start, next; start = buffer_map.begin(); next = buffer_map.begin(); int count = 0; @@ -669,14 +669,14 @@ void Filecache::simplify() } #endif -int Filecache::copy_out(unsigned long long size, long long offset, char *dst) +int Filecache::copy_out(off_t size, off_t offset, char *dst) { dout(7) << "bc: copy_out size: " << size << " offset: " << offset << endl; assert(offset >= 0); //assert(offset + size <= length()); doesn't hold after trim_bcache int rvalue = size; - map::iterator curbuf = buffer_map.lower_bound(offset); + map::iterator curbuf = buffer_map.lower_bound(offset); if (curbuf == buffer_map.end() || curbuf->first > offset) { if (curbuf == buffer_map.begin()) { return -1; @@ -717,14 +717,14 @@ int Filecache::copy_out(unsigned long long size, long long offset, char *dst) // -- Buffercache methods -void Buffercache::dirty(Inode *inode, unsigned long long size, long long offset, const char *src) +void Buffercache::dirty(Inode *inode, off_t size, off_t offset, const char *src) { dout(6) << "bc: dirty ino: " << inode->ino() << " size: " << size << " offset: " << offset << endl; assert(bcache_map.count(inode->ino())); // filecache has to be already allocated!! Filecache *fc = get_fc(inode); assert(offset >= 0); - map::iterator curbuf = fc->get_buf(offset); + map::iterator curbuf = fc->get_buf(offset); assert(curbuf != fc->buffer_map.end()); if (curbuf->second->state == BUFHD_STATE_CLEAN) { @@ -753,17 +753,17 @@ void Buffercache::dirty(Inode *inode, unsigned long long size, long long offset, } } -unsigned long long Buffercache::touch_continuous(map& hits, unsigned long long size, long long offset) +off_t Buffercache::touch_continuous(map& hits, off_t size, off_t offset) { dout(7) << "bc: touch_continuous size: " << size << " offset: " << offset << endl; if (hits.empty()) return 0; - long long next_off = offset; + off_t next_off = offset; if (hits.begin()->first > offset || (unsigned)hits.begin()->first + hits.begin()->second->length() <= (unsigned)offset) { return 0; } - for (map::iterator curbuf = hits.begin(); + for (map::iterator curbuf = hits.begin(); curbuf != hits.end(); curbuf++) { if (curbuf == hits.begin()) { @@ -774,21 +774,21 @@ unsigned long long Buffercache::touch_continuous(map& hi lru.lru_touch(curbuf->second); next_off += curbuf->second->length(); } - return (unsigned long long)(next_off - offset) >= size ? size : (next_off - offset); + return (next_off - offset) >= size ? size : (next_off - offset); } -void Buffercache::map_or_alloc(Inode *inode, unsigned long long size, long long offset, - map& buffers, - map& rx, - map& tx) +void Buffercache::map_or_alloc(Inode *inode, off_t size, off_t offset, + map& buffers, + map& rx, + map& tx) { dout(7) << "bc: map_or_alloc len: " << size << " off: " << offset << endl; Filecache *fc = get_fc(inode); - map holes; + map holes; holes.clear(); fc->map_existing(size, offset, buffers, rx, tx, holes); // stuff buffers into holes - for (map::iterator hole = holes.begin(); + for (map::iterator hole = holes.begin(); hole != holes.end(); hole++) { Bufferhead *bh; @@ -807,19 +807,19 @@ void Buffercache::map_or_alloc(Inode *inode, unsigned long long size, long long } } -void Buffercache::consolidate(map > > cons_map) +void Buffercache::consolidate(map > > cons_map) { dout(6) << "bc: consolidate" << endl; int deleted = 0; - for (map > >::iterator it_ino = cons_map.begin(); + for (map > >::iterator it_ino = cons_map.begin(); it_ino != cons_map.end(); it_ino++) { Filecache *fc = get_fc(it_ino->first); - for (map >::iterator it_off = it_ino->second.begin(); + for (map >::iterator it_off = it_ino->second.begin(); it_off != it_ino->second.end(); it_off++) { Bufferhead *first_bh = fc->buffer_map[it_off->first]; - for (list::iterator it_list = it_off->second.begin(); + for (list::iterator it_list = it_off->second.begin(); it_list != it_off->second.end(); it_list++) { Bufferhead *bh = fc->buffer_map[*it_list]; @@ -837,7 +837,7 @@ void Buffercache::consolidate(map > > con dout(6) << "bc: consolidate: deleted: " << deleted << endl; } -void Buffercache::get_reclaimable(unsigned long long min_size, list& reclaimed) +void Buffercache::get_reclaimable(off_t min_size, list& reclaimed) { while (min_size > 0) { if (Bufferhead *bh = (Bufferhead*)lru.lru_expire()) { @@ -850,10 +850,10 @@ void Buffercache::get_reclaimable(unsigned long long min_size, list } -unsigned long long Buffercache::reclaim(unsigned long long min_size) +off_t Buffercache::reclaim(off_t min_size) { dout(7) << "bc: reclaim min_size: " << min_size << endl; - unsigned long long freed_size = 0; + off_t freed_size = 0; while (freed_size < min_size) { Bufferhead *bh = (Bufferhead*)lru.lru_expire(); if (!bh) { diff --git a/ceph/client/Buffercache.h b/ceph/client/Buffercache.h index 8c29d6271cf69..c32e577ef1a33 100644 --- a/ceph/client/Buffercache.h +++ b/ceph/client/Buffercache.h @@ -44,8 +44,8 @@ class Bufferhead : public LRUObject { return --ref; } - long long offset; - unsigned long long miss_len; // only valid during misses + off_t offset; + off_t miss_len; // only valid during misses class Inode *inode; time_t dirty_since; int state; @@ -60,20 +60,20 @@ class Bufferhead : public LRUObject { // cons/destructors Bufferhead(class Inode *inode, Buffercache *bc); - Bufferhead(class Inode *inode, long long off, Buffercache *bc); + Bufferhead(class Inode *inode, off_t off, Buffercache *bc); ~Bufferhead(); - //Bufferhead(inodeno_t ino, long long off, unsigned long long len, int state); + //Bufferhead(inodeno_t ino, off_t off, off_t len, int state); // ~Bufferhead(); FIXME: need to mesh with allocator scheme - void set_offset(long long offset); + void set_offset(off_t offset); - unsigned long long length() { + off_t length() { if (is_hole() || state == BUFHD_STATE_RX) return miss_len; return bl.length(); } - void alloc_buffers(unsigned long long size); + void alloc_buffers(off_t size); /** wait_for_(read|write) * put Cond on local stack, block until woken up. @@ -113,14 +113,14 @@ class Bufferhead : public LRUObject { write_waiters.clear(); } - void miss_start(unsigned long long miss_len); + void miss_start(off_t miss_len); void miss_finish(); void dirty(); void dirtybuffers_erase(); void flush_start(); void flush_finish(); void claim_append(Bufferhead* other); - void splice(long long offset, unsigned long long length, Bufferhead *claim_by); + void splice(off_t offset, off_t length, Bufferhead *claim_by); friend ostream& operator<<(ostream& out, Bufferhead& bh); }; @@ -160,7 +160,7 @@ class Dirtybuffers { void insert(Bufferhead* bh); bool empty() { assert(_revind.empty() == _dbufs.empty()); return _dbufs.empty(); } bool exist(Bufferhead* bh); - void get_expired(time_t ttl, unsigned long long left_dirty, set& to_flush); + void get_expired(time_t ttl, off_t left_dirty, set& to_flush); time_t get_age() { time_t age; if (_dbufs.empty()) { @@ -192,7 +192,7 @@ class Filecache { public: class Inode *inode; - map buffer_map; + map buffer_map; set dirty_buffers; set inflight_buffers; Buffercache *bc; @@ -207,9 +207,9 @@ class Filecache { ~Filecache() { dout(6) << "bc: delete fc of ino: " << inode->ino() << endl; - map to_delete = buffer_map; + map to_delete = buffer_map; buffer_map.clear(); - for (map::iterator it = to_delete.begin(); + for (map::iterator it = to_delete.begin(); it != to_delete.end(); it++) { delete it->second; @@ -217,9 +217,9 @@ class Filecache { } #if 0 - unsigned long long length() { - unsigned long long len = 0; - for (map::iterator it = buffer_map.begin(); + off_t length() { + off_t len = 0; + for (map::iterator it = buffer_map.begin(); it != buffer_map.end(); it++) { len += it->second->bl.length(); @@ -228,9 +228,9 @@ class Filecache { } #endif - void insert(long long offset, Bufferhead* bh); + void insert(off_t offset, Bufferhead* bh); - void splice(long long offset, unsigned long long size); + void splice(off_t offset, off_t size); void wait_for_inflight(Mutex *lock) { Cond cond; @@ -247,22 +247,22 @@ class Filecache { inflight_waiters.clear(); } - map::iterator get_buf(long long off); - map::iterator overlap(unsigned long long len, long long off); - int copy_out(unsigned long long size, long long offset, char *dst); - map::iterator map_existing(unsigned long long len, long long start_off, - map& hits, - map& rx, - map& tx, - map& holes); - unsigned long long consolidation_opp(time_t ttl, unsigned long long clean_goal, - long long offset, list& offlist); + map::iterator get_buf(off_t off); + map::iterator overlap(off_t len, off_t off); + int copy_out(off_t size, off_t offset, char *dst); + map::iterator map_existing(off_t len, off_t start_off, + map& hits, + map& rx, + map& tx, + map& holes); + off_t consolidation_opp(time_t ttl, off_t clean_goal, + off_t offset, list& offlist); void get_dirty(set& to_flush); }; class Buffercache { private: - unsigned long long dirty_size, rx_size, tx_size, clean_size; + off_t dirty_size, rx_size, tx_size, clean_size; list inflight_waiters; public: @@ -309,50 +309,50 @@ class Buffercache { inflight_waiters.clear(); } - void clean_to_dirty(unsigned long long size) { + void clean_to_dirty(off_t size) { clean_size -= size; assert(clean_size >= 0); dirty_size += size; } - void dirty_to_tx(unsigned long long size) { + void dirty_to_tx(off_t size) { dirty_size -= size; assert(dirty_size >= 0); tx_size += size; } - void tx_to_dirty(unsigned long long size) { + void tx_to_dirty(off_t size) { tx_size -= size; assert(tx_size >= 0); dirty_size += size; } - void tx_to_clean(unsigned long long size) { + void tx_to_clean(off_t size) { tx_size -= size; assert(tx_size >= 0); clean_size += size; } - void increase_size(unsigned long long size) { + void increase_size(off_t size) { clean_size += size; } - void decrease_size(unsigned long long size) { + void decrease_size(off_t size) { clean_size -= size; assert(clean_size >= 0); } - unsigned long long get_clean_size() { return clean_size; } - unsigned long long get_dirty_size() { return dirty_size; } - unsigned long long get_rx_size() { return rx_size; } - unsigned long long get_tx_size() { return tx_size; } - unsigned long long get_total_size() { return clean_size + dirty_size + rx_size + tx_size; } - void get_reclaimable(unsigned long long min_size, list&); + off_t get_clean_size() { return clean_size; } + off_t get_dirty_size() { return dirty_size; } + off_t get_rx_size() { return rx_size; } + off_t get_tx_size() { return tx_size; } + off_t get_total_size() { return clean_size + dirty_size + rx_size + tx_size; } + void get_reclaimable(off_t min_size, list&); void insert(Bufferhead *bh); - void dirty(Inode *inode, unsigned long long size, long long offset, const char *src); - unsigned long long touch_continuous(map& hits, unsigned long long size, long long offset); - void map_or_alloc(class Inode *inode, unsigned long long len, long long off, - map& buffers, - map& rx, - map& tx); - void consolidate(map > > cons_map); + void dirty(Inode *inode, off_t size, off_t offset, const char *src); + off_t touch_continuous(map& hits, off_t size, off_t offset); + void map_or_alloc(class Inode *inode, off_t len, off_t off, + map& buffers, + map& rx, + map& tx); + void consolidate(map > > cons_map); void release_file(inodeno_t ino); - unsigned long long reclaim(unsigned long long min_size); + off_t reclaim(off_t min_size); }; diff --git a/ceph/client/Client.cc b/ceph/client/Client.cc index cf88537c8c80d..255429936bad6 100644 --- a/ceph/client/Client.cc +++ b/ceph/client/Client.cc @@ -242,7 +242,7 @@ Inode* Client::insert_inode_info(Dir *dir, c_inode_info *in_info) dn->inode->inode = in_info->inode; // or do we have newer size/mtime from writing? - if (dn->inode->file_caps & CAP_FILE_WR) { + if (dn->inode->file_caps() & CAP_FILE_WR) { if (dn->inode->file_wr_size > dn->inode->inode.size) dn->inode->inode.size = dn->inode->file_wr_size; if (dn->inode->file_wr_mtime > dn->inode->inode.mtime) @@ -530,7 +530,7 @@ public: } }; -void Client::flush_buffers(int ttl, unsigned long long dirty_size) +void Client::flush_buffers(int ttl, off_t dirty_size) { // ttl = 0 or dirty_size = 0: flush all if (!bc->dirty_buffers->empty()) { @@ -562,19 +562,19 @@ void Client::trim_bcache() if (bc->get_total_size() > g_conf.client_bcache_size) { // need to free buffers if (bc->get_dirty_size() > - (unsigned long long)g_conf.client_bcache_hiwater * - (unsigned long long)g_conf.client_bcache_size / 100ULL) { + g_conf.client_bcache_hiwater * + g_conf.client_bcache_size / 100LL) { // flush buffers until we have low water mark - unsigned long long want_target_size = - (unsigned long long)g_conf.client_bcache_lowater * - (unsigned long long)g_conf.client_bcache_size / 100ULL; + off_t want_target_size = + g_conf.client_bcache_lowater * + g_conf.client_bcache_size / 100LL; dout(3) << "bc: flush_buffers started" << endl; flush_buffers(g_conf.client_bcache_ttl, want_target_size); } // Now reclaim buffers - unsigned long long reclaim_size = bc->get_total_size() - - (unsigned long long)g_conf.client_bcache_size * - (unsigned long long)g_conf.client_bcache_hiwater / 100ULL; + off_t reclaim_size = bc->get_total_size() - + g_conf.client_bcache_size * + g_conf.client_bcache_hiwater / 100LL; dout(6) << "bc: trim_bcache: reclaim: " << reclaim_size << endl; while (reclaim_size > 0 && bc->reclaim(reclaim_size) == 0) { // cannot reclaim any buffers: wait for inflight buffers @@ -600,8 +600,8 @@ void Client::release_inode_buffers(Inode *in) inodeno_t ino = in->ino(); if (fc->buffer_map.empty()) return; - map to_release = fc->buffer_map; - for (map::iterator it = to_release.begin(); + map to_release = fc->buffer_map; + for (map::iterator it = to_release.begin(); it != to_release.end(); it++) { Bufferhead *bh = it->second; @@ -624,19 +624,82 @@ void Client::release_inode_buffers(Inode *in) } + +/**** + * caps + */ + + void Client::handle_file_caps(MClientFileCaps *m) { - if (inode_map.count(m->get_ino()) == 0) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " " << cap_string(m->get_caps()) << ", which we don't have, releasing." << endl; - m->set_caps(0); - m->set_wanted(0); - messenger->send_message(m, m->get_source(), m->get_source_port()); - return; + int mds = MSG_ADDR_NUM(m->get_source()); + Inode *in = 0; + if (inode_map.count(m->get_ino())) in = inode_map[ m->get_ino() ]; + + // reap? + if (m->get_special() == MClientFileCaps::FILECAP_REAP) { + int other = m->get_mds(); + + if (in && in->stale_caps.count(other)) { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " reap on mds" << other << endl; + + // fresh from new mds? + if (!in->caps.count(mds)) { + if (in->caps.empty()) in->get(); + in->caps[mds].seq = m->get_seq(); + in->caps[mds].caps = m->get_caps(); + } + + in->stale_caps.erase(other); + if (in->stale_caps.empty()) put_inode(in); // note: this will never delete *in + + // fall-thru! + } else { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " premature (!!) reap on mds" << other << endl; + // delay! + cap_reap_queue[in->ino()][other] = m; + return; + } } - Inode *in = inode_map[ m->get_ino() ]; assert(in); + + // stale? + if (m->get_special() == MClientFileCaps::FILECAP_STALE) { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " now stale" << endl; + + // put in stale list + assert(in->caps.count(mds)); + if (in->stale_caps.empty()) in->get(); + in->stale_caps[mds] = in->caps[mds]; + in->caps.erase(mds); + + // delayed reap? + if (cap_reap_queue.count(in->ino()) && + cap_reap_queue[in->ino()].count(mds)) { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " delayed reap on mds" << m->get_mds() << endl; + + // process delayed reap + handle_file_caps( cap_reap_queue[in->ino()][mds] ); + + cap_reap_queue[in->ino()].erase(mds); + if (cap_reap_queue[in->ino()].empty()) + cap_reap_queue.erase(in->ino()); + } + return; + } + + // release? + if (m->get_special() == MClientFileCaps::FILECAP_RELEASE) { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << endl; + assert(in->caps.count(mds)); + in->caps.erase(mds); + if (in->caps.empty()) put_inode(in); + return; + } + + // don't want? if (in->file_caps_wanted() == 0) { dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " " << cap_string(m->get_caps()) << ", which we don't want caps for, releasing." << endl; m->set_caps(0); @@ -644,46 +707,47 @@ void Client::handle_file_caps(MClientFileCaps *m) messenger->send_message(m, m->get_source(), m->get_source_port()); return; } + + /* if (m->get_seq() <= in->file_caps_seq) { + assert(0); // no ooo support yet dout(5) << "handle_file_caps on ino " << m->get_ino() << " old seq " << m->get_seq() << " <= " << in->file_caps_seq << endl; delete m; return; } + */ - // new mds auth? - if (m->get_mds() >= 0) { - in->file_mds = m->get_mds(); - dout(5) << "handle_file_caps on ino " << m->get_ino() << " mds now " << in->file_mds << endl; - } - + assert(in->caps.count(mds)); - int old_caps = in->file_caps; - in->file_caps = m->get_caps(); - in->file_caps_seq = m->get_seq(); - dout(5) << "handle_file_caps on in " << m->get_ino() << " seq " << m->get_seq() << " caps now " << cap_string(in->file_caps) << " was " << cap_string(old_caps) << endl; + // update caps + const int old_caps = in->caps[mds].caps; + const int new_caps = m->get_caps(); + in->caps[mds].caps = new_caps; + in->caps[mds].seq = m->get_seq(); + dout(5) << "handle_file_caps on in " << m->get_ino() << " seq " << m->get_seq() << " caps now " << cap_string(new_caps) << " was " << cap_string(old_caps) << endl; // update inode in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! - + // flush buffers? - if (in->file_caps & CAP_FILE_WRBUFFER == 0) { + if (in->file_caps() & CAP_FILE_WRBUFFER == 0) { flush_inode_buffers(in); Filecache *fc = bc->get_fc(in); fc->wait_for_inflight(client_lock); // FIXME: this isn't actually allowed to block is it?!? } // release buffers? - if (in->file_caps & CAP_FILE_RDCACHE == 0) + if (new_caps & CAP_FILE_RDCACHE == 0) release_inode_buffers(in); // ack? - if (old_caps & ~in->file_caps) { - dout(5) << " we lost caps " << cap_string(old_caps & ~in->file_caps) << ", acking" << endl; + if (old_caps & ~new_caps) { + dout(5) << " we lost caps " << cap_string(old_caps & ~new_caps) << ", acking" << endl; messenger->send_message(m, m->get_source(), m->get_source_port()); } // wake up waiters? - if (in->file_caps & CAP_FILE_RD) { + if (new_caps & CAP_FILE_RD) { for (list::iterator it = in->waitfor_read.begin(); it != in->waitfor_read.end(); it++) { @@ -692,7 +756,7 @@ void Client::handle_file_caps(MClientFileCaps *m) } in->waitfor_read.clear(); } - if (in->file_caps & CAP_FILE_WR) { + if (new_caps & CAP_FILE_WR) { for (list::iterator it = in->waitfor_write.begin(); it != in->waitfor_write.end(); it++) { @@ -710,32 +774,30 @@ void Client::release_caps(Inode *in, int retain) { dout(5) << "releasing caps on ino " << in->inode.ino - << " had " << cap_string(in->file_caps) + << " had " << cap_string(in->file_caps()) << " retaining " << cap_string(retain) << endl; - in->file_caps = retain; - - // release - MClientFileCaps *m = new MClientFileCaps(in->inode, - in->file_caps_seq, - in->file_caps, - in->file_caps_wanted(), - whoami); - messenger->send_message(m, - MSG_ADDR_MDS(in->file_mds), MDS_PORT_CACHE); + for (map::iterator it = in->caps.begin(); + it != in->caps.end(); + it++) { + //if (it->second.caps & ~retain) { + if (1) { + // release (some of?) these caps + it->second.caps = retain & it->second.caps; + // note: tell mds _full_ wanted; it'll filter/behave based on what it is allowed to do + MClientFileCaps *m = new MClientFileCaps(in->inode, + it->second.seq, + it->second.caps, + in->file_caps_wanted()); + messenger->send_message(m, MSG_ADDR_MDS(it->first), MDS_PORT_CACHE); + } + } - if ((in->file_caps & CAP_FILE_WR) == 0) { + if ((in->file_caps() & CAP_FILE_WR) == 0) { in->file_wr_mtime = 0; in->file_wr_size = 0; } - - // release caps completely? - if (in->file_caps == 0) { - in->file_caps_seq = 0; - in->file_mds = 0; - put_inode(in); - } } void Client::update_caps_wanted(Inode *in) @@ -744,13 +806,17 @@ void Client::update_caps_wanted(Inode *in) << " to " << cap_string(in->file_caps_wanted()) << endl; - MClientFileCaps *m = new MClientFileCaps(in->inode, - in->file_caps_seq, - in->file_caps, - in->file_caps_wanted(), - whoami); - messenger->send_message(m, - MSG_ADDR_MDS(in->file_mds), MDS_PORT_CACHE); + // FIXME: pick a single mds and let the others off the hook.. + for (map::iterator it = in->caps.begin(); + it != in->caps.end(); + it++) { + MClientFileCaps *m = new MClientFileCaps(in->inode, + it->second.seq, + it->second.caps, + in->file_caps_wanted()); + messenger->send_message(m, + MSG_ADDR_MDS(it->first), MDS_PORT_CACHE); + } } @@ -1033,7 +1099,7 @@ int Client::symlink(const char *target, const char *link) return res; } -int Client::readlink(const char *path, char *buf, unsigned long long size) +int Client::readlink(const char *path, char *buf, off_t size) { client_lock->Lock(); dout(3) << "op: client->readlink(\"" << path << "\", readlinkbuf, readlinkbuf_len);" << endl; @@ -1119,8 +1185,8 @@ int Client::lstat(const char *path, struct stat *stbuf) stbuf->st_ctime = inode.ctime; stbuf->st_atime = inode.atime; stbuf->st_mtime = inode.mtime; - stbuf->st_size = (long long) inode.size; //FIXME long long is signed 64 vs size is unsigned 64 - stbuf->st_blocks = (inode.size - 1) / 1024 + 1; + stbuf->st_size = inode.size; + stbuf->st_blocks = inode.size ? ((inode.size - 1) / 1024 + 1):0; stbuf->st_blksize = 1024; //stbuf->st_flags = //stbuf->st_gen = @@ -1384,7 +1450,6 @@ int Client::open(const char *path, int mode) f->inode = inode_map[trace[trace.size()-1]->inode.ino]; assert(f->inode); f->inode->get(); - f->inode->file_mds = MSG_ADDR_NUM(reply->get_source()); if (cmode & FILE_MODE_R) f->inode->num_rd++; @@ -1392,24 +1457,25 @@ int Client::open(const char *path, int mode) f->inode->num_wr++; // caps included? - assert(reply->get_file_caps_seq() >= f->inode->file_caps_seq); - if (reply->get_file_caps_seq() > f->inode->file_caps_seq) { + int mds = MSG_ADDR_NUM(reply->get_source()); + + if (f->inode->caps.empty()) // first caps? + f->inode->get(); + + assert(reply->get_file_caps_seq() >= f->inode->caps[mds].seq); + if (reply->get_file_caps_seq() > f->inode->caps[mds].seq) { dout(7) << "open got caps " << cap_string(reply->get_file_caps()) << " seq " << reply->get_file_caps_seq() << endl; - // first ones? - if (f->inode->file_caps_seq == 0) - f->inode->get(); - - int old_caps = f->inode->file_caps; - f->inode->file_caps = reply->get_file_caps(); - f->inode->file_caps_seq = reply->get_file_caps_seq(); + int old_caps = f->inode->caps[mds].caps; + f->inode->caps[mds].caps = reply->get_file_caps(); + f->inode->caps[mds].seq = reply->get_file_caps_seq(); // ack if we lost any caps - if (old_caps & ~f->inode->file_caps) { - dout(5) << " we lost caps " << cap_string(old_caps & ~f->inode->file_caps) << ", acking" << endl; + if (old_caps & ~f->inode->caps[mds].caps) { + dout(5) << " we lost caps " << cap_string(old_caps & ~f->inode->caps[mds].caps) << ", acking" << endl; messenger->send_message(new MClientFileCaps(f->inode->inode, - f->inode->file_caps_seq, - f->inode->file_caps, + f->inode->caps[mds].seq, + f->inode->caps[mds].caps, f->inode->file_caps_wanted(), whoami), reply->get_source(), reply->get_source_port()); @@ -1421,7 +1487,7 @@ int Client::open(const char *path, int mode) assert(fh_map.count(fh) == 0); fh_map[fh] = f; - dout(3) << "open success, fh is " << fh << " caps " << f->inode->file_caps << endl;//f->caps << " fh size " << f->size << endl; + dout(3) << "open success, fh is " << fh << " combined caps " << cap_string(f->inode->file_caps()) << endl; } delete reply; @@ -1539,7 +1605,7 @@ public: }; -int Client::read(fh_t fh, char *buf, unsigned long long size, long long offset) +int Client::read(fh_t fh, char *buf, off_t size, off_t offset) { client_lock->Lock(); @@ -1555,7 +1621,7 @@ int Client::read(fh_t fh, char *buf, unsigned long long size, long long offset) Inode *in = f->inode; // do we have read file cap? - while (in->file_caps & CAP_FILE_RD == 0) { + while ((in->file_caps() & CAP_FILE_RD) == 0) { dout(7) << " don't have read cap, waiting" << endl; Cond cond; in->waitfor_read.push_back(&cond); @@ -1565,11 +1631,11 @@ int Client::read(fh_t fh, char *buf, unsigned long long size, long long offset) // determine whether read range overlaps with file // ...ONLY if we're doing async io - if (in->file_caps & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE)) { + if (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE)) { // we're doing buffered i/o. make sure we're inside the file. // we can trust size info bc we get accurate info when buffering/caching caps are issued. dout(10) << "file size: " << in->inode.size << endl; - if (offset > 0 && (unsigned long long)offset >= in->inode.size) { + if (offset > 0 && offset >= in->inode.size) { client_lock->Unlock(); return 0; } @@ -1603,10 +1669,10 @@ int Client::read(fh_t fh, char *buf, unsigned long long size, long long offset) // buffer cache ON // map buffercache - map hits, rx, tx, hits_tx; - map::iterator it; - map holes; - map::iterator hole; + map hits, rx, tx, hits_tx; + map::iterator it; + map holes; + map::iterator hole; Filecache *fc = bc->get_fc(in); hits.clear(); rx.clear(); tx.clear(); holes.clear(); @@ -1618,18 +1684,18 @@ int Client::read(fh_t fh, char *buf, unsigned long long size, long long offset) if ((rvalue = (int)bc->touch_continuous(hits_tx, size, offset)) > 0) { // sweet -- we can return stuff immediately dout(6) << "read bc hit on clean, dirty, or tx buffer, rvalue: " << rvalue << endl; - rvalue = fc->copy_out((unsigned long long)rvalue, offset, buf); + rvalue = fc->copy_out(rvalue, offset, buf); dout(6) << "read bc hit: immediately returning " << rvalue << " bytes" << endl; assert(rvalue > 0); } - assert(!(rvalue >= 0 && (unsigned long long)rvalue == size) || holes.empty()); + assert(!(rvalue >= 0 && rvalue == size) || holes.empty()); // issue reads for holes int hole_rvalue = 0; //FIXME: don't really need to track rvalue in MissFinish context for (hole = holes.begin(); hole != holes.end(); hole++) { dout(6) << "read bc miss" << endl; - long long hole_offset = hole->first; - unsigned long long hole_size = hole->second; + off_t hole_offset = hole->first; + off_t hole_size = hole->second; // either get "hole" bufferhead or insert new bufferhead without // allocated buffers (Filer::handle_osd_read_reply allocates them) @@ -1653,7 +1719,7 @@ int Client::read(fh_t fh, char *buf, unsigned long long size, long long offset) if (rvalue == 0) { // we need to wait for the first buffer dout(7) << "read bc miss: waiting for first buffer" << endl; - map::iterator it = fc->get_buf(offset); + map::iterator it = fc->get_buf(offset); assert(it != fc->buffer_map.end()); Bufferhead *bh = it->second; #if 0 @@ -1713,7 +1779,7 @@ public: }; -int Client::write(fh_t fh, const char *buf, unsigned long long size, long long offset) +int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) { client_lock->Lock(); @@ -1733,7 +1799,7 @@ int Client::write(fh_t fh, const char *buf, unsigned long long size, long long o // do we have write file cap? - while (in->file_caps & CAP_FILE_WR == 0) { + while ((in->file_caps() & CAP_FILE_WR) == 0) { dout(7) << " don't have write cap, waiting" << endl; Cond cond; in->waitfor_write.push_back(&cond); @@ -1742,12 +1808,12 @@ int Client::write(fh_t fh, const char *buf, unsigned long long size, long long o if (g_conf.client_bcache && // buffer cache ON? - in->file_caps & CAP_FILE_WRBUFFER) { // caps buffered write? + (in->file_caps() & CAP_FILE_WRBUFFER)) { // caps buffered write? // buffered write dout(7) << "buffered/async write" << endl; // map buffercache for writing - map buffers, rx, tx; + map buffers, rx, tx; buffers.clear(); rx.clear(); tx.clear(); bc->map_or_alloc(in, size, offset, buffers, rx, tx); @@ -1809,10 +1875,10 @@ int Client::write(fh_t fh, const char *buf, unsigned long long size, long long o // assume success for now. FIXME. - unsigned long long totalwritten = size; + off_t totalwritten = size; // extend file? - if (totalwritten + (unsigned long long)offset > in->inode.size) { + if (totalwritten + offset > in->inode.size) { in->inode.size = in->file_wr_size = totalwritten + offset; dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << endl; } else { @@ -1828,7 +1894,7 @@ int Client::write(fh_t fh, const char *buf, unsigned long long size, long long o } -int Client::truncate(const char *file, unsigned long long size) +int Client::truncate(const char *file, off_t size) { client_lock->Lock(); dout(3) << "op: client->truncate(\"" << file << "\", " << size << ");" << endl; @@ -1879,7 +1945,7 @@ int Client::fsync(fh_t fh, bool syncdataonly) fc->wait_for_inflight(client_lock); if (syncdataonly && - (in->file_caps & CAP_FILE_WR)) { + (in->file_caps() & CAP_FILE_WR)) { // flush metadata too.. size, mtime // ... } diff --git a/ceph/client/Client.h b/ceph/client/Client.h index 632a220c6bd51..518b0722dad5a 100644 --- a/ceph/client/Client.h +++ b/ceph/client/Client.h @@ -88,6 +88,13 @@ class Dir { }; +class InodeCap { + public: + int caps; + long seq; + InodeCap() : caps(0), seq(0) {} +}; + class Inode { public: @@ -96,11 +103,12 @@ class Inode { set mds_contacts; time_t last_updated; - int file_caps; - long file_caps_seq; - int file_mds; // semi-hack + // per-mds caps + map caps; // mds -> InodeCap + map stale_caps; // mds -> cap .. stale + time_t file_wr_mtime; // [writers] time of last write - unsigned long long file_wr_size; // [writers] largest offset we've written to + off_t file_wr_size; // [writers] largest offset we've written to int num_rd, num_wr; // num readers, writers int ref; // ref count. 1 for each dentry, fh that links to me. @@ -117,7 +125,7 @@ class Inode { void put() { ref--; assert(ref >= 0); } Inode() : mds_dir_auth(-1), last_updated(0), - file_caps(0), file_caps_seq(0), file_mds(0), file_wr_mtime(0), file_wr_size(0), num_rd(0), num_wr(0), + file_wr_mtime(0), file_wr_size(0), num_rd(0), num_wr(0), ref(0), dir(0), dn(0), symlink(0) { } ~Inode() { if (symlink) { delete symlink; symlink = 0; } @@ -129,6 +137,19 @@ class Inode { return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } + int file_caps() { + int c = 0; + for (map::iterator it = caps.begin(); + it != caps.end(); + it++) + c |= it->second.caps; + for (map::iterator it = stale_caps.begin(); + it != stale_caps.end(); + it++) + c |= it->second.caps; + return c; + } + int file_caps_wanted() { int w = 0; if (num_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE; @@ -207,6 +228,10 @@ class Client : public Dispatcher { Inode* root; LRU lru; // lru list of Dentry's in our local metadata cache. + // cap weirdness + map > cap_reap_queue; // ino -> mds -> msg .. set of (would-be) stale caps to reap + + // file handles rangeset free_fh_set; // unused fh's hash_map fh_map; @@ -331,7 +356,7 @@ class Client : public Dispatcher { // buffer cache class Buffercache *bc; - void flush_buffers(int ttl, unsigned long long dirty_size); // flush dirty buffers + void flush_buffers(int ttl, off_t dirty_size); // flush dirty buffers void trim_bcache(); void flush_inode_buffers(Inode *in); // flush buffered writes void release_inode_buffers(Inode *in); // release cached reads @@ -380,7 +405,7 @@ class Client : public Dispatcher { int rmdir(const char *path); // symlinks - int readlink(const char *path, char *buf, unsigned long long size); + int readlink(const char *path, char *buf, off_t size); int symlink(const char *existing, const char *newname); // inode stuff @@ -393,9 +418,9 @@ class Client : public Dispatcher { int mknod(const char *path, mode_t mode); int open(const char *path, int mode); int close(fh_t fh); - int read(fh_t fh, char *buf, unsigned long long size, long long offset); - int write(fh_t fh, const char *buf, unsigned long long size, long long offset); - int truncate(const char *file, unsigned long long size); + int read(fh_t fh, char *buf, off_t size, off_t offset); + int write(fh_t fh, const char *buf, off_t size, off_t offset); + int truncate(const char *file, off_t size); //int truncate(fh_t fh, long long size); int fsync(fh_t fh, bool syncdataonly); diff --git a/ceph/config.h b/ceph/config.h index acc17857b18b0..66e8a5cd8d48b 100644 --- a/ceph/config.h +++ b/ceph/config.h @@ -47,7 +47,7 @@ struct md_config_t { int client_bcache_alloc_minsize; int client_bcache_alloc_maxsize; int client_bcache_ttl; - unsigned long long client_bcache_size; + off_t client_bcache_size; int client_bcache_lowater; int client_bcache_hiwater; size_t client_bcache_align; diff --git a/ceph/include/types.h b/ceph/include/types.h index d0f9da762251c..e5fb55ceef592 100644 --- a/ceph/include/types.h +++ b/ceph/include/types.h @@ -156,7 +156,7 @@ struct inode_t { FileLayout layout; // soft - __uint64_t size; + off_t size; time_t atime, mtime; // maybe atime different? "lazy"? int nlink; diff --git a/ceph/mds/CInode.cc b/ceph/mds/CInode.cc index 728c748e20a52..787130b09a001 100644 --- a/ceph/mds/CInode.cc +++ b/ceph/mds/CInode.cc @@ -72,9 +72,7 @@ ostream& operator<<(ostream& out, CInode& in) // ====== CInode ======= -CInode::CInode(bool auth) : LRUObject(), - hardlock(LOCK_TYPE_BASIC), - filelock(LOCK_TYPE_FILE) { +CInode::CInode(bool auth) : LRUObject() { ref = 0; parent = NULL; diff --git a/ceph/mds/CInode.h b/ceph/mds/CInode.h index 695367a94bcff..a714982d78202 100644 --- a/ceph/mds/CInode.h +++ b/ceph/mds/CInode.h @@ -385,18 +385,34 @@ class CInode : LRUObject { return &client_caps[client]; return 0; } + /* void set_client_caps(map& cl) { if (client_caps.empty() && !cl.empty()) get(CINODE_PIN_CAPS); client_caps.clear(); client_caps = cl; } + */ void take_client_caps(map& cl) { if (!client_caps.empty()) put(CINODE_PIN_CAPS); cl = client_caps; client_caps.clear(); } + void merge_client_caps(map& cl, set& new_client_caps) { + for (map::iterator it = cl.begin(); + it != cl.end(); + it++) { + new_client_caps.insert(it->first); + if (client_caps.count(it->first)) { + // merge + client_caps[it->first].merge(it->second); + } else { + // new + client_caps[it->first] = it->second; + } + } + } // caps issued, wanted int get_caps_issued() { @@ -650,7 +666,7 @@ public: inodeno_t get_ino() { return st.inode.ino; } - void update_inode(CInode *in) { + void update_inode(CInode *in, set& new_client_caps) { in->inode = st.inode; in->version = st.version; @@ -674,7 +690,7 @@ public: in->filelock = filelock; // caps - in->set_client_caps(cap_map); + in->merge_client_caps(cap_map, new_client_caps); } void _encode(bufferlist& bl) { diff --git a/ceph/mds/Capability.h b/ceph/mds/Capability.h index 9646a722b7f8f..95fba3ee5384b 100644 --- a/ceph/mds/Capability.h +++ b/ceph/mds/Capability.h @@ -126,6 +126,17 @@ public: } long get_last_seq() { return last_sent; } + void merge(Capability& other) { + // issued + pending + int newpending = other.pending() | pending(); + if (other.issued() & ~newpending) + issue(other.issued() | newpending); + issue(newpending); + + // wanted + wanted_caps = wanted_caps | other.wanted(); + } + // confirm receipt of a previous sent/issued seq. int confirm_receipt(long seq, int caps) { int r = 0; diff --git a/ceph/mds/Lock.h b/ceph/mds/Lock.h index 3fa57416905a3..62954022c7bc8 100644 --- a/ceph/mds/Lock.h +++ b/ceph/mds/Lock.h @@ -9,46 +9,45 @@ using namespace std; #include "Capability.h" -// STATES -// basic lock -#define LOCK_SYNC 0 // AR -#define LOCK_LOCK 1 // AR -#define LOCK_GLOCKR 2 // AR gather to lock from sync +// states and such. +// C = cache reads, R = read, W = write, B = buffer writes -// file lock states -#define LOCK_GLOCKW 3 // A gather to lock from wronly -#define LOCK_GLOCKM 4 // A gather to lock from mixed -#define LOCK_MIXED 5 // AR -#define LOCK_GMIXEDR 6 // AR gather to mixed from sync -#define LOCK_GMIXEDW 7 // A gather to mixed from wronly +// basic lock -----auth---- ---replica--- +#define LOCK_SYNC 0 // AR R . / C R . . R . / C R . . stat() +#define LOCK_LOCK 1 // AR R W / C . . . . . / C . . . truncate() +#define LOCK_GLOCKR 2 // AR R . / C . . . . . / C . . . -#define LOCK_WRONLY 8 // A -#define LOCK_GWRONLYR 9 // A gather to wronly from sync -#define LOCK_GWRONLYM 10 // A gather to wronly from mixed +// file lock states +#define LOCK_GLOCKW 3 // A . . / . . . . +#define LOCK_GLOCKM 4 // A . . / . . . . +#define LOCK_MIXED 5 // AR . . / . R W . . . / . R . . +#define LOCK_GMIXEDR 6 // AR R . / . R . . . . / . R . . +#define LOCK_GMIXEDW 7 // A . . / . . W . -#define LOCK_GSYNCW 11 // A gather (clients) to sync from wronly -#define LOCK_GSYNCM 12 // A gather (clients) to sync from mixed +#define LOCK_WRONLY 8 // A . . / . . W B (lock) +#define LOCK_GWRONLYR 9 // A . . / . . . . +#define LOCK_GWRONLYM 10 // A . . / . . W . +#define LOCK_GSYNCW 11 // A . . / . . . . +#define LOCK_GSYNCM 12 // A . . / . R . . -#define LOCK_TYPE_BASIC 0 -#define LOCK_TYPE_FILE 1 +// 4 stable +// +9 transition +// 13 total -// -- lock (basic or soft lock) +// -- lock... hard or file class CLock { protected: // lock state - char type; char state; set gather_set; // auth int nread, nwrite; public: - CLock() {} - CLock(char t) : - type(t), + CLock() : state(LOCK_LOCK), nread(0), nwrite(0) { @@ -56,7 +55,6 @@ class CLock { // encode/decode void encode_state(bufferlist& bl) { - bl.append((char*)&type, sizeof(char)); bl.append((char*)&state, sizeof(state)); bl.append((char*)&nread, sizeof(nread)); bl.append((char*)&nwrite, sizeof(nwrite)); @@ -64,8 +62,6 @@ class CLock { _encode(gather_set, bl); } void decode_state(bufferlist& bl, int& off) { - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); bl.copy(off, sizeof(state), (char*)&state); off += sizeof(state); bl.copy(off, sizeof(nread), (char*)&nread); @@ -141,15 +137,6 @@ class CLock { return (nwrite+nread)>0 ? true:false; } - /* - void twiddle_export() { // was auth, now replica - gather_set.clear(); - if (state == LOCK_GLOCK) state = LOCK_LOCK; - } - void twiddle_import() { // was replica, now auth - - } - */ // stable bool is_stable() { @@ -189,6 +176,12 @@ class CLock { } // client caps allowed + int caps_allowed_ever(bool auth) { + if (auth) + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WRBUFFER; + else + return CAP_FILE_RDCACHE | CAP_FILE_RD; + } int caps_allowed(bool auth) { if (auth) switch (state) { @@ -206,13 +199,12 @@ class CLock { return CAP_FILE_WR | CAP_FILE_WRBUFFER; case LOCK_LOCK: case LOCK_GLOCKR: + return CAP_FILE_RDCACHE; case LOCK_GLOCKW: case LOCK_GLOCKM: case LOCK_GWRONLYR: case LOCK_GSYNCW: return 0; - default: - assert(0); } else switch (state) { @@ -223,7 +215,7 @@ class CLock { return CAP_FILE_RD; case LOCK_LOCK: case LOCK_GLOCKR: - return 0; + return CAP_FILE_RDCACHE; } assert(0); return 0; @@ -248,8 +240,6 @@ class CLock { case LOCK_GLOCKW: case LOCK_GLOCKM: return 0; - default: - assert(0); } else switch (state) { @@ -261,8 +251,6 @@ class CLock { case LOCK_LOCK: case LOCK_GLOCKR: return 0; - default: - assert(0); } assert(0); return 0; diff --git a/ceph/mds/MDCache.cc b/ceph/mds/MDCache.cc index 415b38bfe2bc3..8c9282d30bca4 100644 --- a/ceph/mds/MDCache.cc +++ b/ceph/mds/MDCache.cc @@ -3524,8 +3524,7 @@ bool MDCache::issue_caps(CInode *in) mds->messenger->send_message(new MClientFileCaps(in->inode, it->second.get_last_seq(), it->second.pending(), - it->second.wanted(), - it->first), + it->second.wanted()), MSG_ADDR_CLIENT(it->first), 0, MDS_PORT_CACHE); } } @@ -3584,48 +3583,52 @@ void MDCache::handle_inode_file_caps(MInodeFileCaps *m) */ void MDCache::handle_client_file_caps(MClientFileCaps *m) { + int client = MSG_ADDR_NUM(m->get_source()); CInode *in = get_inode(m->get_ino()); Capability *cap = 0; if (in) - cap = in->get_client_cap(m->get_client()); + cap = in->get_client_cap(client); if (!in || !cap) { - int next; if (!in) { - dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << " passing buck" << endl; + dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << endl; } else { - dout(7) << "handle_client_file_caps no cap for client" << m->get_client() << " on " << *in << endl; - //next = in->authority(); + dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << endl; } - next = mds->get_nodeid() + 1; - if (next >= mds->get_cluster()->get_num_mds()) next = 0; - - mds->messenger->send_message(m, - MSG_ADDR_MDS(next), m->get_dest_port()); + delete m; return; } assert(cap); + // filter wanted based on what we could ever give out (given auth/replica status) + int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(in->is_auth()); + dout(7) << "handle_client_file_caps seq " << m->get_seq() << " confirms caps " << cap_string(m->get_caps()) - << " wants " << cap_string(m->get_wanted()) - << " from client" << m->get_client() + << " wants " << cap_string(wanted) + << " from client" << client << " on " << *in << endl; // update wanted - if (cap->wanted() != m->get_wanted()) - cap->set_wanted(m->get_wanted()); + if (cap->wanted() != wanted) + cap->set_wanted(wanted); // confirm caps int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); int has = cap->confirmed(); if (cap->is_null()) { - dout(7) << " cap for client" << m->get_client() << " is now null, removing from " << *in << endl; - in->remove_client_cap(m->get_client()); + dout(7) << " cap for client" << client << " is now null, removing from " << *in << endl; + in->remove_client_cap(client); if (!in->is_auth()) request_inode_file_caps(in); + + // tell client. + MClientFileCaps *r = new MClientFileCaps(in->inode, + 0, 0, 0, + MClientFileCaps::FILECAP_RELEASE); + mds->messenger->send_message(r, m->get_source()); } // merge in atime? @@ -5948,17 +5951,17 @@ void MDCache::encode_export_inode(CInode *in, bufferlist& enc_state, int new_aut { in->version++; // so local log entries are ignored, etc. (FIXME ??) - // tell (all) clients about new inode auth + // tell (all) clients about migrating caps.. mark STALE for (map::iterator it = in->client_caps.begin(); it != in->client_caps.end(); it++) { - dout(7) << "encode_export_inode " << *in << " telling client " << it->first << " new auth " << new_auth << endl; - mds->messenger->send_message(new MClientFileCaps(in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted(), - it->first, new_auth), - MSG_ADDR_CLIENT(it->first)); + dout(7) << "encode_export_inode " << *in << " telling client" << it->first << " stale caps" << endl; + MClientFileCaps *m = new MClientFileCaps(in->inode, + it->second.get_last_seq(), + it->second.pending(), + it->second.wanted(), + MClientFileCaps::FILECAP_STALE); + mds->messenger->send_message(m, MSG_ADDR_CLIENT(it->first)); } // relax locks? @@ -6735,7 +6738,8 @@ void MDCache::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int old } // state after link - istate.update_inode(in); + set merged_client_caps; + istate.update_inode(in, merged_client_caps); // add inode? @@ -6762,7 +6766,19 @@ void MDCache::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int old inode_hard_eval(in); } - // file + // caps + for (set::iterator it = merged_client_caps.begin(); + it != merged_client_caps.end(); + it++) { + mds->messenger->send_message(new MClientFileCaps(in->inode, + in->client_caps[*it].get_last_seq(), + in->client_caps[*it].pending(), + in->client_caps[*it].wanted(), + MClientFileCaps::FILECAP_REAP), + MSG_ADDR_CLIENT(*it)); + } + + // filelock if (!in->filelock.is_stable()) { // take me and old auth out of gather set in->filelock.gather_set.erase(mds->get_nodeid()); diff --git a/ceph/mds/MDS.cc b/ceph/mds/MDS.cc index 8011a93666685..c04ded680ca9b 100644 --- a/ceph/mds/MDS.cc +++ b/ceph/mds/MDS.cc @@ -252,7 +252,14 @@ void MDS::handle_shutdown_finish(Message *m) dout(1) << " shut down so far: " << did_shut_down << endl; if (did_shut_down.size() == (unsigned)mdcluster->get_num_mds()) { - // MDS's all shut down! + // MDS's all ready to shut down! + + /* + for (int i=1; isend_message(new MGenericMessage(MSG_SHUTDOWN), + MSG_ADDR_MDS(i), 0, 0); + }*/ // shut down osd's for (int i=0; iinode = inode; this->seq = seq; this->caps = caps; this->wanted = wanted; - - this->client = client; - - this->inode = inode; - this->mds = new_mds; + this->special = special; + this->mds = mds; } virtual char *get_type_name() { return "Cfcap";} @@ -51,18 +67,21 @@ class MClientFileCaps : public Message { off += sizeof(caps); s.copy(off, sizeof(wanted), (char*)&wanted); off += sizeof(wanted); + //s.copy(off, sizeof(client), (char*)&client); + //off += sizeof(client); s.copy(off, sizeof(mds), (char*)&mds); off += sizeof(mds); - s.copy(off, sizeof(client), (char*)&client); - off += sizeof(client); + s.copy(off, sizeof(special), (char*)&special); + off += sizeof(special); } virtual void encode_payload(crope& s) { s.append((char*)&seq, sizeof(seq)); s.append((char*)&inode, sizeof(inode)); s.append((char*)&caps, sizeof(caps)); s.append((char*)&wanted, sizeof(wanted)); + //s.append((char*)&client, sizeof(client)); s.append((char*)&mds,sizeof(mds)); - s.append((char*)&client, sizeof(client)); + s.append((char*)&special,sizeof(special)); } }; -- 2.39.5