From: Sage Weil Date: Wed, 11 Jun 2008 23:40:23 +0000 (-0700) Subject: mds: frag fixups. but there is a lost auth_pin somewhere... X-Git-Tag: v0.3~133 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=86c96671decccf168bc33f1ffc032530e7d741b0;p=ceph.git mds: frag fixups. but there is a lost auth_pin somewhere... --- diff --git a/src/config.cc b/src/config.cc index 8a560fc6f64..1f6f7103405 100644 --- a/src/config.cc +++ b/src/config.cc @@ -338,7 +338,7 @@ md_config_t g_conf = { mds_bal_sample_interval: 3.0, // every 5 seconds mds_bal_replicate_threshold: 8000, mds_bal_unreplicate_threshold: 0,//500, - mds_bal_frag: false, + mds_bal_frag: true, mds_bal_split_size: 10000, mds_bal_split_rd: 25000, mds_bal_split_wr: 10000, diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc index 95f4c7b61d1..8d7636c9f89 100644 --- a/src/mds/CDentry.cc +++ b/src/mds/CDentry.cc @@ -291,7 +291,7 @@ void CDentry::auth_pin() << " now " << auth_pins << "+" << nested_auth_pins << dendl; - dir->adjust_nested_auth_pins(1); + dir->adjust_nested_auth_pins(1, 1); } void CDentry::auth_unpin() @@ -305,19 +305,19 @@ void CDentry::auth_unpin() << dendl; assert(auth_pins >= 0); - dir->adjust_nested_auth_pins(-1); + dir->adjust_nested_auth_pins(-1, -1); } -void CDentry::adjust_nested_auth_pins(int by) +void CDentry::adjust_nested_auth_pins(int by, int dirby) { nested_auth_pins += by; - dout(15) << "adjust_nested_auth_pins by " << by + dout(35) << "adjust_nested_auth_pins by " << by << " now " << auth_pins << "+" << nested_auth_pins << dendl; assert(nested_auth_pins >= 0); - dir->adjust_nested_auth_pins(by); + dir->adjust_nested_auth_pins(by, dirby); } bool CDentry::is_frozen() diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index 55f2a49c117..d5b4c5a0f29 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -167,7 +167,7 @@ public: bool can_auth_pin(); void auth_pin(); void auth_unpin(); - void adjust_nested_auth_pins(int by); + void adjust_nested_auth_pins(int by, 
int dirby); bool is_frozen(); void adjust_nested_anchors(int by); diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 5347bfcbe22..2c80babcf9b 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -78,7 +78,9 @@ ostream& operator<<(ostream& out, CDir& dir) } if (dir.get_cum_auth_pins()) - out << " ap=" << dir.get_auth_pins() << "+" << dir.get_nested_auth_pins(); + out << " ap=" << dir.get_auth_pins() + << "+" << dir.get_dir_auth_pins() + << "+" << dir.get_nested_auth_pins(); if (dir.get_nested_anchors()) out << " na=" << dir.get_nested_anchors(); @@ -163,6 +165,7 @@ CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) : auth_pins = 0; nested_auth_pins = 0; + dir_auth_pins = 0; request_pins = 0; nested_anchors = 0; @@ -368,7 +371,7 @@ void CDir::link_inode_work( CDentry *dn, CInode *in) // adjust auth pin count if (in->auth_pins + in->nested_auth_pins) - dn->adjust_nested_auth_pins(in->auth_pins + in->nested_auth_pins); + dn->adjust_nested_auth_pins(in->auth_pins + in->nested_auth_pins, in->auth_pins); if (in->inode.anchored + in->nested_anchors) dn->adjust_nested_anchors(in->nested_anchors + in->inode.anchored); @@ -453,7 +456,7 @@ void CDir::unlink_inode_work( CDentry *dn ) // unlink auth_pin count if (in->auth_pins + in->nested_auth_pins) - dn->adjust_nested_auth_pins(0 - (in->auth_pins + in->nested_auth_pins)); + dn->adjust_nested_auth_pins(0 - (in->auth_pins + in->nested_auth_pins), 0 - in->auth_pins); if (in->inode.anchored + in->nested_anchors) dn->adjust_nested_anchors(0 - (in->nested_anchors + in->inode.anchored)); @@ -1563,13 +1566,15 @@ void CDir::auth_unpin() inode->adjust_nested_auth_pins(-1); } -void CDir::adjust_nested_auth_pins(int inc) +void CDir::adjust_nested_auth_pins(int inc, int dirinc) { nested_auth_pins += inc; + dir_auth_pins += dirinc; - dout(15) << "adjust_nested_auth_pins " << inc << " on " << *this + dout(15) << "adjust_nested_auth_pins " << inc << "/" << dirinc << " on " << *this << " count now " << auth_pins << " + " << 
nested_auth_pins << dendl; assert(nested_auth_pins >= 0); + assert(dir_auth_pins >= 0); maybe_finish_freeze(); // pending freeze? diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 9835409834c..5c672b39c31 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -211,7 +211,7 @@ protected: // lock nesting, freeze int auth_pins; - int nested_auth_pins; + int nested_auth_pins, dir_auth_pins; int request_pins; int nested_anchors; @@ -444,10 +444,11 @@ public: int get_cum_auth_pins() { return auth_pins + nested_auth_pins; } int get_auth_pins() { return auth_pins; } int get_nested_auth_pins() { return nested_auth_pins; } + int get_dir_auth_pins() { return dir_auth_pins; } void auth_pin(); void auth_unpin(); - void adjust_nested_auth_pins(int inc); + void adjust_nested_auth_pins(int inc, int dirinc); void verify_fragstat(); int get_nested_anchors() { return nested_anchors; } @@ -463,15 +464,19 @@ public: void unfreeze_dir(); void maybe_finish_freeze() { - if (auth_pins != 1 || nested_auth_pins != 0) + if (auth_pins != 1 || + dir_auth_pins != 0) return; - if (state_test(STATE_FREEZINGTREE)) { - _freeze_tree(); + // we can freeze the _dir_ even with nested pins... + if (state_test(STATE_FREEZINGDIR)) { + _freeze_dir(); auth_unpin(); finish_waiting(WAIT_FROZEN); } - if (state_test(STATE_FREEZINGDIR)) { - _freeze_dir(); + if (nested_auth_pins != 0) + return; + if (state_test(STATE_FREEZINGTREE)) { + _freeze_tree(); auth_unpin(); finish_waiting(WAIT_FROZEN); } @@ -499,7 +504,7 @@ public: return true; } bool is_freezeable_dir(bool freezing=false) { - if ((auth_pins-freezing) > 0) + if ((auth_pins-freezing) > 0 || dir_auth_pins > 0) return false; // if not subtree root, inode must not be frozen (tree--frozen_dir is okay). 
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index b23fbdc8a7e..f8bea2b5b47 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -67,6 +67,9 @@ ostream& operator<<(ostream& out, CInode& in) out << " v" << in.get_version(); + if (in.is_auth_pinned()) + out << " ap=" << in.get_num_auth_pins(); + if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover"; if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering"; @@ -612,11 +615,25 @@ void CInode::decode_lock_state(int type, bufferlist& bl) ::decode(authfrags, p); if (is_auth()) { // auth. believe replica's auth frags only. - for (set::iterator p = authfrags.begin(); p != authfrags.end(); ++p) - dirfragtree.force_to_leaf(*p); + for (set::iterator p = authfrags.begin(); p != authfrags.end(); ++p) + if (!dirfragtree.is_leaf(*p)) { + dout(10) << " forcing frag " << *p << " to leaf (split|merge)" << dendl; + dirfragtree.force_to_leaf(*p); + dirfragtreelock.set_updated(); + } } else { - // replica. just take the tree. + // replica. 
take the new tree, BUT make sure any open + // dirfrags remain leaves (they may have split _after_ this + // dft was scattered, or we may still be waiting on the + // notify from the auth) dirfragtree.swap(temp); + for (map::iterator p = dirfrags.begin(); + p != dirfrags.end(); + p++) + if (!dirfragtree.is_leaf(p->first)) { + dout(10) << " forcing open dirfrag " << p->first << " to leaf (racing with split|merge)" << dendl; + dirfragtree.force_to_leaf(p->first); + } } } break; @@ -732,6 +749,12 @@ void CInode::finish_scatter_gather_update(int type) } break; + case CEPH_LOCK_IDFT: + { + assert(is_auth()); + } + break; + default: assert(0); } @@ -830,7 +853,7 @@ void CInode::auth_pin() << dendl; if (parent) - parent->adjust_nested_auth_pins( 1 ); + parent->adjust_nested_auth_pins(1, 1); } void CInode::auth_unpin() @@ -846,7 +869,7 @@ void CInode::auth_unpin() assert(auth_pins >= 0); if (parent) - parent->adjust_nested_auth_pins( -1 ); + parent->adjust_nested_auth_pins(-1, -1); if (is_freezing_inode() && auth_pins == auth_pin_freeze_allowance) { @@ -864,12 +887,12 @@ void CInode::adjust_nested_auth_pins(int a) if (!parent) return; nested_auth_pins += a; - dout(15) << "adjust_nested_auth_pins by " << a + dout(35) << "adjust_nested_auth_pins by " << a << " now " << auth_pins << "+" << nested_auth_pins << dendl; assert(nested_auth_pins >= 0); - parent->adjust_nested_auth_pins(a); + parent->adjust_nested_auth_pins(a, 0); } void CInode::adjust_nested_anchors(int by) diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 7a15fb27b74..19ffd2541a9 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -501,9 +501,8 @@ public: // -- auth pins -- - int is_auth_pinned() { - return auth_pins; - } + int is_auth_pinned() { return auth_pins; } + int get_num_auth_pins() { return auth_pins; } void adjust_nested_auth_pins(int a); bool can_auth_pin(); void auth_pin(); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index fd9d4982192..a9d10448b87 100644 --- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc @@ -4735,6 +4735,19 @@ MDRequest *MDCache::request_start_slave(metareqid_t ri, int by) return mdr; } +MDRequest *MDCache::request_start_internal(int op) +{ + MDRequest *mdr = new MDRequest; + mdr->reqid.name = entity_name_t::MDS(mds->get_nodeid()); + mdr->reqid.tid = mds->issue_tid(); + mdr->internal_op = op; + + assert(active_requests.count(mdr->reqid) == 0); + active_requests[mdr->reqid] = mdr; + dout(7) << "request_start_internal " << *mdr << " op " << op << dendl; + return mdr; +} + MDRequest *MDCache::request_get(metareqid_t rid) { @@ -4784,12 +4797,19 @@ void MDCache::dispatch_request(MDRequest *mdr) mds->server->dispatch_client_request(mdr); } else if (mdr->slave_request) { mds->server->dispatch_slave_request(mdr); - } else - assert(0); + } else { + switch (mdr->internal_op) { + case MDS_INTERNAL_OP_FRAGMENT: + dispatch_fragment(mdr); + break; + + default: + assert(0); + } + } } - void MDCache::request_forget_foreign_locks(MDRequest *mdr) { // xlocks @@ -6185,6 +6205,10 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m) +// =================================================================== + + + // =================================================================== // FRAGMENT @@ -6207,11 +6231,12 @@ void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, // yuck. we may have discovered the inode while it was being fragmented. 
if (!diri->dirfragtree.is_leaf(basefrag)) diri->dirfragtree.force_to_leaf(basefrag); - diri->dirfragtree.split(basefrag, bits); - dout(10) << " new fragtree is " << diri->dirfragtree << dendl; CDir *base = diri->get_or_open_dirfrag(this, basefrag); + diri->dirfragtree.split(basefrag, bits); + dout(10) << " new fragtree is " << diri->dirfragtree << dendl; + if (bits > 0) { if (base) { CDir *baseparent = base->get_parent_dir(); @@ -6259,20 +6284,6 @@ void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, } } -class C_MDC_FragmentGo : public Context { - MDCache *mdcache; - CInode *diri; - list dirs; - frag_t basefrag; - int bits; -public: - C_MDC_FragmentGo(MDCache *m, CInode *di, list& dls, frag_t bf, int b) : - mdcache(m), diri(di), dirs(dls), basefrag(bf), bits(b) { } - virtual void finish(int r) { - mdcache->fragment_go(diri, dirs, basefrag, bits); - } -}; - void MDCache::split_dir(CDir *dir, int bits) { dout(7) << "split_dir " << *dir << " bits " << bits << dendl; @@ -6301,64 +6312,107 @@ void MDCache::split_dir(CDir *dir, int bits) return; } - // wrlock - if (!mds->locker->scatter_wrlock_try(&dir->inode->dirfragtreelock, 0, false)) { - dout(7) << "can't wrlock dirfragtree on " << *dir->inode << dendl; - return; - } + // register request + // this is primary so we can hold multiple locks, remote auth_pins, and all that + MDRequest *mdr = request_start_internal(MDS_INTERNAL_OP_FRAGMENT); - list startfrags; - startfrags.push_back(dir); - + // describe the fragment mutation + mdr->more()->fragment_in = dir->inode; + mdr->more()->fragment_base = dir->dirfrag().frag; + mdr->more()->fragment_start.push_back(dir); + mdr->more()->fragment_bits = bits; + + // mark start frag + //mdr->auth_pin(dir); // this will block the freeze, until mark_complete completes + //dir->auth_pin(); dir->state_set(CDir::STATE_FRAGMENTING); - - fragment_freeze(dir->get_inode(), startfrags, dir->get_frag(), bits); - fragment_mark_and_complete(dir->get_inode(), startfrags, 
dir->get_frag(), bits); + + dispatch_request(mdr); } -/* - * initial the freeze, blocking with an auth_pin. - * - * some reason(s) we have to freeze: - * - on merge, version/projected version are unified from all fragments; - * concurrent pipelined updates in the directory will have divergent - * versioning... and that's no good. - */ -void MDCache::fragment_freeze(CInode *diri, list& frags, frag_t basefrag, int bits) +class C_MDC_FragmentGo : public Context { + MDCache *mdcache; + MDRequest *mdr; +public: + C_MDC_FragmentGo(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {} + virtual void finish(int r) { + mdcache->fragment_go(mdr); + } +}; + +void MDCache::dispatch_fragment(MDRequest *mdr) { - C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, diri, frags, basefrag, bits)); - - // freeze the dirs - for (list::iterator p = frags.begin(); - p != frags.end(); + CInode *diri = mdr->more()->fragment_in; + int bits = mdr->more()->fragment_bits; + dout(10) << "dispatch_fragment " << *mdr << " bits " << bits << " " << *diri << dendl; + + // (try to re-) auth_pin start fragments (acquire_locks may have to drop auth_pins to avoid deadlock) + for (list::iterator p = mdr->more()->fragment_start.begin(); + p != mdr->more()->fragment_start.end(); + ++p) { + CDir *dir = *p; + if (!mdr->is_auth_pinned(dir) && + (!dir->is_auth() || !dir->can_auth_pin())) { + dout(10) << " giving up, no longer auth+authpinnable on " << *dir << dendl; + for (list::iterator p = mdr->more()->fragment_start.begin(); + p != mdr->more()->fragment_start.end(); + ++p) + (*p)->state_clear(CDir::STATE_FRAGMENTING); + request_finish(mdr); + return; + } + mdr->auth_pin(dir); + } + + // wrlock dirlock, dftlock + set rdlocks, wrlocks, xlocks; + wrlocks.insert(&diri->dirfragtreelock); + wrlocks.insert(&diri->dirlock); + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) + return; + + /* + * initiate the freeze, blocking with an auth_pin. 
+ * + * some reason(s) we have to freeze: + * - on merge, version/projected version are unified from all fragments; + * concurrent pipelined updates in the directory will have divergent + * versioning... and that's no good. + */ + dout(10) << "dispatch_fragment freezing start frags" << dendl; + C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, mdr)); + + for (list::iterator p = mdr->more()->fragment_start.begin(); + p != mdr->more()->fragment_start.end(); ++p) { CDir *dir = *p; - dir->auth_pin(); // this will block the freeze, until mark_and_complete dir->freeze_dir(); assert(dir->is_freezing_dir()); dir->add_waiter(CDir::WAIT_FROZEN, gather->new_sub()); } + + // initial mark+complete pass + fragment_mark_and_complete(mdr); } class C_MDC_FragmentMarking : public Context { MDCache *mdcache; - CInode *diri; - list dirs; - frag_t basefrag; - int bits; + MDRequest *mdr; public: - C_MDC_FragmentMarking(MDCache *m, CInode *di, list& dls, frag_t bf, int b) : - mdcache(m), diri(di), dirs(dls), basefrag(bf), bits(b) { } + C_MDC_FragmentMarking(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {} virtual void finish(int r) { - mdcache->fragment_mark_and_complete(diri, dirs, basefrag, bits); + mdcache->fragment_mark_and_complete(mdr); } }; -void MDCache::fragment_mark_and_complete(CInode *diri, - list& startfrags, - frag_t basefrag, int bits) +void MDCache::fragment_mark_and_complete(MDRequest *mdr) { - dout(10) << "fragment_mark_and_complete " << basefrag << " by " << bits + CInode *diri = mdr->more()->fragment_in; + list& startfrags = mdr->more()->fragment_start; + frag_t basefrag = mdr->more()->fragment_base; + int bits = mdr->more()->fragment_bits; + + dout(10) << "fragment_mark_and_complete " << *mdr << " " << basefrag << " by " << bits << " on " << *diri << dendl; C_Gather *gather = 0; @@ -6370,7 +6424,7 @@ void MDCache::fragment_mark_and_complete(CInode *diri, if (!dir->is_complete()) { dout(15) << " fetching incomplete " << *dir << dendl; - if (!gather) gather = 
new C_Gather(new C_MDC_FragmentMarking(this, diri, startfrags, basefrag, bits)); + if (!gather) gather = new C_Gather(new C_MDC_FragmentMarking(this, mdr)); dir->fetch(gather->new_sub(), true); // ignore authpinnability } @@ -6383,7 +6437,8 @@ void MDCache::fragment_mark_and_complete(CInode *diri, p->second->state_set(CDentry::STATE_FRAGMENTING); } dir->state_set(CDir::STATE_DNPINNEDFRAG); - dir->auth_unpin(); // allow our freeze to complete + mdr->auth_unpin(dir); // allow our freeze to complete + //dir->auth_unpin(); } else { dout(15) << " marked " << *dir << dendl; @@ -6391,34 +6446,32 @@ void MDCache::fragment_mark_and_complete(CInode *diri, } } - class C_MDC_FragmentStored : public Context { MDCache *mdcache; - CInode *diri; - frag_t basefrag; - int bits; - list resultfrags; + MDRequest *mdr; public: - C_MDC_FragmentStored(MDCache *m, CInode *di, frag_t bf, int b, - list& rf) : - mdcache(m), diri(di), basefrag(bf), bits(b), resultfrags(rf) { } + C_MDC_FragmentStored(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {} virtual void finish(int r) { - mdcache->fragment_stored(diri, basefrag, bits, resultfrags); + mdcache->fragment_stored(mdr); } }; -void MDCache::fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits) +void MDCache::fragment_go(MDRequest *mdr) { - dout(10) << "fragment_go " << basefrag << " by " << bits + CInode *diri = mdr->more()->fragment_in; + frag_t basefrag = mdr->more()->fragment_base; + int bits = mdr->more()->fragment_bits; + + dout(10) << "fragment_go " << *mdr << " " << basefrag << " by " << bits << " on " << *diri << dendl; // refragment - list resultfrags; + list &resultfrags = mdr->more()->fragment_result; list waiters; adjust_dir_fragments(diri, basefrag, bits, resultfrags, waiters); mds->queue_waiters(waiters); - C_Gather *gather = new C_Gather(new C_MDC_FragmentStored(this, diri, basefrag, bits, resultfrags)); + C_Gather *gather = new C_Gather(new C_MDC_FragmentStored(this, mdr)); // freeze, store resulting frags for 
(list::iterator p = resultfrags.begin(); @@ -6434,58 +6487,59 @@ void MDCache::fragment_go(CInode *diri, list& startfrags, frag_t basefrag class C_MDC_FragmentLogged : public Context { MDCache *mdcache; - CInode *diri; - frag_t basefrag; - int bits; - list resultfrags; - Mutation *mut; + MDRequest *mdr; public: - C_MDC_FragmentLogged(MDCache *m, CInode *di, frag_t bf, int b, - list& rf, Mutation *mu) : - mdcache(m), diri(di), basefrag(bf), bits(b), mut(mu) { - resultfrags.swap(rf); - } + C_MDC_FragmentLogged(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {} virtual void finish(int r) { - mdcache->fragment_logged(diri, basefrag, bits, - resultfrags, mut); + mdcache->fragment_logged(mdr); } }; -void MDCache::fragment_stored(CInode *diri, frag_t basefrag, int bits, - list& resultfrags) +void MDCache::fragment_stored(MDRequest *mdr) { - dout(10) << "fragment_stored " << basefrag << " by " << bits + CInode *diri = mdr->more()->fragment_in; + list &resultfrags = mdr->more()->fragment_result; + frag_t basefrag = mdr->more()->fragment_base; + int bits = mdr->more()->fragment_bits; + + dout(10) << "fragment_stored " << *mdr << " " << basefrag << " by " << bits << " on " << *diri << dendl; - Mutation *mut = new Mutation; - mut->ls = mds->mdlog->get_current_segment(); + mdr->ls = mds->mdlog->get_current_segment(); EFragment *le = new EFragment(mds->mdlog, diri->ino(), basefrag, bits); le->metablob.add_dir_context(*resultfrags.begin()); + // dft lock mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock); - mut->ls->dirty_dirfrag_dirfragtree.push_back(&diri->xlist_dirty_dirfrag_dirfragtree); - mut->add_updated_scatterlock(&diri->dirfragtreelock); + mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->xlist_dirty_dirfrag_dirfragtree); + mdr->add_updated_scatterlock(&diri->dirfragtreelock); + + // dirlock + mds->locker->mark_updated_scatterlock(&diri->dirlock); + mdr->ls->dirty_dirfrag_dir.push_back(&diri->xlist_dirty_dirfrag_dir); + 
mdr->add_updated_scatterlock(&diri->dirlock); // journal new dirfrag fragstats for each new fragment. - // mark complete. but not dirty. for (list::iterator p = resultfrags.begin(); p != resultfrags.end(); p++) { CDir *dir = *p; dout(10) << " result frag " << *dir << dendl; - le->metablob.add_dir(dir, true).mark_complete(); // mark complete + le->metablob.add_dir(dir, false); } mds->mdlog->submit_entry(le, - new C_MDC_FragmentLogged(this, diri, basefrag, bits, - resultfrags, mut)); + new C_MDC_FragmentLogged(this, mdr)); } -void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, - list& resultfrags, - Mutation *mut) +void MDCache::fragment_logged(MDRequest *mdr) { - dout(10) << "fragment_logged " << basefrag << " bits " << bits + CInode *diri = mdr->more()->fragment_in; + list &resultfrags = mdr->more()->fragment_result; + frag_t basefrag = mdr->more()->fragment_base; + int bits = mdr->more()->fragment_bits; + + dout(10) << "fragment_logged " << *mdr << " " << basefrag << " bits " << bits << " on " << *diri << dendl; // tell peers @@ -6504,7 +6558,7 @@ void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, mds->send_message_mds(notify, p->first); } - mut->apply(); // mark scatterlocks, mainly. + mdr->apply(); // mark scatterlock // unfreeze resulting frags set peers; @@ -6514,7 +6568,7 @@ void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, CDir *dir = *p; dout(10) << " result frag " << *dir << dendl; - // dirty, unpin, unfreeze + // unmark, unfreeze dir->state_clear(CDir::STATE_FRAGMENTING); for (CDir::map_t::iterator p = dir->items.begin(); @@ -6528,7 +6582,8 @@ void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, dir->unfreeze_dir(); } - mds->locker->scatter_wrlock_finish(&diri->dirfragtreelock, 0); + // done! clean up, drop locks, etc. 
+ request_finish(mdr); } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 51e993b19f6..5f584295bb8 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -108,8 +108,10 @@ struct Mutation { list projected_fnodes; list updated_scatterlocks; - Mutation() : ls(0), - done_locking(false), committing(false), aborted(false) {} + Mutation() : + ls(0), + slave_to_mds(-1), + done_locking(false), committing(false), aborted(false) {} Mutation(metareqid_t ri, int slave_to=-1) : reqid(ri), ls(0), @@ -145,7 +147,7 @@ struct Mutation { } } void auth_unpin(MDSCacheObject *object) { - assert(is_auth_pinned(object)); + assert(auth_pins.count(object)); object->auth_unpin(); auth_pins.erase(object); } @@ -206,6 +208,10 @@ inline ostream& operator<<(ostream& out, Mutation &mut) } +enum { + MDS_INTERNAL_OP_FRAGMENT, +}; + /** active_request_t * state we track for requests we are currently processing. * mostly information about locks held, so that we can drop them all @@ -223,6 +229,9 @@ struct MDRequest : public Mutation { // -- i am a slave request MMDSSlaveRequest *slave_request; // slave request (if one is pending; implies slave == true) + // -- i am an internal op + int internal_op; + // break rarely-used fields into a separately allocated structure // to save memory for most ops @@ -250,10 +259,18 @@ struct MDRequest : public Mutation { Context *slave_commit; bufferlist rollback_bl; + // internal ops + CInode *fragment_in; + frag_t fragment_base; + list fragment_start; + list fragment_result; + int fragment_bits; + More() : src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), destdn_was_remote_inode(0), was_link_merge(false), - slave_commit(0) { } + slave_commit(0), + fragment_in(0), fragment_bits(0) { } } *_more; @@ -261,16 +278,19 @@ struct MDRequest : public Mutation { MDRequest() : session(0), client_request(0), ref(0), slave_request(0), + internal_op(-1), _more(0) {} MDRequest(metareqid_t ri, MClientRequest *req) : Mutation(ri), session(0), 
client_request(req), ref(0), slave_request(0), + internal_op(-1), _more(0) {} MDRequest(metareqid_t ri, int by) : Mutation(ri, by), session(0), client_request(0), ref(0), slave_request(0), + internal_op(-1), _more(0) {} ~MDRequest() { delete _more; @@ -290,6 +310,7 @@ struct MDRequest : public Mutation { if (is_slave()) out << " slave_to mds" << slave_to_mds; if (client_request) out << " cr=" << client_request; if (slave_request) out << " sr=" << slave_request; + if (internal_op == MDS_INTERNAL_OP_FRAGMENT) out << " fragment"; out << ")"; } }; @@ -430,6 +451,7 @@ protected: public: MDRequest* request_start(MClientRequest *req); MDRequest* request_start_slave(metareqid_t rid, int by); + MDRequest* request_start_internal(int op); bool have_request(metareqid_t rid) { return active_requests.count(rid); } @@ -796,11 +818,11 @@ public: void split_dir(CDir *dir, int byn); private: - void fragment_freeze(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_mark_and_complete(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_stored(CInode *diri, frag_t basefrag, int bits, list& resultfrags); - void fragment_logged(CInode *diri, frag_t basefrag, int bits, list& resultfrags, Mutation *mut); + void dispatch_fragment(MDRequest *mdr); + void fragment_mark_and_complete(MDRequest *mdr); + void fragment_go(MDRequest *mdr); + void fragment_stored(MDRequest *mdr); + void fragment_logged(MDRequest *mdr); friend class C_MDC_FragmentGo; friend class C_MDC_FragmentMarking; friend class C_MDC_FragmentStored; diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 1b8f5aeca0c..a38f9e636bf 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -667,7 +667,7 @@ class MDSCacheObject { const static int PIN_LOCK = -1002; const static int PIN_REQUEST = -1003; const static int PIN_WAITER = 1004; - const static int PIN_DIRTYSCATTERED = 1005; // make this neg if we 
start using multiple scatterlocks? + const static int PIN_DIRTYSCATTERED = -1005; static const int PIN_AUTHPIN = 1006; static const int PIN_PTRWAITER = -1007; const static int PIN_TEMPEXPORTING = 1008; // temp pin between encode_ and finish_export diff --git a/src/vstartnew.sh b/src/vstartnew.sh index 67a4e399f49..738be4a69e5 100755 --- a/src/vstartnew.sh +++ b/src/vstartnew.sh @@ -40,7 +40,7 @@ $CEPH_BIN/cmon -d mondata/mon0 --debug_mon 20 --debug_ms 1 $CEPH_BIN/osdmaptool --clobber --createsimple .ceph_monmap 4 --print .ceph_osdmap # --pgbits 2 $CEPH_BIN/cmonctl osd setmap -i .ceph_osdmap -for osd in 0 1 #2 3 4 5 6 7 8 9 10 11 12 13 14 15 +for osd in 0 #1 #2 3 4 5 6 7 8 9 10 11 12 13 14 15 do $CEPH_BIN/cosd --mkfs_for_osd $osd dev/osd$osd # initialize empty object store #valgrind --tool=massif $CEPH_BIN/cosd dev/osd$osd --debug_ms 1 --debug_osd 20 --debug_filestore 10 1>out/o$osd & #--debug_osd 40 @@ -48,7 +48,7 @@ do done # mds -$CEPH_BIN/cmds $ARGS --debug_ms 1 --debug_mds 20 --mds_thrash_fragments 0 --mds_thrash_exports 0 #--debug_ms 20 +$CEPH_BIN/cmds $ARGS --debug_ms 1 --debug_mds 20 --mds_thrash_fragments 1 --mds_thrash_exports 0 #--debug_ms 20 #$CEPH_BIN/cmds $ARGS --debug_ms 1 --debug_mds 20 --mds_thrash_fragments 0 --mds_thrash_exports 0 #--debug_ms 20 ./cmonctl mds set_max_mds 2