From: Yan, Zheng Date: Sun, 12 Jan 2014 09:31:57 +0000 (+0800) Subject: mds: acquire locks required by exporting dir X-Git-Tag: v0.78~165^2~29 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3154ee84fa6ff5d5fdfab29f1b076be01bccca32;p=ceph.git mds: acquire locks required by exporting dir Start an internal MDS request to acquire the locks required for exporting a dir. This is more reliable than using Locker::rdlock_take_set(), and it also allows acquiring locks other than rdlocks. Locker::acquire_locks() is only used to acquire locks in the first stage of exporting a dir (before freezing the subtree). After the subtree is frozen, 'try lock' is still used to re-acquire the locks, in order to minimize the time the tree stays frozen. Signed-off-by: Yan, Zheng --- diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index f075da4e8f0f..714b2875a236 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -338,6 +338,7 @@ enum { // internal op CEPH_MDS_OP_FRAGMENTDIR= 0x01500, + CEPH_MDS_OP_EXPORTDIR = 0x01501, }; extern const char *ceph_mds_op_name(int op); diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 341614b27574..827bb9c26f24 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -1262,25 +1262,16 @@ bool Locker::rdlock_try_set(set& locks) return true; } -void Locker::rdlock_take_set(set& locks) +void Locker::rdlock_take_set(set& locks, Mutation *mut) { dout(10) << "rdlock_take_set " << locks << dendl; - for (set::iterator p = locks.begin(); p != locks.end(); ++p) - (*p)->get_rdlock(); -} - -void Locker::rdlock_finish_set(set& locks) -{ - dout(10) << "rdlock_finish_set " << locks << dendl; for (set::iterator p = locks.begin(); p != locks.end(); ++p) { - bool need_issue = false; - rdlock_finish(*p, 0, &need_issue); - if (need_issue) - issue_caps((CInode*)(*p)->get_parent()); + (*p)->get_rdlock(); + mut->rdlocks.insert(*p); + mut->locks.insert(*p); } } - // ------------------ // wrlock diff --git a/src/mds/Locker.h b/src/mds/Locker.h index 
a0824537c0cf..605686270b7a 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -133,8 +133,7 @@ public: void rdlock_finish(SimpleLock *lock, Mutation *mut, bool *pneed_issue); bool can_rdlock_set(set& locks); bool rdlock_try_set(set& locks); - void rdlock_take_set(set& locks); - void rdlock_finish_set(set& locks); + void rdlock_take_set(set& locks, Mutation *mut); void wrlock_force(SimpleLock *lock, Mutation *mut); bool wrlock_start(SimpleLock *lock, MDRequest *mut, bool nowait=false); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index d1c4356cf62a..2e0f1c734238 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8759,6 +8759,9 @@ void MDCache::dispatch_request(MDRequest *mdr) case CEPH_MDS_OP_FRAGMENTDIR: dispatch_fragment_dir(mdr); break; + case CEPH_MDS_OP_EXPORTDIR: + migrator->dispatch_export_dir(mdr); + break; default: assert(0); } diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 117852c81a6e..3cec0b1abcda 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -24,6 +24,7 @@ #include "MDBalancer.h" #include "MDLog.h" #include "MDSMap.h" +#include "Mutation.h" #include "include/filepath.h" @@ -231,12 +232,18 @@ void Migrator::export_try_cancel(CDir *dir) int state = it->second.state; switch (state) { + case EXPORT_LOCKING: + dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl; + it->second.state = EXPORT_CANCELLED; + dir->auth_unpin(this); + dir->state_clear(CDir::STATE_EXPORTING); + break; + case EXPORT_DISCOVERING: dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl; it->second.state = EXPORT_CANCELLED; dir->unfreeze_tree(); // cancel the freeze dir->auth_unpin(this); - export_unlock(dir); export_freeze_finish(dir); dir->state_clear(CDir::STATE_EXPORTING); if (mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) // tell them. 
@@ -284,7 +291,6 @@ void Migrator::export_try_cancel(CDir *dir) dir->unfreeze_tree(); cache->adjust_subtree_auth(dir, mds->get_nodeid()); cache->try_subtree_merge(dir); // NOTE: this may journal subtree_map as side effect - export_unlock(dir); dir->state_clear(CDir::STATE_EXPORTING); if (mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) // tell them. mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer); @@ -311,9 +317,20 @@ void Migrator::export_try_cancel(CDir *dir) if (it->second.state == EXPORT_CANCELLED) { // wake up any waiters mds->queue_waiters(it->second.waiting_for_finish); + // drop locks + if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) { + MDRequest *mdr = dynamic_cast(it->second.mut); + assert(mdr); + if (mdr->more()->waiting_on_slave.empty()) + mds->mdcache->request_finish(mdr); + } else if (it->second.mut) { + Mutation *mut = it->second.mut; + mds->locker->drop_locks(mut); + mut->cleanup(); + delete mut; + } export_state.erase(it); - // send pending import_maps? (these need to go out when all exports have finished.) 
cache->maybe_send_pending_resolves(); @@ -360,6 +377,7 @@ void Migrator::handle_mds_failure_or_stop(int who) // - that aren't frozen yet (to avoid auth_pin deadlock) // - they havne't prepped yet (they may need to discover bounds to do that) if (p->second.peer == who || + p->second.state == EXPORT_LOCKING || p->second.state == EXPORT_DISCOVERING || p->second.state == EXPORT_FREEZING || p->second.state == EXPORT_PREPPING) { @@ -583,7 +601,8 @@ void Migrator::audit() p != export_state.end(); ++p) { CDir *dir = p->first; - if (p->second.state == EXPORT_DISCOVERING || + if (p->second.state == EXPORT_LOCKING || + p->second.state == EXPORT_DISCOVERING || p->second.state == EXPORT_FREEZING) continue; assert(dir->is_ambiguous_dir_auth()); assert(dir->authority().first == mds->get_nodeid() || @@ -613,6 +632,10 @@ void Migrator::export_dir_nicely(CDir *dir, int dest) void Migrator::maybe_do_queued_export() { + static bool running; + if (running) + return; + running = true; while (!export_queue.empty() && export_state.size() <= 4) { dirfrag_t df = export_queue.front().first; @@ -627,6 +650,7 @@ void Migrator::maybe_do_queued_export() export_dir(dir, dest); } + running = false; } @@ -707,48 +731,76 @@ void Migrator::export_dir(CDir *dir, int dest) dout(7) << "already exporting" << dendl; return; } - - // locks? - set locks; - get_export_lock_set(dir, locks); - if (!mds->locker->rdlock_try_set(locks)) { - dout(7) << "export_dir can't rdlock needed locks, failing." << dendl; - return; - } - // ok. 
- mds->locker->rdlock_take_set(locks); + dir->auth_pin(this); + dir->state_set(CDir::STATE_EXPORTING); + + MDRequest *mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR); + mdr->more()->export_dir = dir; assert(export_state.count(dir) == 0); export_state_t& stat = export_state[dir]; - stat.state = EXPORT_DISCOVERING; + stat.state = EXPORT_LOCKING; stat.peer = dest; - stat.tid = ++last_export_tid; - stat.locks.swap(locks); + stat.tid = mdr->reqid.tid; + stat.mut = mdr; + + dispatch_export_dir(mdr); +} + +void Migrator::dispatch_export_dir(MDRequest *mdr) +{ + dout(7) << "dispatch_export_dir " << *mdr << dendl; + CDir *dir = mdr->more()->export_dir; + + map::iterator it = export_state.find(dir); + if (it == export_state.end() || it->second.tid != mdr->reqid.tid) { + // export must have aborted. + dout(7) << "export must have aborted " << *mdr << dendl; + mds->mdcache->request_finish(mdr); + return; + } + assert(it->second.state == EXPORT_LOCKING); + + if (mdr->aborted || dir->is_frozen() || dir->is_freezing()) { + dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl; + export_try_cancel(dir); + return; + } + + // locks? + set rdlocks; + set xlocks; + set wrlocks; + get_export_lock_set(dir, rdlocks); + + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true)) { + if (mdr->aborted) + export_try_cancel(dir); + return; + } - dir->state_set(CDir::STATE_EXPORTING); assert(g_conf->mds_kill_export_at != 1); + it->second.state = EXPORT_DISCOVERING; // send ExportDirDiscover (ask target) filepath path; dir->inode->make_path(path); MExportDirDiscover *discover = new MExportDirDiscover(dir->dirfrag(), path, - mds->get_nodeid(), stat.tid); - mds->send_message_mds(discover, dest); + mds->get_nodeid(), + it->second.tid); + mds->send_message_mds(discover, it->second.peer); assert(g_conf->mds_kill_export_at != 2); - // start the freeze, but hold it up with an auth_pin. 
utime_t now = ceph_clock_now(g_ceph_context); export_freezing_dirs.insert(make_pair(now, dir)); export_freezing_state[dir].start_time = now; - dir->auth_pin(this); dir->freeze_tree(); assert(dir->is_freezing_tree()); dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir)); } - /* * called on receipt of MExportDirDiscoverAck * the importer now has the directory's _inode_ in memory, and pinned. @@ -771,7 +823,10 @@ void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m) } else { assert(it->second.state == EXPORT_DISCOVERING); // release locks to avoid deadlock - export_unlock(dir); + MDRequest *mdr = dynamic_cast(it->second.mut); + assert(mdr); + mds->mdcache->request_finish(mdr); + it->second.mut = NULL; // freeze the subtree it->second.state = EXPORT_FREEZING; dir->auth_unpin(this); @@ -826,9 +881,9 @@ void Migrator::export_frozen(CDir *dir) CInode *diri = dir->inode; // ok, try to grab all my locks. - set locks; - get_export_lock_set(dir, locks); - if (!mds->locker->can_rdlock_set(locks)) { + set rdlocks; + get_export_lock_set(dir, rdlocks); + if (!mds->locker->can_rdlock_set(rdlocks)) { dout(7) << "export_dir couldn't rdlock all needed locks, failing. 
" << *diri << dendl; @@ -843,8 +898,8 @@ void Migrator::export_frozen(CDir *dir) return; } - mds->locker->rdlock_take_set(locks); - it->second.locks.swap(locks); + it->second.mut = new Mutation; + mds->locker->rdlock_take_set(rdlocks, it->second.mut); cache->show_subtrees(); @@ -1548,8 +1603,6 @@ void Migrator::export_reverse(CDir *dir) // unfreeze dir->unfreeze_tree(); - export_unlock(dir); - cache->show_cache(); } @@ -1651,17 +1704,6 @@ void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m) m->put(); } -void Migrator::export_unlock(CDir *dir) -{ - dout(10) << "export_unlock " << *dir << dendl; - - mds->locker->rdlock_finish_set(export_state[dir].locks); - export_state[dir].locks.clear(); - - list ls; - mds->queue_waiters(ls); -} - void Migrator::export_finish(CDir *dir) { dout(5) << "export_finish " << *dir << dendl; @@ -1712,9 +1754,6 @@ void Migrator::export_finish(CDir *dir) !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) dir->get_inode()->clear_scatter_dirty(); - // unpin path - export_unlock(dir); - // discard delayed expires cache->discard_delayed_expire(dir); @@ -1724,6 +1763,14 @@ void Migrator::export_finish(CDir *dir) // queue finishers mds->queue_waiters(it->second.waiting_for_finish); + // unpin path + Mutation *mut = it->second.mut; + if (mut) { + mds->locker->drop_locks(mut); + mut->cleanup(); + delete mut; + } + export_state.erase(it); cache->show_subtrees(); diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h index 39542a374879..dec961e0650a 100644 --- a/src/mds/Migrator.h +++ b/src/mds/Migrator.h @@ -48,26 +48,28 @@ class MExportCapsAck; class EImportStart; +struct Mutation; class Migrator { private: MDS *mds; MDCache *cache; - uint64_t last_export_tid; // -- exports -- public: // export stages. used to clean up intelligently if there's a failure. 
- const static int EXPORT_CANCELLED = 0; // cancelled - const static int EXPORT_DISCOVERING = 1; // dest is disovering export dir - const static int EXPORT_FREEZING = 2; // we're freezing the dir tree - const static int EXPORT_PREPPING = 3; // sending dest spanning tree to export bounds - const static int EXPORT_WARNING = 4; // warning bystanders of dir_auth_pending - const static int EXPORT_EXPORTING = 5; // sent actual export, waiting for ack - const static int EXPORT_LOGGINGFINISH = 6; // logging EExportFinish - const static int EXPORT_NOTIFYING = 7; // waiting for notifyacks + const static int EXPORT_CANCELLED = 0; // cancelled + const static int EXPORT_LOCKING = 1; // acquiring locks + const static int EXPORT_DISCOVERING = 2; // dest is disovering export dir + const static int EXPORT_FREEZING = 3; // we're freezing the dir tree + const static int EXPORT_PREPPING = 4; // sending dest spanning tree to export bounds + const static int EXPORT_WARNING = 5; // warning bystanders of dir_auth_pending + const static int EXPORT_EXPORTING = 6; // sent actual export, waiting for ack + const static int EXPORT_LOGGINGFINISH = 7; // logging EExportFinish + const static int EXPORT_NOTIFYING = 8; // waiting for notifyacks static const char *get_export_statename(int s) { switch (s) { + case EXPORT_LOCKING: return "locking"; case EXPORT_DISCOVERING: return "discovering"; case EXPORT_FREEZING: return "freezing"; case EXPORT_PREPPING: return "prepping"; @@ -85,11 +87,12 @@ protected: int state; int peer; uint64_t tid; - set locks; set warning_ack_waiting; set notify_ack_waiting; map > peer_imported; list waiting_for_finish; + Mutation *mut; + export_state_t() : mut(NULL) {} }; map export_state; @@ -145,7 +148,7 @@ protected: public: // -- cons -- - Migrator(MDS *m, MDCache *c) : mds(m), cache(c), last_export_tid(0) {} + Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {} void dispatch(Message*); @@ -224,6 +227,7 @@ public: // -- import/export -- // exporter public: + void 
dispatch_export_dir(MDRequest *mdr); void export_dir(CDir *dir, int dest); void export_empty_import(CDir *dir); @@ -278,7 +282,6 @@ public: void handle_export_ack(MExportDirAck *m); void export_logged_finish(CDir *dir); void handle_export_notify_ack(MExportDirNotifyAck *m); - void export_unlock(CDir *dir); void export_finish(CDir *dir); void export_freeze_finish(CDir *dir) { diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index dd0294ad4797..bd0901490321 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -237,6 +237,9 @@ struct MDRequest : public Mutation { list waiting_for_finish; + // export + CDir* export_dir; + More() : srcdn_auth_mds(-1), src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 823688ea5f3f..7c6e3bdf4b00 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1430,7 +1430,7 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m) assert(mdr->more()->waiting_on_slave.count(from)); mdr->more()->waiting_on_slave.erase(from); assert(mdr->more()->waiting_on_slave.empty()); - dispatch_client_request(mdr); + mdcache->dispatch_request(mdr); } break; @@ -1448,7 +1448,7 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m) assert(mdr->more()->waiting_on_slave.count(from)); mdr->more()->waiting_on_slave.erase(from); assert(mdr->more()->waiting_on_slave.empty()); - dispatch_client_request(mdr); + mdcache->dispatch_request(mdr); } break; @@ -1771,7 +1771,7 @@ void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack) // go again? if (mdr->more()->waiting_on_slave.empty()) - dispatch_client_request(mdr); + mdcache->dispatch_request(mdr); else dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; }