From ed6a18d90fdd1dc869369fb92c2aad43bc5c9a34 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 8 Jul 2009 09:39:35 -0700 Subject: [PATCH] mds: replay client ops one at a time Before we were assuming we could queue up all replayed ops and the locking would ensure they'd occur in the proper order (and in so doing unwind any dependencies). Not so fast. Instead, do one at a time and queue the next up as each finishes. --- src/TODO | 24 ++++++++++++++++++++++++ src/mds/MDCache.cc | 5 +++++ src/mds/MDS.cc | 12 ++++++++---- src/mds/MDS.h | 7 +++++++ src/mds/Server.cc | 12 +++++++++--- src/mds/journal.cc | 5 +++-- 6 files changed, 56 insertions(+), 9 deletions(-) diff --git a/src/TODO b/src/TODO index bd465e3996dd7..41453ae73167f 100644 --- a/src/TODO +++ b/src/TODO @@ -34,6 +34,30 @@ v0.10 bugs 09.07.07 16:54:31.246206 mds0.locker ignoring client capid 698 != my 1081 +mds/Locker.cc: In function 'void Locker::eval_gather(SimpleLock*, bool, bool*)': +mds/Locker.cc:386: FAILED assert(!lock->is_stable()) + 1: ./cmds(_Z18__ceph_assert_failPKcS0_iS0_+0x34) [0x91b17a] + 2: ./cmds(_ZN6Locker11eval_gatherEP10SimpleLockbPb+0xbf) [0x8040cf] + 3: ./cmds(_ZN7MDCache15do_file_recoverEv+0x336) [0x787eb6] + 4: ./cmds(_ZN7MDCache10_recoveredEP6CInodei+0x1bf) [0x7880c9] + 5: ./cmds(_ZN13C_MDC_Recover6finishEi+0x27) [0x7fd64f] + 6: ./cmds(_ZN5Filer7_probedEPNS_5ProbeERK8object_tm7utime_t+0xb60) [0x8d46d2] + 7: ./cmds(_ZN5Filer7C_Probe6finishEi+0x3f) [0x8d9f97] + 8: ./cmds(_ZN8Objecter6C_Stat6finishEi+0xa0) [0x8d9b34] + 9: ./cmds(_ZN8Objecter19handle_osd_op_replyEP11MOSDOpReply+0xa9d) [0x8aec39] + 10: ./cmds(_ZN3MDS9_dispatchEP7Message+0x8b0) [0x6e838e] + 11: ./cmds(_ZN3MDS13dispatch_implEP7Message+0x69d) [0x6e979b] + 12: ./cmds(_ZN10Dispatcher9_dispatchEP7Message+0x54) [0x6c4d8e] + 13: ./cmds(_ZN10Dispatcher8dispatchEP7Message+0x1d) [0x6cccff] + 14: ./cmds(_ZN9Messenger8dispatchEP7Message+0x56) [0x6db42e] + 15: ./cmds(_ZN15SimpleMessenger8Endpoint14dispatch_entryEv+0x5cb) [0x6d5351] + 
16: ./cmds(_ZN15SimpleMessenger8Endpoint14DispatchThread5entryEv+0x19) [0x6e1627] + 17: ./cmds(_ZN6Thread11_entry_funcEPv+0x20) [0x6db97e] + 18: /lib/libpthread.so.0 [0x2b25cda38fc7] + 19: /lib/libc.so.6(clone+0x6d) [0x2b25ce4b75ad] + NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this. + + later - authentication diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index e73529c19146b..b1ba58c71a4ce 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6228,6 +6228,11 @@ void MDCache::request_finish(MDRequest *mdr) mds->logger->favg(l_mds_replyl, g_clock.now() - mdr->client_request->get_recv_stamp()); } + if (mdr->client_request && mdr->client_request->is_replay()) { + dout(10) << " queueing next replay op" << dendl; + mds->queue_one_replay(); + } + request_cleanup(mdr); } diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 4938d5882745d..7f1f1e006cbb3 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -1021,7 +1021,7 @@ void MDS::rejoin_done() void MDS::clientreplay_start() { dout(1) << "clientreplay_start" << dendl; - queue_waiters(waiting_for_replay); + queue_one_replay(); } void MDS::clientreplay_done() @@ -1309,9 +1309,13 @@ bool MDS::_dispatch(Message *m) if (!finishing && is_clientreplay() && mdcache->is_open() && - mdcache->get_num_active_requests() == 0 && - want_state == MDSMap::STATE_CLIENTREPLAY) - clientreplay_done(); + waiting_for_replay.empty() && + want_state == MDSMap::STATE_CLIENTREPLAY) { + dout(10) << " still have " << mdcache->get_num_active_requests() + << " active replay requests" << dendl; + if (mdcache->get_num_active_requests() == 0) + clientreplay_done(); + } } // hack: thrash exports diff --git a/src/mds/MDS.h b/src/mds/MDS.h index 80a8bf66a042e..91cc7fa2d0185 100644 --- a/src/mds/MDS.h +++ b/src/mds/MDS.h @@ -232,6 +232,13 @@ class MDS : public Dispatcher { void queue_waiters(list<Context*>& ls) { finished_queue.splice( finished_queue.end(), ls ); } + bool queue_one_replay() { + if (waiting_for_replay.empty()) + 
return false; + queue_waiter(waiting_for_replay.front()); + waiting_for_replay.pop_front(); + return true; + } // -- keepalive beacon -- version_t beacon_last_seq; // last seq sent to monitor diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 78ad267636c98..72e6702d5ed98 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -619,7 +619,7 @@ void Server::journal_and_reply(MDRequest *mdr, CInode *in, CDentry *dn, LogEvent mdr->pin(dn); early_reply(mdr, in, dn); - + mdr->committing = true; mdlog->submit_entry(le, fin, mdr->did_ino_allocation()); @@ -660,10 +660,11 @@ void Server::early_reply(MDRequest *mdr, CInode *tracei, CDentry *tracedn) return; if (req->is_replay()) { - dout(10) << "early_reply - none for replay request" << dendl; + dout(10) << " no early reply on replay op" << dendl; return; } + MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_unsafe(); @@ -899,6 +900,10 @@ void Server::handle_client_request(MClientRequest *req) if (session->have_completed_request(req->get_reqid().tid)) { dout(5) << "already completed " << req->get_reqid() << dendl; mds->messenger->send_message(new MClientReply(req, 0), req->get_orig_source_inst()); + + if (req->is_replay()) + mds->queue_one_replay(); + delete req; return; } @@ -1427,7 +1432,8 @@ CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino) in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used mds->sessionmap.projected++; dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino - << " (" << mdr->session->prealloc_inos.size() << " left)" + << " (" << mdr->session->prealloc_inos + << ", " << mdr->session->prealloc_inos.size() << " left)" << dendl; } else { mdr->alloc_ino = diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 439a8ca3a86d2..e26f2d6f7db46 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -544,7 +544,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) dout(10) << "EMetaBlob.replay inotable tablev " 
<< inotablev << " <= table " << mds->inotable->get_version() << dendl; } else { - dout(10) << " EMetaBlob.replay inotable v " << inotablev + dout(10) << "EMetaBlob.replay inotable v " << inotablev << " - 1 == table " << mds->inotable->get_version() << " allocated+used " << allocated_ino << " prealloc " << preallocated_inos @@ -570,13 +570,14 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv << " <= table " << mds->sessionmap.version << dendl; } else { - dout(10) << " EMetaBlob.replay sessionmap v" << sessionmapv + dout(10) << "EMetaBlob.replay sessionmap v" << sessionmapv << " -(1|2) == table " << mds->sessionmap.version << " prealloc " << preallocated_inos << " used " << used_preallocated_ino << dendl; Session *session = mds->sessionmap.get_session(client_name); assert(session); + dout(20) << " (session prealloc " << session->prealloc_inos << ")" << dendl; if (used_preallocated_ino) { inodeno_t i = session->take_ino(); assert(i == used_preallocated_ino); -- 2.39.5