git commit -s -F - <<EOF
ceph: MDS client
-The MDS client is responsible for submitting requests to the MDS
-cluster and parsing the response. We decide which MDS to submit each
-request to based on cached information about the current partition of
-the directory hierarchy across the cluster. A stateful session is
-opened with each MDS before we submit requests to it, and a mutex is
-used to control the ordering of messages within each session.
+The MDS (metadata server) client is responsible for submitting
+requests to the MDS cluster and parsing the response. We decide which
+MDS to submit each request to based on cached information about the
+current partition of the directory hierarchy across the cluster. A
+stateful session is opened with each MDS before we submit requests to
+it, and a mutex is used to control the ordering of messages within
+each session.
An MDS request may generate two responses. The first indicates the
operation was a success and returns any result. A second reply is
sent when the operation commits to disk. Note that locking on the MDS
ensures that the results of updates are visible only to the updating
-client before the operation commits.
-
-Requests are linked to the containing directory so that an fsync will
-wait for them to commit.
+client before the operation commits. Requests are linked to the
+containing directory so that an fsync will wait for them to commit.
If an MDS fails and/or recovers, we resubmit requests as needed. We
also reconnect existing capabilities to a recovering MDS to
-reestablish that shared session state.
+reestablish that shared session state. Old dentry leases are
+invalidated.
EOF
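To make the unsafe-request bookkeeping above concrete, here is a
minimal sketch of how an fsync on a directory could wait for linked
requests to commit.  The i_unsafe_dirops and r_unsafe_dir_item lists
are from this patch; the r_safe_completion field (completed when the
second, "safe" reply arrives) is an assumed name, not necessarily the
real one.

static int wait_on_unsafe_dirops(struct inode *dir)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_mds_request *req;
	int err = 0;

	spin_lock(&ci->i_unsafe_lock);
	while (!list_empty(&ci->i_unsafe_dirops)) {
		req = list_entry(ci->i_unsafe_dirops.next,
				 struct ceph_mds_request, r_unsafe_dir_item);
		ceph_mdsc_get_request(req);
		spin_unlock(&ci->i_unsafe_lock);
		/* the safe reply unlinks req from i_unsafe_dirops,
		 * so this loop makes progress */
		err = wait_for_completion_interruptible(
			&req->r_safe_completion);
		ceph_mdsc_put_request(req);
		if (err)
			return err;
		spin_lock(&ci->i_unsafe_lock);
	}
	spin_unlock(&ci->i_unsafe_lock);
	return 0;
}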
stored in the cluster, and ensuring that requests are retried or
redirected in the event of a node failure or data migration.
-If an OSD does not respond before a timeout expires, 'ping' messages
-are sent across the lossless, ordered communications channel to
-ensure that any break in the TCP is discovered. If the session does
-reset, a reconnection is attempted and affected requests are resent
-(by the message transport layer).
+If an OSD does not respond before a timeout expires, keepalive
+messages are sent across the lossless, ordered communications channel
+to ensure that any break in the TCP is discovered. If the session
+does reset, a reconnection is attempted and affected requests are
+resent (by the message transport layer).
EOF
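A sketch of that timeout check, for illustration only: the field and
helper names used here (timeout_work, req_lru, r_stamp,
ceph_con_keepalive) are assumptions about the osd_client, not
confirmed interfaces.

static void handle_osd_timeout(struct work_struct *work)
{
	struct ceph_osd_client *osdc =
		container_of(work, struct ceph_osd_client,
			     timeout_work.work);
	unsigned long timeout = 60*HZ;	/* example value */
	struct ceph_osd_request *req;

	mutex_lock(&osdc->request_mutex);
	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
		if (time_before(jiffies, req->r_stamp + timeout))
			break;	/* LRU order: the rest are younger */
		/* ping the OSD; if the TCP session is dead the
		 * messenger notices, reconnects, and resends */
		ceph_con_keepalive(&req->r_osd->o_con);
	}
	mutex_unlock(&osdc->request_mutex);
	schedule_delayed_work(&osdc->timeout_work, timeout);
}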
#include "messenger.h"
#include "decode.h"
+/*
+ * A cluster of MDS (metadata server) daemons is responsible for
+ * managing the file system namespace (the directory hierarchy and
+ * inodes) and for coordinating shared access to storage. Metadata is
+ * partitioned hierarchically across a number of servers, and that
+ * partition varies over time as the cluster adjusts the distribution
+ * in order to balance load.
+ *
+ * The MDS client is primarily responsible for managing synchronous
+ * metadata requests for operations like open, unlink, and so forth.
+ * If there is an MDS failure, we find out about it when we (possibly
+ * request and) receive a new MDS map, and can resubmit affected
+ * requests.
+ *
+ * For the most part, though, we take advantage of a lossless
+ * communications channel to the MDS, and do not need to worry about
+ * timing out or resubmitting requests.
+ *
+ * We maintain a stateful "session" with each MDS we interact with.
+ * Within each session, we send periodic heartbeat messages to ensure
+ * any capabilities or leases we have been issued remain valid. If
+ * the session times out and goes stale, our leases and capabilities
+ * are no longer valid.
+ */
+
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head);
static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
{
- dout("get_session %p %d -> %d\n", s,
+ dout("mdsc get_session %p %d -> %d\n", s,
atomic_read(&s->s_ref), atomic_read(&s->s_ref)+1);
atomic_inc(&s->s_ref);
return s;
void ceph_put_mds_session(struct ceph_mds_session *s)
{
- dout("put_session %p %d -> %d\n", s,
+ dout("mdsc put_session %p %d -> %d\n", s,
atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
if (atomic_dec_and_test(&s->s_ref))
kfree(s);
return session;
}
+static bool __have_session(struct ceph_mds_client *mdsc, int mds)
+{
+ if (mds >= mdsc->max_sessions)
+ return false;
+ return mdsc->sessions[mds];
+}
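/*
 * Illustrative use (a sketch, not code from this patch): callers hold
 * mdsc->mutex and fall through to the create+register path below when
 * no session exists yet:
 *
 *	if (!__have_session(mdsc, mds))
 *		session = register_session(mdsc, mds);
 *
 * (register_session is an assumed name for that path.)
 */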
/*
* create+register a new session for given mds.
void ceph_mdsc_put_request(struct ceph_mds_request *req)
{
- dout("put_request %p %d -> %d\n", req,
+ dout("mdsc put_request %p %d -> %d\n", req,
atomic_read(&req->r_ref), atomic_read(&req->r_ref)-1);
if (atomic_dec_and_test(&req->r_ref)) {
if (req->r_request)
}
/*
- * Register an in-flight request, and assign a tid in msg request header.
+ * Register an in-flight request, and assign a tid. Link to directory
+ * we are modifying (if any).
*
* Called under mdsc->mutex.
*/
static void __register_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
- struct inode *listener)
+ struct inode *dir)
{
req->r_tid = ++mdsc->last_tid;
if (req->r_num_caps)
ceph_mdsc_get_request(req);
radix_tree_insert(&mdsc->request_tree, req->r_tid, (void *)req);
- if (listener) {
- struct ceph_inode_info *ci = ceph_inode(listener);
+ if (dir) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
spin_lock(&ci->i_unsafe_lock);
- req->r_unsafe_dir = listener;
+ req->r_unsafe_dir = dir;
list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
spin_unlock(&ci->i_unsafe_lock);
}
}
}
-static bool __have_session(struct ceph_mds_client *mdsc, int mds)
-{
- if (mds >= mdsc->max_sessions)
- return false;
- return mdsc->sessions[mds];
-}
-
/*
* Choose mds to send request to next. If there is a hint set in
* the request (e.g., due to a prior forward hint from the mds), use
return 0;
}
+/*
+ * session caps
+ */
+
/*
* Free preallocated cap messages assigned to this session
*/
wake_up_session_caps(session);
}
-
+/*
+ * send a session close request
+ */
static int request_close_session(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
return err;
}
+/*
+ * flush all dirty inode data to disk.
+ *
+ * returns true if we've flushed through want_flush_seq
+ */
+static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+{
+ int mds, ret = 1;
+
+ dout("check_cap_flush want %lld\n", want_flush_seq);
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
+ struct ceph_mds_session *session = mdsc->sessions[mds];
+
+ if (!session)
+ continue;
+ get_session(session);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+ if (!list_empty(&session->s_cap_flushing)) {
+ struct ceph_inode_info *ci =
+ list_entry(session->s_cap_flushing.next,
+ struct ceph_inode_info,
+ i_flushing_item);
+ struct inode *inode = &ci->vfs_inode;
+
+ spin_lock(&inode->i_lock);
+ if (ci->i_cap_flush_seq <= want_flush_seq) {
+ dout("check_cap_flush still flushing %p "
+ "seq %lld <= %lld to mds%d\n", inode,
+ ci->i_cap_flush_seq, want_flush_seq,
+ session->s_mds);
+ ret = 0;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+
+ if (!ret)
+ return ret;
+ mutex_lock(&mdsc->mutex);
+ }
+
+ mutex_unlock(&mdsc->mutex);
+ dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+ return ret;
+}
+
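/*
 * Sketch of the intended caller on the sync path, assuming the
 * cap-flush completion code wakes cap_flushing_wq (the wait queue
 * declared in mds_client.h below):
 */
static void wait_caps_flush(struct ceph_mds_client *mdsc,
			    u64 want_flush_seq)
{
	dout("wait_caps_flush want %lld\n", want_flush_seq);
	wait_event(mdsc->cap_flushing_wq,
		   check_cap_flush(mdsc, want_flush_seq));
	dout("wait_caps_flush done\n");
}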
/*
* called under s_mutex
*/
spin_unlock(&session->s_cap_lock);
}
+/*
+ * requests
+ */
+
/*
* Create an mds request.
*/
* session setup, forwarding, retry details.
*/
int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
- struct inode *listener,
+ struct inode *dir,
struct ceph_mds_request *req)
{
int err;
/* issue */
mutex_lock(&mdsc->mutex);
- __register_request(mdsc, req, listener);
+ __register_request(mdsc, req, dir);
__do_request(mdsc, req);
/* wait */
CEPH_MDS_LEASE_RELEASE, seq);
}
+/*
+ * drop all leases (and dentry refs) in preparation for umount
+ */
+static void drop_leases(struct ceph_mds_client *mdsc)
+{
+ int i;
+
+ dout("drop_leases\n");
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+ if (!s)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
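+ /* take and drop each session mutex: this waits out any handler
+  * still running under s_mutex before we return */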
+ mutex_unlock(&s->s_mutex);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+
/*
* delayed work -- periodically trim expired leases, renew caps with mds
int renew_caps;
u32 want_map = 0;
- dout("delayed_work\n");
+ dout("mdsc delayed_work\n");
ceph_check_delayed_caps(mdsc, 0);
mutex_lock(&mdsc->mutex);
INIT_LIST_HEAD(&mdsc->dentry_lru);
}
-/*
- * drop all leases (and dentry refs) in preparation for umount
- */
-static void drop_leases(struct ceph_mds_client *mdsc)
-{
- int i;
-
- dout("drop_leases\n");
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++) {
- struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
- if (!s)
- continue;
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- mutex_unlock(&s->s_mutex);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
-}
-
/*
* Wait for safe replies on open mds requests. If we time out, drop
* all requests from the tree to avoid dangling dentry refs.
wait_requests(mdsc);
}
-/*
- * sync - flush all dirty inode data to disk.
- *
- * returns true if we've flushed through want_flush_seq
- */
-static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
-{
- int mds, ret = 1;
-
- dout("check_cap_flush want %lld\n", want_flush_seq);
- mutex_lock(&mdsc->mutex);
- for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
- struct ceph_mds_session *session = mdsc->sessions[mds];
-
- if (!session)
- continue;
- get_session(session);
- mutex_unlock(&mdsc->mutex);
-
- mutex_lock(&session->s_mutex);
- if (!list_empty(&session->s_cap_flushing)) {
- struct ceph_inode_info *ci =
- list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item);
- struct inode *inode = &ci->vfs_inode;
-
- spin_lock(&inode->i_lock);
- if (ci->i_cap_flush_seq <= want_flush_seq) {
- dout("check_cap_flush still flushing %p "
- "seq %lld <= %lld to mds%d\n", inode,
- ci->i_cap_flush_seq, want_flush_seq,
- session->s_mds);
- ret = 0;
- }
- spin_unlock(&inode->i_lock);
- }
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
-
- if (!ret)
- return ret;
- mutex_lock(&mdsc->mutex);
- }
-
- mutex_unlock(&mdsc->mutex);
- dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
- return ret;
-}
-
/*
* wait for all write mds requests to flush.
*/
#include "messenger.h"
#include "mdsmap.h"
-/*
- * A cluster of MDS (metadata server) daemons is responsible for
- * managing the file system namespace (the directory hierarchy and
- * inodes) and for coordinating shared access to storage. Metadata is
- * partitioning hierarchically across a number of servers, and that
- * partition varies over time as the cluster adjusts the distribution
- * in order to balance load.
- *
- * The MDS client is primarily responsible to managing synchronous
- * metadata requests for operations like open, unlink, and so forth.
- * If there is a MDS failure, we find out about it when we (possibly
- * request and) receive a new MDS map, and can resubmit affected
- * requests.
- *
- * For the most part, though, we take advantage of a lossless
- * communications channel to the MDS, and do not need to worry about
- * timing out or resubmitting requests.
- *
- * We maintain a stateful "session" with each MDS we interact with.
- * Within each session, we sent periodic heartbeat messages to ensure
- * any capabilities or leases we have been issues remain valid. If
- * the session times out and goes stale, our leases and capabilities
- * are no longer valid.
- */
-
/*
* Some lock dependencies:
*
};
/*
- * parsed info about an mds reply, including a "trace" from
- * the referenced inode, through its parents up to the root
- * directory, and directory contents (for readdir results).
+ * parsed info about an mds reply, including information about the
+ * target inode and/or its parent directory and dentry, and directory
+ * contents (for readdir results).
*/
struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_head *head;
unsigned long s_ttl; /* time until mds kills us */
u64 s_seq; /* incoming msg seq # */
struct mutex s_mutex; /* serialize session messages */
- spinlock_t s_cap_lock; /* protects s_caps, s_cap_{gen,ttl} */
+
+ /* protected by s_cap_lock */
+ spinlock_t s_cap_lock;
u32 s_cap_gen; /* inc each time we get mds stale msg */
unsigned long s_cap_ttl; /* when session caps expire */
- unsigned long s_renew_requested; /* last time we sent a renew req */
struct list_head s_caps; /* all caps issued by this session */
int s_nr_caps, s_trim_caps;
- atomic_t s_ref;
- struct list_head s_waiting; /* waiting requests */
- struct list_head s_unsafe; /* unsafe requests */
-
int s_num_cap_releases;
struct list_head s_cap_releases; /* waiting cap_release messages */
struct list_head s_cap_releases_done; /* ready to send */
- struct list_head s_cap_flushing; /* inodes w/ flushing caps */
+ /* protected by mutex */
+ struct list_head s_cap_flushing; /* inodes w/ flushing caps */
u64 s_cap_flush_tid;
+ unsigned long s_renew_requested; /* last time we sent a renew req */
+
+ atomic_t s_ref;
+ struct list_head s_waiting; /* waiting requests */
+ struct list_head s_unsafe; /* unsafe requests */
};
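/*
 * A minimal sketch (not from this patch) of the lock nesting the
 * annotations above imply: s_cap_lock is a spinlock, so when both
 * locks are needed it must nest inside s_mutex, never the reverse.
 */
static void example_renew_session_ttl(struct ceph_mds_session *s,
				      unsigned long ttl)
{
	mutex_lock(&s->s_mutex);
	spin_lock(&s->s_cap_lock);
	s->s_cap_ttl = ttl;	/* "when session caps expire" */
	spin_unlock(&s->s_cap_lock);
	mutex_unlock(&s->s_mutex);
}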
/*
struct ceph_mds_request {
u64 r_tid; /* transaction id */
- int r_op;
+ int r_op; /* mds op code */
/* operation on what? */
struct inode *r_inode; /* arg1 */
const char *r_path1, *r_path2;
struct ceph_vino r_ino1, r_ino2;
+ struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
+ struct inode *r_target_inode; /* resulting inode */
+
union ceph_mds_request_args r_args;
+ int r_fmode; /* file mode, if expecting cap */
+
+ /* for choosing which mds to send this request to */
+ int r_direct_mode;
+ u32 r_direct_hash; /* choose dir frag based on this dentry hash */
+ bool r_direct_is_hash; /* true if r_direct_hash is valid */
/* data payload is used for xattr ops */
struct page **r_pages;
struct inode *r_old_inode;
int r_old_inode_drop, r_old_inode_unless;
- struct inode *r_target_inode; /* resulting inode */
-
struct ceph_msg *r_request; /* original request */
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
int r_err;
- unsigned long r_timeout; /* optional. jiffies */
+ unsigned long r_timeout; /* optional. jiffies */
unsigned long r_started; /* start time to measure timeout against */
unsigned long r_request_started; /* start time for mds request only,
used to measure lease durations */
- /* for choosing which mds to send this request to */
- int r_direct_mode;
- u32 r_direct_hash; /* choose dir frag based on this dentry hash */
- bool r_direct_is_hash; /* true if r_direct_hash is valid */
-
/* link unsafe requests to parent directory, for fsync */
struct inode *r_unsafe_dir;
struct list_head r_unsafe_dir_item;
- /* references to the trailing dentry and inode from parsing the
- * mds response. also used to feed a VFS-provided dentry into
- * the reply handler */
- int r_fmode; /* file mode, if expecting cap */
struct ceph_mds_session *r_session;
struct ceph_mds_session *r_fwd_session; /* forwarded from */
- struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
int r_attempts; /* resend attempts */
int r_num_fwd; /* number of forward attempts */
spinlock_t cap_dirty_lock; /* protects above items */
wait_queue_head_t cap_flushing_wq;
- struct dentry *debugfs_file;
+ struct dentry *debugfs_file;
- spinlock_t dentry_lru_lock;
- struct list_head dentry_lru;
- int num_dentry;
+ spinlock_t dentry_lru_lock;
+ struct list_head dentry_lru;
+ int num_dentry;
};
extern const char *ceph_mds_op_name(int op);
-extern struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
+extern struct ceph_mds_session *
+__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
inline static struct ceph_mds_session *
ceph_get_mds_session(struct ceph_mds_session *s)
return s;
}
-/*
- * requests
- */
-static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
-{
- atomic_inc(&req->r_ref);
-}
-
extern void ceph_put_mds_session(struct ceph_mds_session *s);
extern void ceph_send_msg_mds(struct ceph_mds_client *mdsc,
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
- struct inode *listener,
+ struct inode *dir,
struct ceph_mds_request *req);
+static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
+{
+ atomic_inc(&req->r_ref);
+}
extern void ceph_mdsc_put_request(struct ceph_mds_request *req);
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
-
extern void ceph_mdsc_handle_reset(struct ceph_mds_client *mdsc, int mds);
-extern struct ceph_mds_request *ceph_mdsc_get_listener_req(struct inode *inode,
- u64 tid);
extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
int stop_on_nosnap);
#include "types.h"
/*
- * mds map - describe servers in the mds cluster
+ * mds map - describe servers in the mds cluster.
*
- * fields limited to those the client cares about
+ * we limit fields to those the client actually cares about
*/
struct ceph_mdsmap {
u32 m_epoch, m_client_epoch, m_last_failure;
struct ceph_entity_addr *m_addr; /* mds addrs */
s32 *m_state; /* states */
+ /* which object pools file data can be stored in */
int m_num_data_pg_pools;
u32 *m_data_pg_pools;
u32 m_cas_pg_pool;
*/
void ceph_osdc_put_request(struct ceph_osd_request *req)
{
- dout("put_request %p %d -> %d\n", req, atomic_read(&req->r_ref),
+ dout("osdc put_request %p %d -> %d\n", req, atomic_read(&req->r_ref),
atomic_read(&req->r_ref)-1);
BUG_ON(atomic_read(&req->r_ref) <= 0);
if (atomic_dec_and_test(&req->r_ref)) {