From 8c5836153694459aa09528bb73c23503eb97a93e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 21 Jul 2009 16:16:25 -0700 Subject: [PATCH] kclient: mds_client cleanup --- src/kernel/import_patch_set_into_linux_git.sh | 32 +-- src/kernel/mds_client.c | 218 ++++++++++-------- src/kernel/mds_client.h | 101 +++----- src/kernel/mdsmap.h | 5 +- src/kernel/osd_client.c | 2 +- 5 files changed, 182 insertions(+), 176 deletions(-) diff --git a/src/kernel/import_patch_set_into_linux_git.sh b/src/kernel/import_patch_set_into_linux_git.sh index 666c113ea423b..8bf59a69cb443 100755 --- a/src/kernel/import_patch_set_into_linux_git.sh +++ b/src/kernel/import_patch_set_into_linux_git.sh @@ -156,25 +156,25 @@ git add $target/ceph/mdsmap.c git commit -s -F - < %d\n", s, + dout("mdsc get_session %p %d -> %d\n", s, atomic_read(&s->s_ref), atomic_read(&s->s_ref)+1); atomic_inc(&s->s_ref); return s; @@ -252,7 +277,7 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s) void ceph_put_mds_session(struct ceph_mds_session *s) { - dout("put_session %p %d -> %d\n", s, + dout("mdsc put_session %p %d -> %d\n", s, atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) kfree(s); @@ -275,6 +300,12 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, return session; } +static bool __have_session(struct ceph_mds_client *mdsc, int mds) +{ + if (mds >= mdsc->max_sessions) + return false; + return mdsc->sessions[mds]; +} /* * create+register a new session for given mds. @@ -353,7 +384,7 @@ static void put_request_sessions(struct ceph_mds_request *req) void ceph_mdsc_put_request(struct ceph_mds_request *req) { - dout("put_request %p %d -> %d\n", req, + dout("mdsc put_request %p %d -> %d\n", req, atomic_read(&req->r_ref), atomic_read(&req->r_ref)-1); if (atomic_dec_and_test(&req->r_ref)) { if (req->r_request) @@ -403,13 +434,14 @@ static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, } /* - * Register an in-flight request, and assign a tid in msg request header. + * Register an in-flight request, and assign a tid. Link to directory + * are modifying (if any). * * Called under mdsc->mutex. */ static void __register_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, - struct inode *listener) + struct inode *dir) { req->r_tid = ++mdsc->last_tid; if (req->r_num_caps) @@ -418,11 +450,11 @@ static void __register_request(struct ceph_mds_client *mdsc, ceph_mdsc_get_request(req); radix_tree_insert(&mdsc->request_tree, req->r_tid, (void *)req); - if (listener) { - struct ceph_inode_info *ci = ceph_inode(listener); + if (dir) { + struct ceph_inode_info *ci = ceph_inode(dir); spin_lock(&ci->i_unsafe_lock); - req->r_unsafe_dir = listener; + req->r_unsafe_dir = dir; list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); spin_unlock(&ci->i_unsafe_lock); } @@ -444,13 +476,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } } -static bool __have_session(struct ceph_mds_client *mdsc, int mds) -{ - if (mds >= mdsc->max_sessions) - return false; - return mdsc->sessions[mds]; -} - /* * Choose mds to send request to next. 
If there is a hint set in * the request (e.g., due to a prior forward hint from the mds), use @@ -600,6 +625,10 @@ out: return 0; } +/* + * session caps + */ + /* * Free preallocated cap messages assigned to this session */ @@ -770,7 +799,9 @@ static void renewed_caps(struct ceph_mds_client *mdsc, wake_up_session_caps(session); } - +/* + * send a session close request + */ static int request_close_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { @@ -936,6 +967,56 @@ out_unlocked: return err; } +/* + * flush all dirty inode data to disk. + * + * returns true if we've flushed through want_flush_seq + */ +static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +{ + int mds, ret = 1; + + dout("check_cap_flush want %lld\n", want_flush_seq); + mutex_lock(&mdsc->mutex); + for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { + struct ceph_mds_session *session = mdsc->sessions[mds]; + + if (!session) + continue; + get_session(session); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + if (!list_empty(&session->s_cap_flushing)) { + struct ceph_inode_info *ci = + list_entry(session->s_cap_flushing.next, + struct ceph_inode_info, + i_flushing_item); + struct inode *inode = &ci->vfs_inode; + + spin_lock(&inode->i_lock); + if (ci->i_cap_flush_seq <= want_flush_seq) { + dout("check_cap_flush still flushing %p " + "seq %lld <= %lld to mds%d\n", inode, + ci->i_cap_flush_seq, want_flush_seq, + session->s_mds); + ret = 0; + } + spin_unlock(&inode->i_lock); + } + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + + if (!ret) + return ret; + mutex_lock(&mdsc->mutex); + } + + mutex_unlock(&mdsc->mutex); + dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); + return ret; +} + /* * called under s_mutex */ @@ -960,6 +1041,10 @@ static void send_cap_releases(struct ceph_mds_client *mdsc, spin_unlock(&session->s_cap_lock); } +/* + * requests + */ + /* * Create an mds request. */ @@ -1458,7 +1543,7 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, * session setup, forwarding, retry details. 
*/ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, - struct inode *listener, + struct inode *dir, struct ceph_mds_request *req) { int err; @@ -1476,7 +1561,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, /* issue */ mutex_lock(&mdsc->mutex); - __register_request(mdsc, req, listener); + __register_request(mdsc, req, dir); __do_request(mdsc, req); /* wait */ @@ -2382,6 +2467,29 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, CEPH_MDS_LEASE_RELEASE, seq); } +/* + * drop all leases (and dentry refs) in preparation for umount + */ +static void drop_leases(struct ceph_mds_client *mdsc) +{ + int i; + + dout("drop_leases\n"); + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_unlock(&s->s_mutex); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); +} + + /* * delayed work -- periodically trim expired leases, renew caps with mds @@ -2402,7 +2510,7 @@ static void delayed_work(struct work_struct *work) int renew_caps; u32 want_map = 0; - dout("delayed_work\n"); + dout("mdsc delayed_work\n"); ceph_check_delayed_caps(mdsc, 0); mutex_lock(&mdsc->mutex); @@ -2486,28 +2594,6 @@ void ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) INIT_LIST_HEAD(&mdsc->dentry_lru); } -/* - * drop all leases (and dentry refs) in preparation for umount - */ -static void drop_leases(struct ceph_mds_client *mdsc) -{ - int i; - - dout("drop_leases\n"); - mutex_lock(&mdsc->mutex); - for (i = 0; i < mdsc->max_sessions; i++) { - struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); - if (!s) - continue; - mutex_unlock(&mdsc->mutex); - mutex_lock(&s->s_mutex); - mutex_unlock(&s->s_mutex); - ceph_put_mds_session(s); - mutex_lock(&mdsc->mutex); - } - mutex_unlock(&mdsc->mutex); -} - /* * Wait for safe replies on open mds requests. If we time out, drop * all requests from the tree to avoid dangling dentry refs. @@ -2552,56 +2638,6 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) wait_requests(mdsc); } -/* - * sync - flush all dirty inode data to disk. - * - * returns true if we've flushed through want_flush_seq - */ -static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) -{ - int mds, ret = 1; - - dout("check_cap_flush want %lld\n", want_flush_seq); - mutex_lock(&mdsc->mutex); - for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { - struct ceph_mds_session *session = mdsc->sessions[mds]; - - if (!session) - continue; - get_session(session); - mutex_unlock(&mdsc->mutex); - - mutex_lock(&session->s_mutex); - if (!list_empty(&session->s_cap_flushing)) { - struct ceph_inode_info *ci = - list_entry(session->s_cap_flushing.next, - struct ceph_inode_info, - i_flushing_item); - struct inode *inode = &ci->vfs_inode; - - spin_lock(&inode->i_lock); - if (ci->i_cap_flush_seq <= want_flush_seq) { - dout("check_cap_flush still flushing %p " - "seq %lld <= %lld to mds%d\n", inode, - ci->i_cap_flush_seq, want_flush_seq, - session->s_mds); - ret = 0; - } - spin_unlock(&inode->i_lock); - } - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - - if (!ret) - return ret; - mutex_lock(&mdsc->mutex); - } - - mutex_unlock(&mdsc->mutex); - dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); - return ret; -} - /* * wait for all write mds requests to flush. 
*/ diff --git a/src/kernel/mds_client.h b/src/kernel/mds_client.h index a2be61dd736d9..5f394ca7e9330 100644 --- a/src/kernel/mds_client.h +++ b/src/kernel/mds_client.h @@ -11,31 +11,6 @@ #include "messenger.h" #include "mdsmap.h" -/* - * A cluster of MDS (metadata server) daemons is responsible for - * managing the file system namespace (the directory hierarchy and - * inodes) and for coordinating shared access to storage. Metadata is - * partitioning hierarchically across a number of servers, and that - * partition varies over time as the cluster adjusts the distribution - * in order to balance load. - * - * The MDS client is primarily responsible to managing synchronous - * metadata requests for operations like open, unlink, and so forth. - * If there is a MDS failure, we find out about it when we (possibly - * request and) receive a new MDS map, and can resubmit affected - * requests. - * - * For the most part, though, we take advantage of a lossless - * communications channel to the MDS, and do not need to worry about - * timing out or resubmitting requests. - * - * We maintain a stateful "session" with each MDS we interact with. - * Within each session, we sent periodic heartbeat messages to ensure - * any capabilities or leases we have been issues remain valid. If - * the session times out and goes stale, our leases and capabilities - * are no longer valid. - */ - /* * Some lock dependencies: * @@ -66,9 +41,9 @@ struct ceph_mds_reply_info_in { }; /* - * parsed info about an mds reply, including a "trace" from - * the referenced inode, through its parents up to the root - * directory, and directory contents (for readdir results). + * parsed info about an mds reply, including information about the + * target inode and/or its parent directory and dentry, and directory + * contents (for readdir results). */ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_head *head; @@ -119,22 +94,25 @@ struct ceph_mds_session { unsigned long s_ttl; /* time until mds kills us */ u64 s_seq; /* incoming msg seq # */ struct mutex s_mutex; /* serialize session messages */ - spinlock_t s_cap_lock; /* protects s_caps, s_cap_{gen,ttl} */ + + /* protected by s_cap_lock */ + spinlock_t s_cap_lock; u32 s_cap_gen; /* inc each time we get mds stale msg */ unsigned long s_cap_ttl; /* when session caps expire */ - unsigned long s_renew_requested; /* last time we sent a renew req */ struct list_head s_caps; /* all caps issued by this session */ int s_nr_caps, s_trim_caps; - atomic_t s_ref; - struct list_head s_waiting; /* waiting requests */ - struct list_head s_unsafe; /* unsafe requests */ - int s_num_cap_releases; struct list_head s_cap_releases; /* waiting cap_release messages */ struct list_head s_cap_releases_done; /* ready to send */ - struct list_head s_cap_flushing; /* inodes w/ flushing caps */ + /* protected by mutex */ + struct list_head s_cap_flushing; /* inodes w/ flushing caps */ u64 s_cap_flush_tid; + unsigned long s_renew_requested; /* last time we sent a renew req */ + + atomic_t s_ref; + struct list_head s_waiting; /* waiting requests */ + struct list_head s_unsafe; /* unsafe requests */ }; /* @@ -161,7 +139,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request { u64 r_tid; /* transaction id */ - int r_op; + int r_op; /* mds op code */ /* operation on what? 
*/ struct inode *r_inode; /* arg1 */ @@ -170,7 +148,16 @@ struct ceph_mds_request { const char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; + struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ + struct inode *r_target_inode; /* resulting inode */ + union ceph_mds_request_args r_args; + int r_fmode; /* file mode, if expecting cap */ + + /* for choosing which mds to send this request to */ + int r_direct_mode; + u32 r_direct_hash; /* choose dir frag based on this dentry hash */ + bool r_direct_is_hash; /* true if r_direct_hash is valid */ /* data payload is used for xattr ops */ struct page **r_pages; @@ -184,34 +171,22 @@ struct ceph_mds_request { struct inode *r_old_inode; int r_old_inode_drop, r_old_inode_unless; - struct inode *r_target_inode; /* resulting inode */ - struct ceph_msg *r_request; /* original request */ struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; - unsigned long r_timeout; /* optional. jiffies */ + unsigned long r_timeout; /* optional. jiffies */ unsigned long r_started; /* start time to measure timeout against */ unsigned long r_request_started; /* start time for mds request only, used to measure lease durations */ - /* for choosing which mds to send this request to */ - int r_direct_mode; - u32 r_direct_hash; /* choose dir frag based on this dentry hash */ - bool r_direct_is_hash; /* true if r_direct_hash is valid */ - /* link unsafe requests to parent directory, for fsync */ struct inode *r_unsafe_dir; struct list_head r_unsafe_dir_item; - /* references to the trailing dentry and inode from parsing the - * mds response. also used to feed a VFS-provided dentry into - * the reply handler */ - int r_fmode; /* file mode, if expecting cap */ struct ceph_mds_session *r_session; struct ceph_mds_session *r_fwd_session; /* forwarded from */ - struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ int r_attempts; /* resend attempts */ int r_num_fwd; /* number of forward attempts */ @@ -275,16 +250,17 @@ struct ceph_mds_client { spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; - struct dentry *debugfs_file; + struct dentry *debugfs_file; - spinlock_t dentry_lru_lock; - struct list_head dentry_lru; - int num_dentry; + spinlock_t dentry_lru_lock; + struct list_head dentry_lru; + int num_dentry; }; extern const char *ceph_mds_op_name(int op); -extern struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *, int mds); +extern struct ceph_mds_session * +__ceph_lookup_mds_session(struct ceph_mds_client *, int mds); inline static struct ceph_mds_session * ceph_get_mds_session(struct ceph_mds_session *s) @@ -293,14 +269,6 @@ ceph_get_mds_session(struct ceph_mds_session *s) return s; } -/* - * requests - */ -static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) -{ - atomic_inc(&req->r_ref); -} - extern void ceph_put_mds_session(struct ceph_mds_session *s); extern void ceph_send_msg_mds(struct ceph_mds_client *mdsc, @@ -334,16 +302,17 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req); extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, - struct inode *listener, + struct inode *dir, struct ceph_mds_request *req); +static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) +{ + atomic_inc(&req->r_ref); +} extern void ceph_mdsc_put_request(struct ceph_mds_request *req); extern void 
ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); - extern void ceph_mdsc_handle_reset(struct ceph_mds_client *mdsc, int mds); -extern struct ceph_mds_request *ceph_mdsc_get_listener_req(struct inode *inode, - u64 tid); extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, int stop_on_nosnap); diff --git a/src/kernel/mdsmap.h b/src/kernel/mdsmap.h index a9bb3d6ba0d49..b9f311a82f647 100644 --- a/src/kernel/mdsmap.h +++ b/src/kernel/mdsmap.h @@ -4,9 +4,9 @@ #include "types.h" /* - * mds map - describe servers in the mds cluster + * mds map - describe servers in the mds cluster. * - * fields limited to those the client cares about + * we limit fields to those the client actually cares about */ struct ceph_mdsmap { u32 m_epoch, m_client_epoch, m_last_failure; @@ -18,6 +18,7 @@ struct ceph_mdsmap { struct ceph_entity_addr *m_addr; /* mds addrs */ s32 *m_state; /* states */ + /* which object pools file data can be stored in */ int m_num_data_pg_pools; u32 *m_data_pg_pools; u32 m_cas_pg_pool; diff --git a/src/kernel/osd_client.c b/src/kernel/osd_client.c index bfe6a5f13670d..cc796df7561ca 100644 --- a/src/kernel/osd_client.c +++ b/src/kernel/osd_client.c @@ -59,7 +59,7 @@ static void calc_layout(struct ceph_osd_client *osdc, */ void ceph_osdc_put_request(struct ceph_osd_request *req) { - dout("put_request %p %d -> %d\n", req, atomic_read(&req->r_ref), + dout("osdc put_request %p %d -> %d\n", req, atomic_read(&req->r_ref), atomic_read(&req->r_ref)-1); BUG_ON(atomic_read(&req->r_ref) <= 0); if (atomic_dec_and_test(&req->r_ref)) { -- 2.39.5
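The locking pattern shared by check_cap_flush() and drop_leases() in this patch — take a reference to a session while holding mdsc->mutex, drop that mutex, and only then take the per-session s_mutex, reacquiring mdsc->mutex before moving to the next slot — is the subtle part of the cleanup, since it keeps the two locks from ever being held at once. Below is a minimal userspace sketch of that pattern, not part of the patch: the client/session types, pthread locks, and get/put helpers are hypothetical stand-ins for the kernel structures.

/* sketch of the "reference, drop outer lock, take inner lock" walk */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct session {
	atomic_int ref;               /* like ceph_mds_session s_ref */
	pthread_mutex_t s_mutex;      /* like s_mutex */
	int id;
};

struct client {
	pthread_mutex_t mutex;        /* like mdsc->mutex */
	struct session **sessions;    /* like mdsc->sessions */
	int max_sessions;
};

static struct session *get_session(struct session *s)
{
	atomic_fetch_add(&s->ref, 1);
	return s;
}

static void put_session(struct session *s)
{
	if (atomic_fetch_sub(&s->ref, 1) == 1)
		free(s);
}

/* visit every session without holding client->mutex across s_mutex */
static void for_each_session(struct client *client)
{
	int i;

	pthread_mutex_lock(&client->mutex);
	for (i = 0; i < client->max_sessions; i++) {
		struct session *s = client->sessions[i];

		if (!s)
			continue;
		get_session(s);                      /* pin the session */
		pthread_mutex_unlock(&client->mutex);

		pthread_mutex_lock(&s->s_mutex);     /* per-session work */
		printf("visiting session %d\n", s->id);
		pthread_mutex_unlock(&s->s_mutex);

		put_session(s);
		pthread_mutex_lock(&client->mutex);  /* re-take for next slot */
	}
	pthread_mutex_unlock(&client->mutex);
}

int main(void)
{
	struct session *s = calloc(1, sizeof(*s));
	struct session *table[1] = { s };
	struct client client;

	pthread_mutex_init(&client.mutex, NULL);
	client.sessions = table;
	client.max_sessions = 1;

	atomic_init(&s->ref, 1);
	pthread_mutex_init(&s->s_mutex, NULL);
	s->id = 0;

	for_each_session(&client);
	put_session(s);          /* drop the table's reference */
	return 0;
}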