From 6976f2c1c7c7178286d72cb496074a72734421cc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 3 Aug 2009 11:39:27 -0700 Subject: [PATCH] kclient: use caps, fragtree only to choose mds (not hierarchy) Since we require caps for all inodes in our cache, no need to consider parents when identifying where to send a request. Just look at fragtree (for fragmented dirs) or caps. --- src/TODO | 5 +- src/kernel/inode.c | 8 +-- src/kernel/mds_client.c | 126 +++++++++++++++++++++------------------- src/kernel/super.h | 2 +- 4 files changed, 72 insertions(+), 69 deletions(-) diff --git a/src/TODO b/src/TODO index 3dbf931e65993..fa75a27f67905 100644 --- a/src/TODO +++ b/src/TODO @@ -108,12 +108,9 @@ repair - mds scrubbing kclient -- async writepage ?- ensure cap_snaps reflush after client reconnect -- fix up mds selection, and ESTALE handling +- fix up ESTALE handling - make cap import/export efficient -- simplify mds auth tracking? - - use caps instead? - flock, fnctl locks - ACLs - should we try to ref CAP_PIN on special inodes that are open? diff --git a/src/kernel/inode.c b/src/kernel/inode.c index 62e238a3bb90d..c388862d3dac3 100644 --- a/src/kernel/inode.c +++ b/src/kernel/inode.c @@ -189,7 +189,7 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, /* * Process dirfrag (delegation) info from the mds. Include leaf - * fragment in tree ONLY if mds >= 0 || ndist > 0. Otherwise, only + * fragment in tree ONLY if ndist > 0. Otherwise, only * branches/splits are included in i_fragtree) */ static int ceph_fill_dirfrag(struct inode *inode, @@ -204,7 +204,7 @@ static int ceph_fill_dirfrag(struct inode *inode, int err = 0; mutex_lock(&ci->i_fragtree_mutex); - if (mds < 0 && ndist == 0) { + if (ndist == 0) { /* no delegation info needed. 
*/ frag = __ceph_find_frag(ci, id); if (!frag) @@ -241,8 +241,8 @@ static int ceph_fill_dirfrag(struct inode *inode, frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); for (i = 0; i < frag->ndist; i++) frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); - dout("fill_dirfrag %llx.%llx frag %x referral mds %d ndist=%d\n", - ceph_vinop(inode), frag->frag, frag->mds, frag->ndist); + dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n", + ceph_vinop(inode), frag->frag, frag->ndist); out: mutex_unlock(&ci->i_fragtree_mutex); diff --git a/src/kernel/mds_client.c b/src/kernel/mds_client.c index 1308126610798..59ab1769590c5 100644 --- a/src/kernel/mds_client.c +++ b/src/kernel/mds_client.c @@ -478,21 +478,23 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } /* - * Choose mds to send request to next. If there is a hint set in - * the request (e.g., due to a prior forward hint from the mds), use - * that. + * Choose mds to send request to next. If there is a hint set in the + * request (e.g., due to a prior forward hint from the mds), use that. + * Otherwise, consult frag tree and/or caps to identify the + * appropriate mds. If all else fails, choose randomly. * * Called under mdsc->mutex. */ static int __choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_cap *cap; + int mode = req->r_direct_mode; int mds = -1; u32 hash = req->r_direct_hash; bool is_hash = req->r_direct_is_hash; - struct dentry *dentry = req->r_dentry; - struct ceph_inode_info *ci; - int mode = req->r_direct_mode; /* * is there a specific mds we should try? ignore hint if we have @@ -509,65 +511,69 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (mode == USE_RANDOM_MDS) goto random; - /* - * try to find an appropriate mds to contact based on the - * given dentry. walk up the tree until we find delegation info - * in the i_fragtree. 
- * - * if is_hash is true, direct request at the appropriate directory - * fragment (as with a readdir on a fragmented directory). - */ - while (dentry) { - if (is_hash && dentry->d_inode && - S_ISDIR(dentry->d_inode->i_mode)) { - struct ceph_inode_frag frag; - int found; - - ci = ceph_inode(dentry->d_inode); - ceph_choose_frag(ci, hash, &frag, &found); - if (found) { - if (mode == USE_ANY_MDS && frag.ndist > 0) { - u8 r; - - /* choose a random replica */ - get_random_bytes(&r, 1); - r %= frag.ndist; - mds = frag.dist[r]; - dout("choose_mds %p %llx.%llx " - "frag %u mds%d (%d/%d)\n", - dentry->d_inode, - ceph_vinop(&ci->vfs_inode), - frag.frag, frag.mds, - (int)r, frag.ndist); - return mds; - } - /* since the more deeply nested item wasn't - * known to be replicated, then we want to - * look for the authoritative mds. */ - mode = USE_AUTH_MDS; - if (frag.mds >= 0) { - /* choose auth mds */ - mds = frag.mds; - dout("choose_mds %p %llx.%llx " - "frag %u mds%d (auth)\n", - dentry->d_inode, - ceph_vinop(&ci->vfs_inode), - frag.frag, mds); - return mds; - } + inode = 0; + if (req->r_inode) + inode = req->r_inode; + else if (req->r_dentry) + inode = req->r_dentry->d_inode; + dout("__choose_mds %p mode %d\n", inode, mode); + if (!inode) + goto random; + ci = ceph_inode(inode); + + if (is_hash && S_ISDIR(inode->i_mode)) { + struct ceph_inode_frag frag; + int found; + + ceph_choose_frag(ci, hash, &frag, &found); + if (found) { + if (mode == USE_ANY_MDS && frag.ndist > 0) { + u8 r; + + /* choose a random replica */ + get_random_bytes(&r, 1); + r %= frag.ndist; + mds = frag.dist[r]; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (%d/%d)\n", + inode, ceph_vinop(inode), + frag.frag, frag.mds, + (int)r, frag.ndist); + return mds; + } + + /* since this file/dir wasn't known to be + * replicated, then we want to look for the + * authoritative mds. 
*/ + mode = USE_AUTH_MDS; + if (frag.mds >= 0) { + /* choose auth mds */ + mds = frag.mds; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (auth)\n", + inode, ceph_vinop(inode), frag.frag, mds); + return mds; } } - if (IS_ROOT(dentry)) - break; + } - /* move up the hierarchy, but direct request based on the hash - * for the child's dentry name */ - hash = dentry->d_name.hash; - is_hash = true; - dentry = dentry->d_parent; + spin_lock(&inode->i_lock); + cap = 0; + if (mode == USE_AUTH_MDS) + cap = ci->i_auth_cap; + if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) + cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); + if (!cap) { + spin_unlock(&inode->i_lock); + goto random; } + mds = cap->session->s_mds; + dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", + inode, ceph_vinop(inode), mds, + cap == ci->i_auth_cap ? "auth " : "", cap); + spin_unlock(&inode->i_lock); + return mds; - /* ok, just pick one at random */ random: mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); dout("choose_mds chose random mds%d\n", mds); diff --git a/src/kernel/super.h b/src/kernel/super.h index 388a64d6066b4..63c049dea5930 100644 --- a/src/kernel/super.h +++ b/src/kernel/super.h @@ -228,7 +228,7 @@ struct ceph_inode_frag { u32 frag; int split_by; /* i.e. 2^(split_by) children */ - /* delegation info */ + /* delegation and replication info */ int mds; /* -1 if same authority as parent */ int ndist; /* >0 if replicated */ int dist[CEPH_MAX_DIRFRAG_REP]; -- 2.39.5