]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
kclient: use caps, fragtree only to choose mds (not hierarchy)
author Sage Weil <sage@newdream.net>
Mon, 3 Aug 2009 18:39:27 +0000 (11:39 -0700)
committer Sage Weil <sage@newdream.net>
Tue, 4 Aug 2009 22:33:35 +0000 (15:33 -0700)
Since we require caps for all inodes in our cache, no need to consider
parents when identifying where to send a request.  Just look at fragtree
(for fragmented dirs) or caps.

src/TODO
src/kernel/inode.c
src/kernel/mds_client.c
src/kernel/super.h

index 3dbf931e659936700b86286ff909c8e5d47cdbc1..fa75a27f6790553d99d41d1698f21e2e222abccb 100644 (file)
--- a/src/TODO
+++ b/src/TODO
@@ -108,12 +108,9 @@ repair
 - mds scrubbing
 
 kclient
-- async writepage
 ?- ensure cap_snaps reflush after client reconnect 
-- fix up mds selection, and ESTALE handling
+- fix up ESTALE handling
 - make cap import/export efficient
-- simplify mds auth tracking?
-  - use caps instead?
 - flock, fnctl locks
 - ACLs
 - should we try to ref CAP_PIN on special inodes that are open?  
index 62e238a3bb90de8940c3fcc430b96fc5ecfa86cc..c388862d3dac3005b0ae5b0334d5a6bb43b90e53 100644 (file)
@@ -189,7 +189,7 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
 
 /*
  * Process dirfrag (delegation) info from the mds.  Include leaf
- * fragment in tree ONLY if mds >= 0 || ndist > 0.  Otherwise, only
+ * fragment in tree ONLY if ndist > 0.  Otherwise, only
  * branches/splits are included in i_fragtree)
  */
 static int ceph_fill_dirfrag(struct inode *inode,
@@ -204,7 +204,7 @@ static int ceph_fill_dirfrag(struct inode *inode,
        int err = 0;
 
        mutex_lock(&ci->i_fragtree_mutex);
-       if (mds < 0 && ndist == 0) {
+       if (ndist == 0) {
                /* no delegation info needed. */
                frag = __ceph_find_frag(ci, id);
                if (!frag)
@@ -241,8 +241,8 @@ static int ceph_fill_dirfrag(struct inode *inode,
        frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
        for (i = 0; i < frag->ndist; i++)
                frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
-       dout("fill_dirfrag %llx.%llx frag %x referral mds %d ndist=%d\n",
-            ceph_vinop(inode), frag->frag, frag->mds, frag->ndist);
+       dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
+            ceph_vinop(inode), frag->frag, frag->ndist);
 
 out:
        mutex_unlock(&ci->i_fragtree_mutex);
index 13081266107987f5d66d40949f2ebe390bf24069..59ab1769590c54a02482dd8aec1071b0c16014de 100644 (file)
@@ -478,21 +478,23 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 }
 
 /*
- * Choose mds to send request to next.  If there is a hint set in
- * the request (e.g., due to a prior forward hint from the mds), use
- * that.
+ * Choose mds to send request to next.  If there is a hint set in the
+ * request (e.g., due to a prior forward hint from the mds), use that.
+ * Otherwise, consult frag tree and/or caps to identify the
+ * appropriate mds.  If all else fails, choose randomly.
  *
  * Called under mdsc->mutex.
  */
 static int __choose_mds(struct ceph_mds_client *mdsc,
                        struct ceph_mds_request *req)
 {
+       struct inode *inode;
+       struct ceph_inode_info *ci;
+       struct ceph_cap *cap;
+       int mode = req->r_direct_mode;
        int mds = -1;
        u32 hash = req->r_direct_hash;
        bool is_hash = req->r_direct_is_hash;
-       struct dentry *dentry = req->r_dentry;
-       struct ceph_inode_info *ci;
-       int mode = req->r_direct_mode;
 
        /*
         * is there a specific mds we should try?  ignore hint if we have
@@ -509,65 +511,69 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
        if (mode == USE_RANDOM_MDS)
                goto random;
 
-       /*
-        * try to find an appropriate mds to contact based on the
-        * given dentry.  walk up the tree until we find delegation info
-        * in the i_fragtree.
-        *
-        * if is_hash is true, direct request at the appropriate directory
-        * fragment (as with a readdir on a fragmented directory).
-        */
-       while (dentry) {
-               if (is_hash && dentry->d_inode &&
-                   S_ISDIR(dentry->d_inode->i_mode)) {
-                       struct ceph_inode_frag frag;
-                       int found;
-
-                       ci = ceph_inode(dentry->d_inode);
-                       ceph_choose_frag(ci, hash, &frag, &found);
-                       if (found) {
-                               if (mode == USE_ANY_MDS && frag.ndist > 0) {
-                                       u8 r;
-
-                                       /* choose a random replica */
-                                       get_random_bytes(&r, 1);
-                                       r %= frag.ndist;
-                                       mds = frag.dist[r];
-                                       dout("choose_mds %p %llx.%llx "
-                                            "frag %u mds%d (%d/%d)\n",
-                                            dentry->d_inode,
-                                            ceph_vinop(&ci->vfs_inode),
-                                            frag.frag, frag.mds,
-                                            (int)r, frag.ndist);
-                                       return mds;
-                               }
-                               /* since the more deeply nested item wasn't
-                                * known to be replicated, then we want to
-                                * look for the authoritative mds. */
-                               mode = USE_AUTH_MDS;
-                               if (frag.mds >= 0) {
-                                       /* choose auth mds */
-                                       mds = frag.mds;
-                                       dout("choose_mds %p %llx.%llx "
-                                            "frag %u mds%d (auth)\n",
-                                            dentry->d_inode,
-                                            ceph_vinop(&ci->vfs_inode),
-                                            frag.frag, mds);
-                                       return mds;
-                               }
+       inode = 0;
+       if (req->r_inode)
+               inode = req->r_inode;
+       else if (req->r_dentry)
+               inode = req->r_dentry->d_inode;
+       dout("__choose_mds %p mode %d\n", inode, mode);
+       if (!inode)
+               goto random;
+       ci = ceph_inode(inode);
+
+       if (is_hash && S_ISDIR(inode->i_mode)) {
+               struct ceph_inode_frag frag;
+               int found;
+
+               ceph_choose_frag(ci, hash, &frag, &found);
+               if (found) {
+                       if (mode == USE_ANY_MDS && frag.ndist > 0) {
+                               u8 r;
+
+                               /* choose a random replica */
+                               get_random_bytes(&r, 1);
+                               r %= frag.ndist;
+                               mds = frag.dist[r];
+                               dout("choose_mds %p %llx.%llx "
+                                    "frag %u mds%d (%d/%d)\n",
+                                    inode, ceph_vinop(inode),
+                                    frag.frag, frag.mds,
+                                    (int)r, frag.ndist);
+                               return mds;
+                       }
+
+                       /* since this file/dir wasn't known to be
+                        * replicated, then we want to look for the
+                        * authoritative mds. */
+                       mode = USE_AUTH_MDS;
+                       if (frag.mds >= 0) {
+                               /* choose auth mds */
+                               mds = frag.mds;
+                               dout("choose_mds %p %llx.%llx "
+                                    "frag %u mds%d (auth)\n",
+                                    inode, ceph_vinop(inode), frag.frag, mds);
+                               return mds;
                        }
                }
-               if (IS_ROOT(dentry))
-                       break;
+       }
 
-               /* move up the hierarchy, but direct request based on the hash
-                * for the child's dentry name */
-               hash = dentry->d_name.hash;
-               is_hash = true;
-               dentry = dentry->d_parent;
+       spin_lock(&inode->i_lock);
+       cap = 0;
+       if (mode == USE_AUTH_MDS)
+               cap = ci->i_auth_cap;
+       if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
+               cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
+       if (!cap) {
+               spin_unlock(&inode->i_lock);
+               goto random;
        }
+       mds = cap->session->s_mds;
+       dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
+            inode, ceph_vinop(inode), mds, 
+            cap == ci->i_auth_cap ? "auth " : "", cap);
+       spin_unlock(&inode->i_lock);
+       return mds;
 
-       /* ok, just pick one at random */
 random:
        mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
        dout("choose_mds chose random mds%d\n", mds);
index 388a64d6066b4182cbba93587dc3782da146d67e..63c049dea59309e8ebdfc08a7b21ac5b76e8a33b 100644 (file)
@@ -228,7 +228,7 @@ struct ceph_inode_frag {
        u32 frag;
        int split_by;         /* i.e. 2^(split_by) children */
 
-       /* delegation info */
+       /* delegation and replication info */
        int mds;              /* -1 if same authority as parent */
        int ndist;            /* >0 if replicated */
        int dist[CEPH_MAX_DIRFRAG_REP];