kclient: caps, mon_client cleanup

author Sage Weil <sage@newdream.net>

Fri, 17 Jul 2009 19:50:12 +0000 (12:50 -0700)

committer Sage Weil <sage@newdream.net>

Fri, 17 Jul 2009 20:53:38 +0000 (13:53 -0700)
author Sage Weil <sage@newdream.net>
Fri, 17 Jul 2009 19:50:12 +0000 (12:50 -0700)
committer Sage Weil <sage@newdream.net>
Fri, 17 Jul 2009 20:53:38 +0000 (13:53 -0700)
diff --git a/src/kernel/caps.c b/src/kernel/caps.c

index c7be92cc15c677d4ee7d2d51f5d6f3e3c483a8fd..6fffbb586090dfa9e9f1430a622d199e7a46433a 100644 (file)
--- a/src/kernel/caps.c
+++ b/src/kernel/caps.c
@@ -8,6 +8,33 @@
  #include "decode.h"
  #include "messenger.h"
  
+/*
+ * Capability management
+ *
+ * The Ceph metadata servers control client access to inode metadata
+ * and file data by issuing capabilities, granting clients permission
+ * to read and/or write both inode field and file data to OSDs
+ * (storage nodes).  Each capability consists of a set of bits
+ * indicating which operations are allowed.
+ *
+ * If the client holds a *_SHARED cap, the client has a coherent value
+ * that can be safely read from the cached inode.
+ *
+ * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
+ * client is allowed to change inode attributes (e.g., file size,
+ * mtime), note its dirty state in the ceph_cap, and asynchronously
+ * flush that metadata change to the MDS.
+ *
+ * In the event of a conflicting operation (perhaps by another
+ * client), the MDS will revoke the conflicting client capabilities.
+ *
+ * In order for a client to cache an inode, it must hold a capability
+ * with at least one MDS server.  When inodes are released, release
+ * notifications are batched and periodically sent en masse to the MDS
+ * cluster to release server state.
+ */
+
+
  /*
   * Generate readable cap strings for debugging output.
   */
@@ -16,13 +43,6 @@ static char cap_str[MAX_CAP_STR][40];
  static DEFINE_SPINLOCK(cap_str_lock);
  static int last_cap_str;
  
-static spinlock_t caps_list_lock;
-static struct list_head caps_list;  /* unused (reserved or unreserved) */
-static int caps_total_count;        /* total caps allocated */
-static int caps_use_count;          /* in use */
-static int caps_reserve_count;      /* unused, reserved */
-static int caps_avail_count;        /* unused, unreserved */
-
  static char *gcap_string(char *s, int c)
  {
         if (c & CEPH_CAP_GSHARED)
@@ -90,11 +110,23 @@ const char *ceph_cap_string(int caps)
  }
  
  /*
- * cap reservations
+ * Cap reservations
+ *
+ * Maintain a global pool of preallocated struct ceph_caps, referenced
+ * by struct ceph_caps_reservations.  This ensures that we preallocate
+ * memory needed to successfully process an MDS response.  (If an MDS
+ * sends us cap information and we fail to process it, we will have
+ * problems due to the client and MDS being out of sync.)
   *
- * maintain a global pool of preallocated struct ceph_caps, referenced
- * by struct ceph_caps_reservations.
+ * Reservations are 'owned' by a ceph_cap_reservation context.
   */
+static spinlock_t caps_list_lock;
+static struct list_head caps_list;  /* unused (reserved or unreserved) */
+static int caps_total_count;        /* total caps allocated */
+static int caps_use_count;          /* in use */
+static int caps_reserve_count;      /* unused, reserved */
+static int caps_avail_count;        /* unused, unreserved */
+
  void ceph_caps_init(void)
  {
         INIT_LIST_HEAD(&caps_list);
@@ -197,7 +229,7 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
  {
         struct ceph_cap *cap = NULL;
  
-       /* temporary */
+       /* temporary, until we do something about cap import/export */
         if (!ctx)
                 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
  
@@ -278,7 +310,7 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
  }
  
  /*
- * Return id of any MDS with a cap, preferably WR|WRBUFFER|EXCL, else
+ * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
   * -1.
   */
  static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
@@ -336,7 +368,8 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
  }
  
  /*
- * (re)set cap hold timeouts.
+ * (re)set cap hold timeouts, which control the delayed release
+ * of unused caps back to the MDS.  Should be called on cap use.
   */
  static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
                                struct ceph_inode_info *ci)
@@ -350,10 +383,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
         dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
              ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
  }
+
  /*
- * (Re)queue cap at the end of the delayed cap release list.  If
- * an inode was queued but with i_hold_caps_max=0, meaning it was
- * queued for immediate flush, don't reset the timeouts.
+ * (Re)queue cap at the end of the delayed cap release list.
   *
   * If I_FLUSH is set, leave the inode at the front of the list.
   *
@@ -382,7 +414,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
  /*
   * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
   * indicating we should send a cap message to flush dirty metadata
- * asap.
+ * asap, and move to the front of the delayed cap list.
   */
  static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
                                       struct ceph_inode_info *ci)
@@ -399,7 +431,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
  /*
   * Cancel delayed work on cap.
   *
- * Caller hold i_lock.
+ * Caller must hold i_lock.
   */
  static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
                                struct ceph_inode_info *ci)
@@ -415,12 +447,11 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
  /*
   * Add a capability under the given MDS session.
   *
- * Bump i_count when adding the first cap.
- *
   * Caller should hold session snap_rwsem (read), s_mutex.
   *
   * @fmode is the open file mode, if we are opening a file,
- * otherwise it is < 0.
+ * otherwise it is < 0.  (This is so we can atomically add
+ * the cap and add an open file reference to it.)
   */
  int ceph_add_cap(struct inode *inode,
                  struct ceph_mds_session *session, u64 cap_id,
@@ -479,7 +510,7 @@ retry:
                 /* add to session cap list */
                 cap->session = session;
                 spin_lock(&session->s_cap_lock);
-               list_add(&cap->session_caps, &session->s_caps);
+               list_add_tail(&cap->session_caps, &session->s_caps);
                 session->s_nr_caps++;
                 spin_unlock(&session->s_cap_lock);
         }
@@ -552,7 +583,8 @@ retry:
  
  /*
   * Return true if cap has not timed out and belongs to the current
- * generation of the MDS session.
+ * generation of the MDS session (i.e. has not gone 'stale' due to
+ * us losing touch with the mds).
   */
  static int __cap_is_valid(struct ceph_cap *cap)
  {
@@ -600,6 +632,9 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
         return have;
  }
  
+/*
+ * Get cap bits issued by caps other than @ocap
+ */
  int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
  {
         int have = ci->i_snap_caps;
@@ -617,6 +652,10 @@ int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
         return have;
  }
  
+/*
+ * Move a cap to the end of the LRU (oldest caps at list head, newest
+ * at list tail).
+ */
  static void __touch_cap(struct ceph_cap *cap)
  {
         struct ceph_mds_session *s = cap->session;
@@ -629,8 +668,9 @@ static void __touch_cap(struct ceph_cap *cap)
  }
  
  /*
- * Return true if we hold the given mask.  And move the cap(s) to the front
- * of their respective LRUs.
+ * Return true if we hold the given mask.  And move the cap(s) to the
+ * front of their respective LRUs.  (This is the preferred way for
+ * callers to check for caps they want.)
   */
  int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
  {
@@ -670,6 +710,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
                         if (touch) {
                                 struct rb_node *q;
  
+                               /* touch this + preceeding caps */
                                 __touch_cap(cap);
                                 for (q = rb_first(&ci->i_caps); q != p;
                                      q = rb_next(q)) {
@@ -688,7 +729,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
  }
  
  /*
- * Return mask of caps currently being revoked by an MDS.
+ * Return true if mask caps are currently being revoked by an MDS.
   */
  int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
  {
@@ -713,7 +754,7 @@ int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
  }
  
  /*
- * Return caps we have registered with the MDS(s) as wanted.
+ * Return caps we have registered with the MDS(s) as 'wanted'.
   */
  int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
  {
@@ -850,6 +891,10 @@ static void send_cap_msg(struct ceph_mds_client *mdsc, u64 ino, u64 cid, int op,
         ceph_send_msg_mds(mdsc, msg, mds);
  }
  
+/*
+ * Queue cap releases when an inode is dropped from our
+ * cache.
+ */
  void ceph_queue_caps_release(struct inode *inode)
  {
         struct ceph_inode_info *ci = ceph_inode(inode);
@@ -977,6 +1022,11 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
         cap->mds_wanted = want;
  
         if (flushing) {
+               /*
+                * assign a tid for flush operations so we can avoid
+                *  flush1 -> dirty1 -> flush2 -> flushack1 -> mark clean
+                * type races.
+                */
                 flush_tid = ++cap->session->s_cap_flush_tid;
                 ci->i_cap_flush_tid = flush_tid;
                 dout(" cap_flush_tid %lld\n", flush_tid);
@@ -1141,7 +1191,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  }
  
  /*
- * Add dirty inode to the sync (currently flushing) list.
+ * Add dirty inode to the flushing list.  Assigned a seq number so we
+ * can wait for caps to flush without starving.
   */
  static void __mark_caps_flushing(struct inode *inode,
                                  struct ceph_mds_session *session)
@@ -1162,11 +1213,15 @@ static void __mark_caps_flushing(struct inode *inode,
  }
  
  /*
- * Swiss army knife function to examine currently used, wanted versus
- * held caps.  Release, flush, ack revoked caps to mds as appropriate.
+ * Swiss army knife function to examine currently used and wanted
+ * versus held caps.  Release, flush, ack revoked caps to mds as
+ * appropriate.
   *
- * @is_delayed indicates caller is delayed work and we should not
- * delay further.
+ *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
+ *    cap release further.  
+ *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
+ *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
+ *    further delay.
   */
  void ceph_check_caps(struct ceph_inode_info *ci, int flags,
                      struct ceph_mds_session *session)
@@ -1501,7 +1556,10 @@ static int caps_are_clean(struct inode *inode)
  }
  
  /*
- * Flush any dirty caps back to the mds
+ * Flush any dirty caps back to the mds.  If we aren't asked to wait,
+ * queue inode for flush but don't do so immediately, because we can
+ * get by with fewer MDS messages if we wait for e.g. data writeback
+ * to complete first.
   */
  int ceph_write_inode(struct inode *inode, int wait)
  {
@@ -1706,8 +1764,8 @@ retry:
  }
  
  /*
- * Take cap refs.  Caller must already now we hold at least on ref on
- * the caps in question or we don't know this is safe.
+ * Take cap refs.  Caller must already know we hold at least one ref
+ * on the caps in question or we don't know this is safe.
   */
  void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
  {
diff --git a/src/kernel/import_patch_set_into_linux_git.sh b/src/kernel/import_patch_set_into_linux_git.sh

index ee74282f588972c047f7b27f126de9a8cb286e02..0476f21eec0273f28689f6d5bb564df1f51544a2 100755 (executable)
--- a/src/kernel/import_patch_set_into_linux_git.sh
+++ b/src/kernel/import_patch_set_into_linux_git.sh
@@ -217,13 +217,16 @@ git add $target/ceph/caps.c
  git commit -F - <<EOF
  ceph: capability management
  
-The Ceph metadata servers control client access to data by issuing
-capabilities granting clients permission to read and/or write to OSDs
+The Ceph metadata servers control client access to inode metadata and file data by issuing
+capabilities, granting clients permission to read and/or write both inode field and file data to OSDs
  (storage nodes).  Each capability consists of a set of bits indicating
  which operations are allowed.
  
-In the case of an EXCL (exclusive) or WR capabilities, the client is
-allowed to change inode attributes (e.g., file size, mtime), noting
+If the client holds a *_SHARED cap, the client has a coherent value
+that can be safely read from the cached inode.
+
+In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the client
+is allowed to change inode attributes (e.g., file size, mtime), note
  its dirty state in the ceph_cap, and asynchronously flush that
  metadata change to the MDS.
  
diff --git a/src/kernel/mon_client.c b/src/kernel/mon_client.c

index 9abb45bea6d92b567b934b8a4faf44a2e2088410..abee6dc8fd99c68b00cdeed3a6b4cbd4e869cce7 100644 (file)
--- a/src/kernel/mon_client.c
+++ b/src/kernel/mon_client.c
@@ -8,6 +8,12 @@
  #include "super.h"
  #include "decode.h"
  
+/*
+ * Interact with Ceph monitor cluster.  Handle requests for new map
+ * versions, and periodically resend as needed.  Also implement
+ * statfs() and umount().
+ */
+
  /*
   * Decode a monmap blob (e.g., during mount).
   */
@@ -65,8 +71,8 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
  }
  
  /*
- * Choose a monitor.  If @notmon >= 0, choose a different monitor than
- * last time.
+ * Choose a monitor.  If @newmon >= 0, try to choose a different
+ * monitor than last time.
   */
  static int pick_mon(struct ceph_mon_client *monc, int newmon)
  {
@@ -80,7 +86,8 @@ static int pick_mon(struct ceph_mon_client *monc, int newmon)
  }
  
  /*
- * Generic timeout mechanism for monitor requests
+ * Generic timeout mechanism for monitor requests, so we can resend if
+ * we don't get a timely reply.
   */
  static void reschedule_timeout(struct ceph_mon_request *req)
  {
@@ -127,7 +134,7 @@ static void init_request_type(struct ceph_mon_client *monc,
  
  
  /*
- * mds map
+ * Request a new mds map.
   */
  static void request_mdsmap(struct ceph_mon_client *monc, int newmon)
  {
@@ -147,7 +154,7 @@ static void request_mdsmap(struct ceph_mon_client *monc, int newmon)
  }
  
  /*
- * Register our desire for an mdsmap >= epoch @want.
+ * Register our desire for an mdsmap epoch >= @want.
   */
  void ceph_monc_request_mdsmap(struct ceph_mon_client *monc, u32 want)
  {
@@ -407,7 +414,7 @@ static void do_statfs_check(struct work_struct *work)
                 if (time_after(jiffies, req->last_attempt + req->delay)) {
                         req->last_attempt = jiffies;
                         if (req->delay < MAX_DELAY_INTERVAL)
-                               req->delay *= 2;
+                               req->delay *= 2; /* exponential backoff */
                         send_statfs(monc, req, newmon);
                         newmon = 0;
                 }
author	Sage Weil <sage@newdream.net>
	Fri, 17 Jul 2009 19:50:12 +0000 (12:50 -0700)
committer	Sage Weil <sage@newdream.net>
	Fri, 17 Jul 2009 20:53:38 +0000 (13:53 -0700)
src/kernel/caps.c		patch \| blob \| history
src/kernel/import_patch_set_into_linux_git.sh		patch \| blob \| history
src/kernel/mon_client.c		patch \| blob \| history