client: add delegation support for cephfs

author Jeff Layton <jlayton@redhat.com>

Tue, 14 Nov 2017 12:26:56 +0000 (07:26 -0500)

committer Nathan Cutler <ncutler@suse.com>

Wed, 13 Dec 2017 12:56:23 +0000 (13:56 +0100)
author Jeff Layton <jlayton@redhat.com>
Tue, 14 Nov 2017 12:26:56 +0000 (07:26 -0500)
committer Nathan Cutler <ncutler@suse.com>
Wed, 13 Dec 2017 12:56:23 +0000 (13:56 +0100)
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt

index 4d182d8082e163defe929299317a06985f0c71e3..8897ada7b598725e35f9cada4402d19c6cb8bfc3 100644 (file)
--- a/src/client/CMakeLists.txt
+++ b/src/client/CMakeLists.txt
@@ -7,6 +7,7 @@ set(libclient_srcs
    ClientSnapRealm.cc
    MetaSession.cc
    Trace.cc
-  posix_acl.cc)
+  posix_acl.cc
+  Delegation.cc)
  add_library(client STATIC ${libclient_srcs})
  target_link_libraries(client osdc)
diff --git a/src/client/Client.cc b/src/client/Client.cc

index 1d9277a61b6ecfb1d5c01f83d2aedc4418600e3f..4fb47d489a4c765996973d0fb54494d51e19d1d8 100644 (file)
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -83,6 +83,7 @@
  #include "Client.h"
  #include "Inode.h"
  #include "Dentry.h"
+#include "Delegation.h"
  #include "Dir.h"
  #include "ClientSnapRealm.h"
  #include "Fh.h"
@@ -253,7 +254,8 @@ Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
      mounted(false), unmounting(false), blacklisted(false),
      local_osd(-1), local_osd_epoch(0),
      unsafe_sync_write(0),
-    client_lock("Client::client_lock")
+    client_lock("Client::client_lock"),
+    deleg_timeout(0)
  {
    _reset_faked_inos();
    //
@@ -5060,11 +5062,18 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
    check_cap_issue(in, cap, new_caps);
  
    // update caps
-  if (old_caps & ~new_caps) { 
-    ldout(cct, 10) << "  revocation of " << ccap_string(~new_caps & old_caps) << dendl;
+  int revoked = old_caps & ~new_caps;
+  if (revoked) {
+    ldout(cct, 10) << "  revocation of " << ccap_string(revoked) << dendl;
      cap->issued = new_caps;
      cap->implemented |= new_caps;
  
+    // recall delegations if we're losing caps necessary for them
+    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
+      in->recall_deleg(false);
+    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
+      in->recall_deleg(true);
+
      if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER)
          && !_flush(in, new C_Client_FlushComplete(this, in))) {
        // waitin' for flush
@@ -5821,16 +5830,16 @@ void Client::flush_mdlog(MetaSession *session)
  }
  
  
-void Client::unmount()
+void Client::_unmount()
  {
-  Mutex::Locker lock(client_lock);
-
    if (unmounting)
      return;
  
    ldout(cct, 2) << "unmounting" << dendl;
    unmounting = true;
  
+  deleg_timeout = 0;
+
    flush_mdlog_sync(); // flush the mdlog for pending requests, if any
    while (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
@@ -5944,6 +5953,12 @@ void Client::unmount()
    ldout(cct, 2) << "unmounted." << dendl;
  }
  
+void Client::unmount()
+{
+  Mutex::Locker lock(client_lock);  // public entry point: take client_lock ourselves...
+  _unmount();  // ...then run the variant that expects the lock to be held already
+}
+
  void Client::flush_cap_releases()
  {
    // send any cap releases
@@ -8361,6 +8376,8 @@ int Client::_release_fh(Fh *f)
    Inode *in = f->inode.get();
    ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;
  
+  in->unset_deleg(f);
+
    if (in->snapid == CEPH_NOSNAP) {
      if (in->put_open_ref(f->mode)) {
        _flush(in, new C_Client_FlushComplete(this, in));
@@ -8412,11 +8429,11 @@ int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
  
    in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.
  
-  if ((flags & O_TRUNC) == 0 &&
-      in->caps_issued_mask(want)) {
+  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
      // update wanted?
      check_caps(in, CHECK_CAPS_NODELAY);
    } else {
+
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
      filepath path;
      in->make_nosnap_relative_path(path);
@@ -8431,6 +8448,34 @@ int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
      req->head.args.open.old_size = in->size;   // for O_TRUNC
      req->set_inode(in);
      result = make_request(req, perms);
+
+    /*
+     * NFS expects that delegations will be broken on a conflicting open,
+     * not just when there is actual conflicting access to the file. SMB leases
+     * and oplocks also have similar semantics.
+     *
+     * Ensure that clients that have delegations enabled will wait on minimal
+     * caps during open, just to ensure that other clients holding delegations
+     * return theirs first.
+     */
+    if (deleg_timeout && result == 0) {
+      int need = 0, have;
+
+      if (cmode & CEPH_FILE_MODE_WR)
+        need |= CEPH_CAP_FILE_WR;
+      if (cmode & CEPH_FILE_MODE_RD)
+        need |= CEPH_CAP_FILE_RD;
+
+      result = get_caps(in, need, want, &have, -1);
+      if (result < 0) {
+       ldout(cct, 1) << "Unable to get caps after open of inode " << *in <<
+                         " . Denying open: " <<
+                         cpp_strerror(result) << dendl;
+       in->put_open_ref(cmode);
+      } else {
+       put_cap_ref(in, need);
+      }
+    }
    }
  
    // success?
@@ -11911,8 +11956,9 @@ int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
    req->set_filepath(path);
  
    InodeRef otherin;
-
+  Inode *in;
    Dentry *de;
+
    int res = get_or_create(dir, name, &de);
    if (res < 0)
      goto fail;
@@ -11923,7 +11969,10 @@ int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
    res = _lookup(dir, name, 0, &otherin, perm);
    if (res < 0)
      goto fail;
-  req->set_other_inode(otherin.get());
+
+  in = otherin.get();
+  req->set_other_inode(in);
+  in->break_all_delegs();
    req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
  
    req->set_inode(dir);
@@ -12093,15 +12142,26 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch
      res = _lookup(fromdir, fromname, 0, &oldin, perm);
      if (res < 0)
        goto fail;
-    req->set_old_inode(oldin.get());
+
+    Inode *oldinode = oldin.get();
+    oldinode->break_all_delegs();
+    req->set_old_inode(oldinode);
      req->old_inode_drop = CEPH_CAP_LINK_SHARED;
  
      res = _lookup(todir, toname, 0, &otherin, perm);
-    if (res != 0 && res != -ENOENT) {
-      goto fail;
-    } else if (res == 0) {
-      req->set_other_inode(otherin.get());
+    switch (res) {
+    case 0:
+      {
+       Inode *in = otherin.get();
+       req->set_other_inode(in);
+       in->break_all_delegs();
+      }
        req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+      break;
+    case -ENOENT:
+      break;
+    default:
+      goto fail;
      }
  
      req->set_inode(todir);
@@ -12172,6 +12232,7 @@ int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& pe
      return -EDQUOT;
    }
  
+  in->break_all_delegs();
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
  
    filepath path(newname, dir->ino);
@@ -13014,6 +13075,48 @@ int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
    return _flock(fh, cmd, owner);
  }
  
+int Client::set_deleg_timeout(uint32_t timeout)
+{
+  Mutex::Locker lock(client_lock);
+  // Record how long a recalled delegation may stay unreturned before we give up.
+  /*
+   * The whole point is to prevent blacklisting, so the delegation must time
+   * out before the session autoclose timeout kicks in; reject anything else.
+   */
+  if (timeout >= mdsmap->get_session_autoclose())
+    return -EINVAL;
+
+  deleg_timeout = timeout;  // Inode::set_deleg() refuses delegations while this is 0
+  return 0;
+}
+
+int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
+{
+  int ret = -EINVAL;
+  // Set or drop a delegation on fh; the whole call runs under client_lock.
+  Mutex::Locker lock(client_lock);
+
+  if (!mounted)
+    return -ENOTCONN;
+
+  Inode *inode = fh->inode.get();
+
+  switch(cmd) {
+  case CEPH_DELEGATION_NONE:  // returning a delegation: cb and priv are not used
+    inode->unset_deleg(fh);
+    ret = 0;
+    break;
+  default:  // any other cmd is passed on as a delegation type; set_deleg() vets it
+    try {
+      ret = inode->set_deleg(fh, cmd, cb, priv);
+    } catch (const std::bad_alloc&) {  // catch by reference; no Delegation could be allocated
+      ret = -ENOMEM;
+    }
+    break;
+  }
+  return ret;
+}
+
  class C_Client_RequestInterrupt : public Context  {
  private:
    Client *client;
@@ -13736,6 +13839,8 @@ const char** Client::get_tracked_conf_keys() const
      "client_cache_size",
      "client_cache_mid",
      "client_acl_type",
+    "client_deleg_timeout",
+    "client_deleg_break_on_open",
      NULL
    };
    return keys;
@@ -13839,4 +13944,3 @@ void StandaloneClient::shutdown()
    objecter->shutdown();
    monclient->shutdown();
  }
-
diff --git a/src/client/Client.h b/src/client/Client.h

index 16aef0312c15e8cb00632e01bc1b05be1a684d6a..568ecdf1aa684d662417309181f66ae7fd981dc8 100644 (file)
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -139,6 +139,9 @@ typedef int (*client_getgroups_callback_t)(void *handle, gid_t **sgids);
  typedef void(*client_switch_interrupt_callback_t)(void *handle, void *data);
  typedef mode_t (*client_umask_callback_t)(void *handle);
  
+/* Callback for delegation recalls */
+typedef void (*ceph_deleg_cb_t)(Fh *fh, void *priv);
+
  struct client_callback_args {
    void *handle;
    client_ino_callback_t ino_cb;
@@ -404,6 +407,8 @@ protected:
  public:
    entity_name_t get_myname() { return messenger->get_myname(); } 
    void _sync_write_commit(Inode *in);
+  void wait_on_list(list<Cond*>& ls);
+  void signal_cond_list(list<Cond*>& ls);
  
  protected:
    std::unique_ptr<Filer>             filer;
@@ -482,8 +487,6 @@ protected:
  
    // helpers
    void wake_inode_waiters(MetaSession *s);
-  void wait_on_list(list<Cond*>& ls);
-  void signal_cond_list(list<Cond*>& ls);
  
    void wait_on_context_list(list<Context*>& ls);
    void signal_context_list(list<Context*>& ls);
@@ -494,12 +497,16 @@ protected:
    void put_inode(Inode *in, int n=1);
    void close_dir(Dir *dir);
  
+  // same as unmount() but for when the client_lock is already held
+  void _unmount();
+
    friend class C_Client_FlushComplete; // calls put_inode()
    friend class C_Client_CacheInvalidate;  // calls ino_invalidate_cb
    friend class C_Client_DentryInvalidate;  // calls dentry_invalidate_cb
    friend class C_Block_Sync; // Calls block map and protected helpers
    friend class C_Client_RequestInterrupt;
    friend class C_Client_Remount;
+  friend class C_Deleg_Timeout; // Asserts on client_lock, called when a delegation is unreturned
    friend void intrusive_ptr_release(Inode *in);
  
    //int get_cache_size() { return lru.lru_get_size(); }
@@ -731,6 +738,7 @@ protected:
    // fs ops.
  private:
  
+  uint32_t deleg_timeout;
    void fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off);
  
    // some readdir helpers
@@ -1237,6 +1245,9 @@ public:
    const char** get_tracked_conf_keys() const override;
    void handle_conf_change(const struct md_config_t *conf,
                                   const std::set <std::string> &changed) override;
+  uint32_t get_deleg_timeout() { return deleg_timeout; }
+  int set_deleg_timeout(uint32_t timeout);
+  int ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv);
  };
  
  /**
diff --git a/src/client/Delegation.cc b/src/client/Delegation.cc

new file mode 100644 (file)

index 0000000..6c972fd
--- /dev/null
+++ b/src/client/Delegation.cc
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "common/Clock.h"
+#include "common/Timer.h"
+
+#include "Client.h"
+#include "Inode.h"
+#include "Fh.h"
+#include "Delegation.h"
+
+class C_Deleg_Timeout : public Context {  // timer event created by Delegation::arm_timeout()
+  Delegation *deleg;  // the recalled delegation this timeout is watching
+public:
+  explicit C_Deleg_Timeout(Delegation *d) : deleg(d) {}
+  void finish(int r) override {
+    Inode *in = deleg->get_fh()->inode.get();
+    Client *client = in->client;
+
+    // Called back via Timer, which takes client_lock for us
+    assert(client->client_lock.is_locked_by_me());
+    // Firing means the delegation was not returned in time: log it, then tear the mount down.
+    lsubdout(client->cct, client, 0) << __func__ <<
+         ": delegation return timeout for inode 0x" <<
+         std::hex << in->ino << ". Forcibly unmounting client. "<<
+         client << std::dec << dendl;
+    client->_unmount();  // client_lock is already held, so use the unlocked variant
+  }
+};
+
+/**
+ * ceph_deleg_caps_for_type - what caps are necessary for a delegation?
+ * @type: delegation request type
+ *
+ * Determine what caps are necessary in order to grant a delegation of a given
+ * type. For read delegations, we need whatever we require in order to do
+ * cached reads, plus AsLs to cover metadata changes that should trigger a
+ * recall. We also grab Xs since changing xattrs usually alters the mtime and
+ * so would trigger a recall.
+ *
+ * For write delegations, we need whatever read delegations need plus the
+ * caps to allow writing to the file (Fbwx).
+ */
+int ceph_deleg_caps_for_type(unsigned type)
+{
+       int caps = CEPH_CAP_PIN;
+
+       switch (type) {
+       case CEPH_DELEGATION_WR:
+               caps |= CEPH_CAP_FILE_EXCL |
+                       CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER;
+               /* Fallthrough */
+       case CEPH_DELEGATION_RD:
+               caps |= CEPH_CAP_FILE_SHARED |
+                       CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
+                       CEPH_CAP_XATTR_SHARED |
+                       CEPH_CAP_LINK_SHARED | CEPH_CAP_AUTH_SHARED;
+               break;
+       default:
+               // Should never happen
+               assert(false);
+       }
+       return caps;
+}
+
+/*
+ * A delegation holds cap references on behalf of the application until it is
+ * returned: the constructor takes them here, the destructor drops them again.
+ */
+Delegation::Delegation(Fh *_fh, unsigned _type, ceph_deleg_cb_t _cb, void *_priv)
+       : fh(_fh), priv(_priv), type(_type), recall_cb(_cb),
+         recall_time(utime_t()), timeout_event(nullptr)  // zero recall_time == not recalled yet
+{
+  Inode *inode = _fh->inode.get();
+  inode->client->get_cap_ref(inode, ceph_deleg_caps_for_type(_type));  // pin the caps this type needs
+}
+
+Delegation::~Delegation()
+{
+  disarm_timeout();
+  Inode *inode = fh->inode.get();
+  inode->client->put_cap_ref(inode, ceph_deleg_caps_for_type(type));
+}
+
+void Delegation::reinit(unsigned _type, ceph_deleg_cb_t _recall_cb, void *_priv)
+{
+  /* update cap refs -- note that we do a get first to avoid any going to 0 */
+  if (type != _type) {
+    Inode *inode = fh->inode.get();
+
+    inode->client->get_cap_ref(inode, ceph_deleg_caps_for_type(_type));
+    inode->client->put_cap_ref(inode, ceph_deleg_caps_for_type(type));
+    type = _type;
+  }
+
+  recall_cb = _recall_cb;
+  priv = _priv;
+}
+
+void Delegation::arm_timeout()
+{
+  Client *client = fh->inode.get()->client;
+  // Schedule the return-timeout event, unless one has already been armed.
+  if (timeout_event)
+    return;
+
+  timeout_event = new C_Deleg_Timeout(this);  // NOTE(review): presumably freed by the timer once it fires or is cancelled -- confirm
+  client->timer.add_event_after(client->get_deleg_timeout(), timeout_event);
+}
+
+void Delegation::disarm_timeout()
+{
+  Client *client = fh->inode.get()->client;
+  // Nothing to do if arm_timeout() never ran for this delegation.
+  if (!timeout_event)
+    return;
+
+  client->timer.cancel_event(timeout_event);  // called from the destructor, i.e. when the delegation goes away
+  timeout_event = nullptr;
+}
+
+void Delegation::recall(bool skip_read)
+{
+  /* If skip_read is true, don't break read delegations */
+  if (skip_read && type == CEPH_DELEGATION_RD)
+    return;
+  // Only the first recall notifies the application and starts the timeout.
+  if (!is_recalled()) {
+    recall_cb(fh, priv);  // application callback, invoked with the priv token it registered
+    recall_time = ceph_clock_now();  // stamps the recall; is_recalled() tests this for non-zero
+    arm_timeout();  // unreturned after the client's deleg timeout => C_Deleg_Timeout unmounts
+  }
+}
diff --git a/src/client/Delegation.h b/src/client/Delegation.h

new file mode 100644 (file)

index 0000000..e260f6c
--- /dev/null
+++ b/src/client/Delegation.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef _CEPH_CLIENT_DELEGATION_H
+#define _CEPH_CLIENT_DELEGATION_H
+
+#include "common/Clock.h"
+#include "common/Timer.h"
+
+class Fh;
+
+/* Commands for manipulating delegation state */
+#ifndef CEPH_DELEGATION_NONE
+# define CEPH_DELEGATION_NONE  0
+# define CEPH_DELEGATION_RD    1
+# define CEPH_DELEGATION_WR    2
+#endif
+
+/* Callback for delegation recalls */
+typedef void (*ceph_deleg_cb_t)(Fh *fh, void *priv);
+
+/* Converts CEPH_DELEGATION_* to cap mask */
+int ceph_deleg_caps_for_type(unsigned type);
+
+/*
+ * A delegation is a container for holding caps on behalf of a client that
+ * wants to be able to rely on them until recalled.
+ */
+class Delegation {
+public:
+  Delegation(Fh *_fh, unsigned _type, ceph_deleg_cb_t _cb, void *_priv);
+  ~Delegation();
+  Fh *get_fh() { return fh; }
+  unsigned get_type() { return type; }
+  bool is_recalled() { return !recall_time.is_zero(); }
+
+  void reinit(unsigned _type, ceph_deleg_cb_t _recall_cb, void *_priv);
+  void recall(bool skip_read);
+private:
+  // Filehandle against which it was acquired
+  Fh                           *fh;
+
+  // opaque token that will be passed to the callback
+  void                         *priv;
+
+  // CEPH_DELEGATION_* type
+  unsigned                     type;
+
+  // callback into application to recall delegation
+  ceph_deleg_cb_t              recall_cb;
+
+  // time of first recall
+  utime_t                      recall_time;
+
+  // timer for unreturned delegations
+  Context                      *timeout_event;
+
+  void arm_timeout();
+  void disarm_timeout();
+};
+
+#endif /* _CEPH_CLIENT_DELEGATION_H */
diff --git a/src/client/Inode.cc b/src/client/Inode.cc

index a3d42254765ce46334960d176ab5a85060c56f85..d9fd0336313ebc59292bde6fcc8be2743dccc850 100644 (file)
--- a/src/client/Inode.cc
+++ b/src/client/Inode.cc
@@ -8,6 +8,7 @@
  #include "Fh.h"
  #include "MetaSession.h"
  #include "ClientSnapRealm.h"
+#include "Delegation.h"
  
  #include "mds/flock.h"
  
@@ -27,6 +28,12 @@ Inode::~Inode()
      assert(oset.objects.empty());
    }
  
+  if (!delegations.empty()) {
+    lsubdout(client->cct, client, 0) << __func__ << ": leftover delegations on inode 0x"
+      << std::hex << ino << std::dec << dendl;
+    assert(delegations.empty());
+  }
+
    delete fcntl_locks;
    delete flock_locks;
  }
@@ -116,6 +123,7 @@ void Inode::make_nosnap_relative_path(filepath& p)
  void Inode::get_open_ref(int mode)
  {
    open_by_mode[mode]++;
+  break_deleg(!(mode & CEPH_FILE_MODE_WR));
  }
  
  bool Inode::put_open_ref(int mode)
@@ -555,3 +563,160 @@ void Inode::set_async_err(int r)
    }
  }
  
+bool Inode::has_recalled_deleg()
+{
+  if (delegations.empty())
+    return false;
+
+  // Either all delegations are recalled or none are. Just check the first.
+  Delegation& deleg = delegations.front();
+  return deleg.is_recalled();
+}
+
+void Inode::recall_deleg(bool skip_read)
+{
+  if (delegations.empty())
+    return;
+
+  // Issue any recalls
+  for (list<Delegation>::iterator d = delegations.begin();
+       d != delegations.end(); ++d) {
+
+    Delegation& deleg = *d;
+    deleg.recall(skip_read);
+  }
+}
+
+bool Inode::delegations_broken(bool skip_read)
+{
+  if (delegations.empty()) {  // nothing outstanding: trivially broken
+    lsubdout(client->cct, client, 10) <<
+         __func__ << ": delegations empty on " << *this << dendl;
+    return true;
+  }
+  // With skip_read, read delegations may remain; only the first entry is inspected.
+  if (skip_read) {
+    Delegation& deleg = delegations.front();
+    lsubdout(client->cct, client, 10) <<
+       __func__ << ": read delegs only on " << *this << dendl;
+    if (deleg.get_type() == CEPH_DELEGATION_RD) {  // a delegation type, so compare with CEPH_DELEGATION_*
+       return true;
+    }
+  }
+  lsubdout(client->cct, client, 10) <<
+       __func__ << ": not broken" << *this << dendl;
+  return false;  // break_deleg() keeps waiting while this is false
+}
+
+void Inode::break_deleg(bool skip_read)
+{
+  lsubdout(client->cct, client, 10) <<
+         __func__ << ": breaking delegs on " << *this << dendl;
+  // First ask the holders to return their delegations (read ones are spared if skip_read)...
+  recall_deleg(skip_read);
+  // ...then wait on waitfor_deleg, which unset_deleg() signals, until they are gone.
+  while (!delegations_broken(skip_read))
+    client->wait_on_list(waitfor_deleg);
+}
+
+/**
+ * set_deleg: request a delegation on an open Fh
+ * @fh: filehandle on which to acquire it
+ * @type: delegation request type
+ * @cb: delegation recall callback function
+ * @priv: private pointer to be passed to callback
+ *
+ * Attempt to acquire a delegation on an open file handle. If there are no
+ * conflicts and we have the right caps, allocate a new delegation, fill it
+ * out and return 0. Return an error if we can't get one for any reason.
+ */
+int Inode::set_deleg(Fh *fh, unsigned type, ceph_deleg_cb_t cb, void *priv)
+{
+  lsubdout(client->cct, client, 10) <<
+         __func__ << ": inode " << *this << dendl;
+
+  /*
+   * 0 deleg timeout means that they haven't been explicitly enabled. Don't
+   * allow it, with an unusual error to make it clear.
+   */
+  if (!client->get_deleg_timeout())
+    return -ETIME;
+
+  // Just say no if we have any recalled delegs still outstanding
+  if (has_recalled_deleg()) {
+    lsubdout(client->cct, client, 10) << __func__ <<
+         ": has_recalled_deleg" << dendl;
+    return -EAGAIN;
+  }
+
+  // check vs. currently open files on this inode
+  switch (type) {
+  case CEPH_DELEGATION_RD:
+    if (open_count_for_write()) {
+      lsubdout(client->cct, client, 10) << __func__ <<
+           ": open for write" << dendl;
+      return -EAGAIN;
+    }
+    break;
+  case CEPH_DELEGATION_WR:
+    if (open_count() > 1) {
+      lsubdout(client->cct, client, 10) << __func__ << ": open" << dendl;
+      return -EAGAIN;
+    }
+    break;
+  default:
+    return -EINVAL;
+  }
+
+  /*
+   * A delegation is essentially a long-held container for cap references that
+   * we delegate to the client until recalled. The caps required depend on the
+   * type of delegation (read vs. rw). This is entirely an opportunistic thing.
+   * If we don't have the necessary caps for the delegation, then we just don't
+   * grant one.
+   *
+   * In principle we could request the caps from the MDS, but a delegation is
+   * usually requested just after an open. If we don't have the necessary caps
+   * already, then it's likely that there is some sort of conflicting access.
+   *
+   * In the future, we may need to add a way to have this request caps more
+   * aggressively -- for instance, to handle WANT_DELEGATION for NFSv4.1+.
+   */
+  int need = ceph_deleg_caps_for_type(type);
+  if (!caps_issued_mask(need)) {
+    lsubdout(client->cct, client, 10) << __func__ << ": cap mismatch, have="
+      << ccap_string(caps_issued()) << " need=" << ccap_string(need) << dendl;
+    return -EAGAIN;
+  }
+
+  for (list<Delegation>::iterator d = delegations.begin();
+       d != delegations.end(); ++d) {
+    Delegation& deleg = *d;
+    if (deleg.get_fh() == fh) {
+      deleg.reinit(type, cb, priv);
+      return 0;
+    }
+  }
+
+  delegations.emplace_back(fh, type, cb, priv);
+  return 0;
+}
+
+/**
+ * unset_deleg - remove a delegation that was previously set
+ * @fh: file handle to clear delegation of
+ *
+ * Unlink delegation from the Inode (if there is one), put caps and free it.
+ */
+void Inode::unset_deleg(Fh *fh)
+{
+  for (list<Delegation>::iterator d = delegations.begin();
+       d != delegations.end(); ++d) {
+    Delegation& deleg = *d;
+    if (deleg.get_fh() == fh) {
+      delegations.erase(d);
+      client->signal_cond_list(waitfor_deleg);
+      break;
+    }
+  }
+}
diff --git a/src/client/Inode.h b/src/client/Inode.h

index e8ce367aa83504de23142c8fd0ab02b9640b596b..71586cad78aebde19c953a04d7d43e6ae5339d61 100644 (file)
--- a/src/client/Inode.h
+++ b/src/client/Inode.h
@@ -4,6 +4,8 @@
  #ifndef CEPH_CLIENT_INODE_H
  #define CEPH_CLIENT_INODE_H
  
+#include <numeric>
+
  #include "include/types.h"
  #include "include/xlist.h"
  
@@ -14,6 +16,7 @@
  
  #include "InodeRef.h"
  #include "UserPerm.h"
+#include "Delegation.h"
  
  class Client;
  struct MetaSession;
@@ -198,6 +201,7 @@ struct Inode {
  
    list<Cond*>       waitfor_caps;
    list<Cond*>       waitfor_commit;
+  list<Cond*>      waitfor_deleg;
  
    Dentry *get_first_parent() {
      assert(!dn_set.empty());
@@ -226,6 +230,8 @@ struct Inode {
    ceph_lock_state_t *fcntl_locks;
    ceph_lock_state_t *flock_locks;
  
+  list<Delegation> delegations;
+
    xlist<MetaRequest*> unsafe_ops;
  
    std::set<Fh*> fhs;
@@ -292,6 +298,33 @@ struct Inode {
    void rm_fh(Fh *f) {fhs.erase(f);}
    void set_async_err(int r);
    void dump(Formatter *f) const;
+
+  void break_all_delegs() { break_deleg(false); };
+
+  void recall_deleg(bool skip_read);
+  bool has_recalled_deleg();
+  int set_deleg(Fh *fh, unsigned type, ceph_deleg_cb_t cb, void *priv);
+  void unset_deleg(Fh *fh);
+
+private:
+  // how many opens for write on this Inode?
+  long open_count_for_write()
+  {
+    return (long)(open_by_mode[CEPH_FILE_MODE_RDWR] +
+                 open_by_mode[CEPH_FILE_MODE_WR]);
+  };
+
+  // how many opens of any sort on this inode?
+  long open_count()
+  {
+    return (long) std::accumulate(open_by_mode.begin(), open_by_mode.end(), 0,
+                                 [] (int value, const std::map<int, int>::value_type& p)
+                   { return value + p.second; });
+  };
+
+  void break_deleg(bool skip_read);
+  bool delegations_broken(bool skip_read);
+
  };
  
  ostream& operator<<(ostream &out, const Inode &in);
diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h

index 45cef34eb47ced86c5026275777bc07b3020b4ff..0a48c80c9c569074a98b5c3d2d14576cb0102aa3 100644 (file)
--- a/src/include/cephfs/libcephfs.h
+++ b/src/include/cephfs/libcephfs.h
@@ -1572,6 +1572,85 @@ int ceph_ll_getlk(struct ceph_mount_info *cmount,
  int ceph_ll_setlk(struct ceph_mount_info *cmount,
                   Fh *fh, struct flock *fl, uint64_t owner, int sleep);
  
+/*
+ * Delegation support
+ *
+ * Delegations are a way for an application to request exclusive or
+ * semi-exclusive access to an Inode. The client requests the delegation and
+ * if it's successful it can reliably cache file data and metadata until the
+ * delegation is recalled.
+ *
+ * Recalls are issued via a callback function, provided by the application.
+ * Callback functions should act something like signal handlers.  You want to
+ * do as little as possible in the callback. Any major work should be deferred
+ * in some fashion as it's difficult to predict the context in which this
+ * function will be called.
+ *
+ * Once the delegation has been recalled, the application should return it as
+ * soon as possible. The application has client_deleg_timeout seconds to
+ * return it, after which the cmount structure is forcibly unmounted and
+ * further calls into it fail.
+ *
+ * The application can set the client_deleg_timeout config option to suit its
+ * needs, but it should take care to choose a value that allows it to avoid
+ * forcible eviction from the cluster in the event of an application bug.
+ */
+typedef void (*ceph_deleg_cb_t)(struct Fh *fh, void *priv);
+
+/* Commands for manipulating delegation state */
+#ifndef CEPH_DELEGATION_NONE
+# define CEPH_DELEGATION_NONE  0
+# define CEPH_DELEGATION_RD    1
+# define CEPH_DELEGATION_WR    2
+#endif
+
+/**
+ * Get the amount of time that the client has to return caps
+ * @param cmount the ceph mount handle to use.
+ *
+ * In the event that a client does not return its caps, the MDS may blacklist
+ * it after this timeout. Applications should check this value and ensure
+ * that they set the delegation timeout to a value lower than this.
+ *
+ * This call returns the cap return timeout (in seconds) for this cmount, or
+ * zero if it's not mounted.
+ */
+uint32_t ceph_get_cap_return_timeout(struct ceph_mount_info *cmount);
+
+/**
+ * Set the delegation timeout for the mount (thereby enabling delegations)
+ * @param cmount the ceph mount handle to use.
+ * @param timeout the delegation timeout (in seconds)
+ *
+ * Since the client could end up blacklisted if it doesn't return delegations
+ * in time, we mandate that any application wanting to use delegations
+ * explicitly set the timeout beforehand. Until this call is done on the
+ * mount, attempts to set a delegation will return -ETIME.
+ *
+ * Once a delegation is recalled, if it is not returned in this amount of
+ * time, the cmount will be forcibly unmounted and further access attempts
+ * will fail (usually with -ENOTCONN errors).
+ *
+ * This value is further vetted against the cap return timeout, and this call
+ * can fail with -EINVAL if the timeout value is too long. Delegations can be
+ * disabled again by setting the timeout to 0.
+ */
+int ceph_set_deleg_timeout(struct ceph_mount_info *cmount, uint32_t timeout);
+
+/**
+ * Request a delegation on an open Fh
+ * @param cmount the ceph mount handle to use.
+ * @param fh file handle
+ * @param cmd CEPH_DELEGATION_* command
+ * @param cb callback function for recalling delegation
+ * @param priv opaque token passed back during recalls
+ *
+ * Returns 0 if the delegation was granted, -EAGAIN if there was a conflict
+ * and other error codes if there is a fatal error of some sort (e.g. -ENOMEM,
+ * -ETIME)
+ */
+int ceph_ll_delegation(struct ceph_mount_info *cmount, Fh *fh,
+                      unsigned int cmd, ceph_deleg_cb_t cb, void *priv);
  #ifdef __cplusplus
  }
  #endif
diff --git a/src/libcephfs.cc b/src/libcephfs.cc

index 74652da147280c27ee48002268d178ca2b36dd11..cb7a0131782f90d6c74980c6bcac613b9584a6ed 100644 (file)
--- a/src/libcephfs.cc
+++ b/src/libcephfs.cc
@@ -30,6 +30,7 @@
  #include "messages/MMonMap.h"
  #include "msg/Messenger.h"
  #include "include/assert.h"
+#include "mds/MDSMap.h"
  
  #include "include/cephfs/libcephfs.h"
  
@@ -1723,6 +1724,12 @@ extern "C" int ceph_ll_setlk(struct ceph_mount_info *cmount,
    return (cmount->get_client()->ll_setlk(fh, fl, owner, sleep));
  }
  
+extern "C" int ceph_ll_delegation(struct ceph_mount_info *cmount, Fh *fh,
+                                 unsigned cmd, ceph_deleg_cb_t cb, void *priv)
+{
+  return (cmount->get_client()->ll_delegation(fh, cmd, cb, priv));
+}
+
  extern "C" uint32_t ceph_ll_stripe_unit(class ceph_mount_info *cmount,
                                         Inode *in)
  {
@@ -1778,3 +1785,17 @@ extern "C" void ceph_buffer_free(char *buf)
      free(buf);
    }
  }
+
+extern "C" uint32_t ceph_get_cap_return_timeout(class ceph_mount_info *cmount)
+{
+  if (!cmount->is_mounted())
+    return 0;
+  return cmount->get_client()->mdsmap->get_session_autoclose().sec();
+}
+
+extern "C" int ceph_set_deleg_timeout(class ceph_mount_info *cmount, uint32_t timeout)
+{
+  if (!cmount->is_mounted())
+    return -ENOTCONN;
+  return cmount->get_client()->set_deleg_timeout(timeout);
+}
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h

index 454f422dde2288ee1ae4c70e1912f0ffab49bc03..f2585a997f6a514d4e0c58014b9388a4d56f05b7 100644 (file)
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -249,6 +249,11 @@ public:
    utime_t get_session_timeout() const {
      return utime_t(session_timeout,0);
    }
+
+  utime_t get_session_autoclose() const {
+    return utime_t(session_autoclose, 0);
+  }
+
    uint64_t get_max_filesize() const { return max_file_size; }
    void set_max_filesize(uint64_t m) { max_file_size = m; }
author	Jeff Layton <jlayton@redhat.com>
	Tue, 14 Nov 2017 12:26:56 +0000 (07:26 -0500)
committer	Nathan Cutler <ncutler@suse.com>
	Wed, 13 Dec 2017 12:56:23 +0000 (13:56 +0100)
src/client/CMakeLists.txt		patch \| blob \| history
src/client/Client.cc		patch \| blob \| history
src/client/Client.h		patch \| blob \| history
src/client/Delegation.cc	[new file with mode: 0644]	patch \| blob
src/client/Delegation.h	[new file with mode: 0644]	patch \| blob
src/client/Inode.cc		patch \| blob \| history
src/client/Inode.h		patch \| blob \| history
src/include/cephfs/libcephfs.h		patch \| blob \| history
src/libcephfs.cc		patch \| blob \| history
src/mds/MDSMap.h		patch \| blob \| history