From deb2c1ea985fcb906e47b93fd3d0117794e2d0a1 Mon Sep 17 00:00:00 2001 From: Matt Benjamin Date: Thu, 19 Jan 2017 18:14:30 -0500 Subject: [PATCH] rgw_file: add timed namespace invalidation With this change, librgw/rgw_file consumers can provide an invalidation callback, which is used by the library to invalidate directories whose contents should be forgotten. The existing RGWLib GC mechanism is being used to drive this. New configuration params have been added. The main configurable is rgw_nfs_namespace_expire_secs, the expire timeout. Updated after Yehuda's review. Fixes: http://tracker.ceph.com/issues/18651 Signed-off-by: Matt Benjamin --- src/common/config_opts.h | 3 ++ src/include/rados/rgw_file.h | 12 ++++- src/rgw/librgw.cc | 13 +++++- src/rgw/rgw_file.cc | 86 ++++++++++++++++++++++++++---------- src/rgw/rgw_file.h | 50 ++++++++++++--------- src/rgw/rgw_lib_frontend.h | 8 +++- 6 files changed, 123 insertions(+), 49 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index b37459a69e70a..abf1557ed2bdd 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1479,6 +1479,9 @@ OPTION(rgw_nfs_lru_lanes, OPT_INT, 5) OPTION(rgw_nfs_lru_lane_hiwat, OPT_INT, 911) OPTION(rgw_nfs_fhcache_partitions, OPT_INT, 3) OPTION(rgw_nfs_fhcache_size, OPT_INT, 2017) /* 3*2017=6051 */ +OPTION(rgw_nfs_namespace_expire_secs, OPT_INT, 300) /* namespace invalidate + * timer */ +OPTION(rgw_nfs_max_gc, OPT_INT, 300) /* max gc events per cycle */ OPTION(rgw_nfs_write_completion_interval_s, OPT_INT, 10) /* stateless (V3) * commit * delay */ diff --git a/src/include/rados/rgw_file.h b/src/include/rados/rgw_file.h index 47a97633b3b0e..dfa1ede18c93f 100644 --- a/src/include/rados/rgw_file.h +++ b/src/include/rados/rgw_file.h @@ -27,7 +27,7 @@ extern "C" { #define LIBRGW_FILE_VER_MAJOR 1 #define LIBRGW_FILE_VER_MINOR 1 -#define LIBRGW_FILE_VER_EXTRA 0 +#define LIBRGW_FILE_VER_EXTRA 1 #define LIBRGW_FILE_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + 
extra) #define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA) @@ -119,6 +119,16 @@ int rgw_mount(librgw_t rgw, const char *uid, const char *key, const char *secret, struct rgw_fs **rgw_fs, uint32_t flags); +/* + register invalidate callbacks +*/ +#define RGW_REG_INVALIDATE_FLAG_NONE 0x0000 + +typedef void (*rgw_fh_callback_t)(void *handle, struct rgw_fh_hk fh_hk); + +int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb, + void *arg, uint32_t flags); + /* detach rgw namespace */ diff --git a/src/rgw/librgw.cc b/src/rgw/librgw.cc index aed20dd21b35e..e296350abb072 100644 --- a/src/rgw/librgw.cc +++ b/src/rgw/librgw.cc @@ -13,6 +13,7 @@ */ #include #include +#include #include "include/types.h" #include "include/rados/librgw.h" @@ -80,6 +81,8 @@ namespace rgw { m_tp.drain(&req_wq); } +#define MIN_EXPIRE_S 120 + void RGWLibProcess::run() { /* write completion interval */ @@ -92,6 +95,14 @@ namespace rgw { /* gc loop */ while (! shutdown) { lsubdout(cct, rgw, 5) << "RGWLibProcess GC" << dendl; + + /* dirent invalidate timeout--basically, the upper-bound on + * inconsistency with the S3 namespace */ + auto expire_s = cct->_conf->rgw_nfs_namespace_expire_secs; + + /* delay between gc cycles */ + auto delay_s = std::max(1, std::min(MIN_EXPIRE_S, expire_s/2)); + unique_lock uniq(mtx); restart: int cur_gen = gen; @@ -106,7 +117,7 @@ namespace rgw { goto restart; /* invalidated */ } uniq.unlock(); - std::this_thread::sleep_for(std::chrono::seconds(120)); + std::this_thread::sleep_for(std::chrono::seconds(delay_s)); } } diff --git a/src/rgw/rgw_file.cc b/src/rgw/rgw_file.cc index e2a39dff59cbf..63e3b094f9d43 100644 --- a/src/rgw/rgw_file.cc +++ b/src/rgw/rgw_file.cc @@ -81,22 +81,6 @@ namespace rgw { using std::get; LookupFHResult fhr{nullptr, 0}; -#if 0 - RGWFileHandle::directory* d = parent->get_directory(); - if (! 
d->name_cache.empty()) { - RGWFileHandle::dirent_string name{path}; - const auto& diter = d->name_cache.find(name); - if (diter != d->name_cache.end()) { - fhr = lookup_fh(parent, path, - RGWFileHandle::FLAG_CREATE| - ((diter->second == RGW_FS_TYPE_DIRECTORY) ? - RGWFileHandle::FLAG_DIRECTORY : - RGWFileHandle::FLAG_NONE)); - if (get<0>(fhr)) - return fhr; - } - } -#endif /* XXX the need for two round-trip operations to identify file or * directory leaf objects is unecessary--the current proposed @@ -643,13 +627,34 @@ namespace rgw { rele(); } /* RGWLibFS::close */ + std::ostream& operator<<(std::ostream &os, RGWLibFS::event const &ev) { + os << ">"; + return os; + } + void RGWLibFS::gc() { using std::get; using directory = RGWFileHandle::directory; - static constexpr uint32_t max_ev = 24; - static constexpr uint16_t expire_s = 300; /* 5m */ + /* dirent invalidate timeout--basically, the upper-bound on + * inconsistency with the S3 namespace */ + auto expire_s + = get_context()->_conf->rgw_nfs_namespace_expire_secs; + + /* max events to gc in one cycle */ + uint32_t max_ev = + std::max(1, get_context()->_conf->rgw_nfs_max_gc); struct timespec now; event_vector ve; @@ -660,11 +665,15 @@ namespace rgw { do { { lock_guard guard(state.mtx); /* LOCKED */ + /* just return if no events */ + if (events.empty()) { + return; + } uint32_t _max_ev = (events.size() < 500) ? 
max_ev : (events.size() / 4); for (uint32_t ix = 0; (ix < _max_ev) && (events.size() > 0); ++ix) { event& ev = events.front(); - if (ev.ts.tv_sec < (now.tv_sec + expire_s)) { + if (ev.ts.tv_sec > (now.tv_sec + expire_s)) { stop = true; break; } @@ -674,8 +683,12 @@ namespace rgw { } /* anon */ /* !LOCKED */ for (auto& ev : ve) { + lsubdout(get_context(), rgw, 15) + << "try-expire ev: " << ev << dendl; if (likely(ev.t == event::type::READDIR)) { RGWFileHandle* rgw_fh = lookup_handle(ev.fhk.fh_hk); + lsubdout(get_context(), rgw, 15) + << "ev rgw_fh: " << rgw_fh << dendl; if (rgw_fh) { RGWFileHandle::directory* d; if (unlikely(! rgw_fh->is_dir())) { @@ -692,14 +705,15 @@ namespace rgw { if (d) { lock_guard guard(rgw_fh->mtx); d->clear_state(); + rgw_fh->invalidate(); } rele: unref(rgw_fh); } /* rgw_fh */ } /* event::type::READDIR */ } /* ev */ - std::this_thread::sleep_for(std::chrono::seconds(120)); - } while (! stop); + ve.clear(); + } while (! (stop || shutdown)); } /* RGWLibFS::gc */ void RGWFileHandle::encode_attrs(ceph::buffer::list& ux_key1, @@ -734,7 +748,6 @@ namespace rgw { int rc = 0; struct timespec now; CephContext* cct = fs->get_context(); - directory* d = get_directory(); /* already type-checked */ (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */ @@ -749,8 +762,9 @@ namespace rgw { offset); rc = rgwlib.get_fe()->execute_req(&req); if (! rc) { - set_nlink(2 + d->name_cache.size()); + lock_guard guard(mtx); state.atime = now; + set_nlink(2 + 1); *eof = req.eof(); event ev(event::type::READDIR, get_key(), state.atime); fs->state.push_event(ev); @@ -760,8 +774,9 @@ namespace rgw { RGWReaddirRequest req(cct, fs->get_user(), this, rcb, cb_arg, offset); rc = rgwlib.get_fe()->execute_req(&req); if (! 
rc) { + lock_guard guard(mtx); state.atime = now; - set_nlink(2 + d->name_cache.size()); + set_nlink(2 + 1); *eof = req.eof(); event ev(event::type::READDIR, get_key(), state.atime); fs->state.push_event(ev); @@ -918,6 +933,18 @@ namespace rgw { delete write_req; } + void RGWFileHandle::directory::clear_state() + { + marker_cache.clear(); + } + + void RGWFileHandle::invalidate() { + RGWLibFS *fs = get_fs(); + if (fs->invalidate_cb) { + fs->invalidate_cb(fs->invalidate_arg, get_key().fh_hk); + } + } + int RGWWriteRequest::exec_start() { struct req_state* s = get_state(); @@ -1133,6 +1160,17 @@ void rgwfile_version(int *major, int *minor, int *extra) return 0; } +/* + register invalidate callbacks +*/ +int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb, + void *arg, uint32_t flags) + +{ + RGWLibFS *fs = static_cast(rgw_fs->fs_private); + return fs->register_invalidate(cb, arg, flags); +} + /* detach rgw namespace */ diff --git a/src/rgw/rgw_file.h b/src/rgw/rgw_file.h index 3e3f4aeafbe41..d594494e288e6 100644 --- a/src/rgw/rgw_file.h +++ b/src/rgw/rgw_file.h @@ -169,7 +169,6 @@ namespace rgw { using dirent_string = basic_sstring; using marker_cache_t = flat_map; - using name_cache_t = flat_map; struct State { uint64_t dev; @@ -199,19 +198,10 @@ namespace rgw { uint32_t flags; marker_cache_t marker_cache; - name_cache_t name_cache; directory() : flags(FLAG_NONE) {} - void clear_state() { - marker_cache.clear(); - name_cache.clear(); - } - - void set_overflow() { - clear_state(); - flags |= FLAG_OVERFLOW; - } + void clear_state(); }; boost::variant variant_type; @@ -469,18 +459,9 @@ namespace rgw { // XXXX check for failure (dup key) d->marker_cache.insert( marker_cache_t::value_type(off, marker.data())); - /* 90% of directories hold <= 32 entries (Yifan Wang, CMU), - * but go big */ - if (d->name_cache.size() < 128) { - d->name_cache.insert( - name_cache_t::value_type(marker.data(), obj_type)); - } else { - d->set_overflow(); // too many - } } } - 
/* XXX */ std::string find_marker(uint64_t off) { // XXX copy using std::get; directory* d = get(&variant_type); @@ -607,6 +588,8 @@ namespace rgw { void decode_attrs(const ceph::buffer::list* ux_key1, const ceph::buffer::list* ux_attrs1); + void invalidate(); + virtual bool reclaim(); typedef cohort::lru::LRU FhLRU; @@ -704,6 +687,9 @@ namespace rgw { CephContext* cct; struct rgw_fs fs; RGWFileHandle root_fh; + rgw_fh_callback_t invalidate_cb; + void *invalidate_arg; + bool shutdown; mutable std::atomic refcnt; @@ -732,6 +718,9 @@ namespace rgw { : t(t), fhk(k), ts(ts) {} }; + friend std::ostream& operator<<(std::ostream &os, + RGWLibFS::event const &ev); + using event_vector = /* boost::small_vector */ std::vector; @@ -759,7 +748,6 @@ namespace rgw { State() : flags(0) {} void push_event(const event& ev) { - lock_guard guard(mtx); events.push_back(ev); } } state; @@ -774,7 +762,8 @@ namespace rgw { RGWLibFS(CephContext* _cct, const char *_uid, const char *_user_id, const char* _key) - : cct(_cct), root_fh(this, get_inst()), refcnt(1), + : cct(_cct), root_fh(this, get_inst()), invalidate_cb(nullptr), + invalidate_arg(nullptr), shutdown(false), refcnt(1), fh_cache(cct->_conf->rgw_nfs_fhcache_partitions, cct->_conf->rgw_nfs_fhcache_size), fh_lru(cct->_conf->rgw_nfs_lru_lanes, @@ -817,6 +806,8 @@ namespace rgw { intrusive_ptr_release(this); } + void stop() { shutdown = true; } + void release_evict(RGWFileHandle* fh) { /* remove from cache, releases sentinel ref */ fh_cache.remove(fh->fh.fh_hk.object, fh, @@ -860,6 +851,12 @@ namespace rgw { return ret; } /* authorize */ + int register_invalidate(rgw_fh_callback_t cb, void *arg, uint32_t flags) { + invalidate_cb = cb; + invalidate_arg = arg; + return 0; + } + /* find RGWFileHandle by id */ LookupFHResult lookup_fh(const fh_key& fhk, const uint32_t flags = RGWFileHandle::FLAG_NONE) { @@ -1050,6 +1047,15 @@ namespace rgw { fh->mtx.unlock(); /* !LOCKED */ out: lat.lock->unlock(); /* !LATCHED */ + + /* special case: 
lookup root_fh */ + if (! fh) { + if (unlikely(fh_hk == root_fh.fh.fh_hk)) { + fh = &root_fh; + ref(fh); + } + } + return fh; } diff --git a/src/rgw/rgw_lib_frontend.h b/src/rgw/rgw_lib_frontend.h index 16bfe408dd35a..65ccda96d01c5 100644 --- a/src/rgw/rgw_lib_frontend.h +++ b/src/rgw/rgw_lib_frontend.h @@ -32,7 +32,13 @@ namespace rgw { void run(); void checkpoint(); - void stop() { shutdown = true; } + + void stop() { + shutdown = true; + for (const auto& fs: mounted_fs) { + fs.second->stop(); + } + } void register_fs(RGWLibFS* fs) { lock_guard guard(mtx); -- 2.39.5