From: Yan, Zheng Date: Wed, 9 Sep 2015 06:14:33 +0000 (+0800) Subject: client: use faked inode number when sizeof(ino_t) < 8 X-Git-Tag: v9.1.0~56^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=24b20a8fcdbb588a8c2b7e0d66e82260e6158343;p=ceph.git client: use faked inode number when sizeof(ino_t) < 8 Cephfs uses 48-bit inode numbers. When ino_t is 32 bits, the upper 16 bits of the inode number get lost for stat(2) and readdir(2). A 32-bit ino_t is even more problematic with ceph-fuse. In the ceph-fuse case, we need to encode both the cephfs inode number and the snapid into fuse_ino_t; 32 bits are clearly not enough. The fix is to assign a 32-bit faked inode number to each cached inode. Faked inode numbers are used for stat(2), readdir(2) and fuse. We maintain a data structure that maps each faked inode number to cephfs's vinodeno_t, so we can find an inode by its 32-bit faked inode number. This approach should work as long as there are fewer than 4 billion cached inodes. Signed-off-by: Yan, Zheng --- diff --git a/src/client/Client.cc b/src/client/Client.cc index 638ceecb585..ea91a5b0f9d 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -161,6 +161,60 @@ dir_result_t::dir_result_t(Inode *in) buffer(0) { } +void Client::_reset_faked_inos() +{ + ino_t start = 1024; + free_faked_inos.clear(); + free_faked_inos.insert(start, (uint32_t)-1 - start + 1); + last_used_faked_ino = 0; + _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos; +} + +void Client::_assign_faked_ino(Inode *in) +{ + interval_set::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1); + if (it == free_faked_inos.end() && last_used_faked_ino > 0) { + last_used_faked_ino = 0; + it = free_faked_inos.lower_bound(last_used_faked_ino + 1); + } + assert(it != free_faked_inos.end()); + if (last_used_faked_ino < it.get_start()) { + assert(it.get_len() > 0); + last_used_faked_ino = it.get_start(); + } else { + ++last_used_faked_ino; + 
assert(it.get_start() + it.get_len() > last_used_faked_ino); + } + in->faked_ino = last_used_faked_ino; + free_faked_inos.erase(in->faked_ino); + faked_ino_map[in->faked_ino] = in->vino(); +} + +void Client::_release_faked_ino(Inode *in) +{ + free_faked_inos.insert(in->faked_ino); + faked_ino_map.erase(in->faked_ino); +} + +vinodeno_t Client::_map_faked_ino(ino_t ino) +{ + vinodeno_t vino; + if (ino == 1) + vino = root->vino(); + else if (faked_ino_map.count(ino)) + vino = faked_ino_map[ino]; + else + vino = vinodeno_t(0, CEPH_NOSNAP); + ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl; + return vino; +} + +vinodeno_t Client::map_faked_ino(ino_t ino) +{ + Mutex::Locker lock(client_lock); + return _map_faked_ino(ino); +} + // cons/des Client::Client(Messenger *m, MonClient *mc) @@ -193,6 +247,7 @@ Client::Client(Messenger *m, MonClient *mc) { monclient->set_messenger(m); + _reset_faked_inos(); // root = 0; @@ -276,6 +331,7 @@ void Client::tear_down_cache() while (!root_parents.empty()) root_parents.erase(root_parents.begin()); inode_map.clear(); + _reset_faked_inos(); } assert(inode_map.empty()); @@ -573,6 +629,7 @@ void Client::trim_cache(bool trim_kernel_dcache) while (!root_parents.empty()) root_parents.erase(root_parents.begin()); inode_map.clear(); + _reset_faked_inos(); } } @@ -745,6 +802,10 @@ Inode * Client::add_update_inode(InodeStat *st, utime_t from, } else { in = new Inode(this, st->vino, &st->layout); inode_map[st->vino] = in; + + if (use_faked_inos()) + _assign_faked_ino(in); + if (!root) { root = in; root_ancestor = in; @@ -2576,6 +2637,9 @@ void Client::put_inode(Inode *in, int n) assert(!unclean); put_qtree(in); inode_map.erase(in->vino()); + if (use_faked_inos()) + _release_faked_ino(in); + in->cap_item.remove_myself(); in->snaprealm_item.remove_myself(); in->snapdir_parent.reset(); @@ -3309,7 +3373,10 @@ public: void Client::_async_invalidate(InodeRef& in, int64_t off, int64_t len, bool keep_caps) { ldout(cct, 10) << 
"_async_invalidate " << off << "~" << len << (keep_caps ? " keep_caps" : "") << dendl; - ino_invalidate_cb(callback_handle, in->vino(), off, len); + if (use_faked_inos()) + ino_invalidate_cb(callback_handle, vinodeno_t(in->faked_ino, CEPH_NOSNAP), off, len); + else + ino_invalidate_cb(callback_handle, in->vino(), off, len); client_lock.Lock(); if (!keep_caps) @@ -4393,10 +4460,16 @@ private: public: C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) : client(c), name(dn->name) { - dirino = dn->dir->parent_inode->vino(); - if (del) - ino = dn->inode->vino(); - else + if (client->use_faked_inos()) { + dirino.ino = dn->dir->parent_inode->faked_ino; + if (del) + ino.ino = dn->inode->faked_ino; + } else { + dirino = dn->dir->parent_inode->vino(); + if (del) + ino = dn->inode->vino(); + } + if (!del) ino.ino = inodeno_t(); } void finish(int r) { @@ -5795,7 +5868,10 @@ int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_inf << " mode 0" << oct << in->mode << dec << " mtime " << in->mtime << " ctime " << in->ctime << dendl; memset(st, 0, sizeof(struct stat)); - st->st_ino = in->ino; + if (use_faked_inos()) + st->st_ino = in->faked_ino; + else + st->st_ino = in->ino; st->st_dev = in->snapid; st->st_mode = in->mode; st->st_rdev = in->rdev; @@ -6349,9 +6425,8 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir uint64_t next_off = 1; - fill_dirent(&de, ".", S_IFDIR, diri->ino, next_off); - fill_stat(diri, &st); + fill_dirent(&de, ".", S_IFDIR, st.st_ino, next_off); client_lock.Unlock(); int r = cb(p, &de, &st, -1, next_off); @@ -6368,8 +6443,8 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) ldout(cct, 15) << " including .." 
<< dendl; if (!diri->dn_set.empty()) { InodeRef& in = diri->get_first_parent()->inode; - fill_dirent(&de, "..", S_IFDIR, in->ino, 2); fill_stat(in, &st); + fill_dirent(&de, "..", S_IFDIR, st.st_ino, 2); } else { /* must be at the root (no parent), * so we add the dotdot with a special inode (3) */ @@ -8505,8 +8580,10 @@ Inode *Client::open_snapdir(Inode *diri) in->size = diri->size; in->dirfragtree.clear(); - inode_map[vino] = in; in->snapdir_parent = diri; + inode_map[vino] = in; + if (use_faked_inos()) + _assign_faked_ino(in); ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl; } else { in = inode_map[vino]; @@ -8652,6 +8729,18 @@ snapid_t Client::ll_get_snapid(Inode *in) return in->snapid; } +Inode *Client::ll_get_inode(ino_t ino) +{ + Mutex::Locker lock(client_lock); + vinodeno_t vino = _map_faked_ino(ino); + unordered_map::iterator p = inode_map.find(vino); + if (p == inode_map.end()) + return NULL; + Inode *in = p->second; + _ll_get(in); + return in; +} + Inode *Client::ll_get_inode(vinodeno_t vino) { Mutex::Locker lock(client_lock); diff --git a/src/client/Client.h b/src/client/Client.h index 81137452edc..bc775eca03e 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -346,6 +346,17 @@ protected: // cache ceph::unordered_map inode_map; + + // fake inode number for 32-bits ino_t + ceph::unordered_map faked_ino_map; + interval_set free_faked_inos; + ino_t last_used_faked_ino; + void _assign_faked_ino(Inode *in); + void _release_faked_ino(Inode *in); + bool _use_faked_inos; + void _reset_faked_inos(); + vinodeno_t _map_faked_ino(ino_t ino); + Inode* root; map root_parents; Inode* root_ancestor; @@ -615,6 +626,8 @@ protected: Dentry *old_dentry = NULL); void update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session); + bool use_faked_inos() { return _use_faked_inos; } + vinodeno_t map_faked_ino(ino_t ino); // ---------------------- // fs ops. 
@@ -925,6 +938,8 @@ public: Mutex::Locker lock(client_lock); return _get_vino(in); } + // get inode from faked ino + Inode *ll_get_inode(ino_t ino); Inode *ll_get_inode(vinodeno_t vino); int ll_lookup(Inode *parent, const char *name, struct stat *attr, Inode **out, int uid = -1, int gid = -1); diff --git a/src/client/Inode.cc b/src/client/Inode.cc index 16eee7a7aa4..9d17baf730a 100644 --- a/src/client/Inode.cc +++ b/src/client/Inode.cc @@ -11,7 +11,8 @@ ostream& operator<<(ostream &out, Inode &in) { out << in.vino() << "(" - << "ref=" << in._ref + << "faked_ino=" << in.faked_ino + << " ref=" << in._ref << " ll_ref=" << in.ll_ref << " cap_refs=" << in.cap_refs << " open=" << in.open_by_mode diff --git a/src/client/Inode.h b/src/client/Inode.h index f18f65272d9..a1500b4c333 100644 --- a/src/client/Inode.h +++ b/src/client/Inode.h @@ -159,6 +159,8 @@ struct Inode { // -- the actual inode -- inodeno_t ino; snapid_t snapid; + ino_t faked_ino; + uint32_t rdev; // if special file // affected by any inode change... 
@@ -296,7 +298,7 @@ struct Inode { xlist unsafe_dir_ops; Inode(Client *c, vinodeno_t vino, ceph_file_layout *newlayout) - : client(c), ino(vino.ino), snapid(vino.snapid), + : client(c), ino(vino.ino), snapid(vino.snapid), faked_ino(0), rdev(0), mode(0), uid(0), gid(0), nlink(0), size(0), truncate_seq(1), truncate_size(-1), time_warp_seq(0), max_size(0), version(0), xattr_version(0), diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index 410509d7c8f..daa5e3dcf96 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -72,9 +72,8 @@ public: void finalize(); uint64_t fino_snap(uint64_t fino); - vinodeno_t fino_vino(inodeno_t fino); uint64_t make_fake_ino(inodeno_t ino, snapid_t snapid); - Inode * iget(inodeno_t fino); + Inode * iget(fuse_ino_t fino); void iput(Inode *in); int fd_on_success; @@ -1009,27 +1008,27 @@ int CephFuse::Handle::loop() uint64_t CephFuse::Handle::fino_snap(uint64_t fino) { - Mutex::Locker l(stag_lock); - uint64_t stag = FINO_STAG(fino); - assert(stag_snap_map.count(stag)); - return stag_snap_map[stag]; -} - -vinodeno_t CephFuse::Handle::fino_vino(inodeno_t fino) -{ - if (fino.val == 1) { - fino = inodeno_t(client->get_root_ino()); + if (client->use_faked_inos()) { + vinodeno_t vino = client->map_faked_ino(fino); + return vino.snapid; + } else { + Mutex::Locker l(stag_lock); + uint64_t stag = FINO_STAG(fino); + assert(stag_snap_map.count(stag)); + return stag_snap_map[stag]; } - vinodeno_t vino(FINO_INO(fino), fino_snap(fino)); - //cout << "fino_vino " << fino << " -> " << vino << std::endl; - return vino; } -Inode * CephFuse::Handle::iget(inodeno_t fino) +Inode * CephFuse::Handle::iget(fuse_ino_t fino) { - Inode *in = - client->ll_get_inode(fino_vino(fino)); - return in; + if (client->use_faked_inos()) { + return client->ll_get_inode((ino_t)fino); + } else { + if (fino == 1) + fino = inodeno_t(client->get_root_ino()); + vinodeno_t vino(FINO_INO(fino), fino_snap(fino)); + return client->ll_get_inode(vino); + } } void 
CephFuse::Handle::iput(Inode *in) @@ -1039,17 +1038,22 @@ void CephFuse::Handle::iput(Inode *in) uint64_t CephFuse::Handle::make_fake_ino(inodeno_t ino, snapid_t snapid) { - Mutex::Locker l(stag_lock); - uint64_t stag; - if (snap_stag_map.count(snapid) == 0) { - stag = ++last_stag; - snap_stag_map[snapid] = stag; - stag_snap_map[stag] = snapid; - } else - stag = snap_stag_map[snapid]; - inodeno_t fino = MAKE_FINO(ino, stag); - //cout << "make_fake_ino " << ino << "." << snapid << " -> " << fino << std::endl; - return fino; + if (client->use_faked_inos()) { + // already faked by libcephfs + return ino; + } else { + Mutex::Locker l(stag_lock); + uint64_t stag; + if (snap_stag_map.count(snapid) == 0) { + stag = ++last_stag; + snap_stag_map[snapid] = stag; + stag_snap_map[stag] = snapid; + } else + stag = snap_stag_map[snapid]; + inodeno_t fino = MAKE_FINO(ino, stag); + //cout << "make_fake_ino " << ino << "." << snapid << " -> " << fino << std::endl; + return fino; + } } CephFuse::CephFuse(Client *c, int fd) : _handle(new CephFuse::Handle(c, fd)) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index a09873666e3..3b03fb17c8b 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -364,6 +364,7 @@ OPTION(fuse_multithreaded, OPT_BOOL, true) OPTION(client_try_dentry_invalidate, OPT_BOOL, true) // the client should try to use dentry invaldation instead of remounting, on kernels it believes that will work for OPTION(client_die_on_failed_remount, OPT_BOOL, true) OPTION(client_check_pool_perm, OPT_BOOL, true) +OPTION(client_use_faked_inos, OPT_BOOL, false) OPTION(crush_location, OPT_STR, "") // whitespace-separated list of key=value pairs describing crush location