From: Yan, Zheng Date: Mon, 9 Feb 2015 06:21:02 +0000 (+0800) Subject: mds: optimize memory usage of inode_t X-Git-Tag: v9.0.0~201^2~7 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=aa46d487bf974653356857688b80f75b366510fc;p=ceph.git mds: optimize memory usage of inode_t inode_t::old_pools is rarely used. Defining it as compact_set can save 40 bytes. inline_data is also rarely used; dynamically allocating bufferlist for inline_data can save another 72 bytes. Signed-off-by: Yan, Zheng --- diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index cdeb23bb13e..ad14701f7b2 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -1070,15 +1070,12 @@ void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt) in = diri; pdn = in->get_parent_dn(); } - vector::iterator i = inode.old_pools.begin(); - while(i != inode.old_pools.end()) { + for (compact_set::iterator i = inode.old_pools.begin(); + i != inode.old_pools.end(); + ++i) { // don't add our own pool id to old_pools to avoid looping (e.g. 
setlayout 0, 1, 0) - if (*i == pool) { - ++i; - continue; - } - bt.old_pools.insert(*i); - ++i; + if (*i != pool) + bt.old_pools.insert(*i); } } @@ -1134,11 +1131,10 @@ void CInode::store_backtrace(MDSInternalContextBase *fin, int op_prio) mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context), 0, NULL, gather.new_sub()); - set old_pools; - for (vector::iterator p = inode.old_pools.begin(); - p != inode.old_pools.end(); - ++p) { - if (*p == pool || old_pools.count(*p)) + for (compact_set::iterator p = inode.old_pools.begin(); + p != inode.old_pools.end(); + ++p) { + if (*p == pool) continue; ObjectOperation op; @@ -1149,7 +1145,6 @@ void CInode::store_backtrace(MDSInternalContextBase *fin, int op_prio) object_locator_t oloc(*p); mdcache->mds->objecter->mutate(oid, oloc, op, snapc, ceph_clock_now(g_ceph_context), 0, NULL, gather.new_sub()); - old_pools.insert(*p); } gather.activate(); } @@ -1373,7 +1368,6 @@ void CInode::encode_lock_state(int type, bufferlist& bl) ::encode(inode.truncate_size, bl); ::encode(inode.client_ranges, bl); ::encode(inode.inline_data, bl); - ::encode(inode.inline_version, bl); } } else { // treat flushing as dirty when rejoining cache @@ -1571,7 +1565,6 @@ void CInode::decode_lock_state(int type, bufferlist& bl) ::decode(inode.truncate_size, p); ::decode(inode.client_ranges, p); ::decode(inode.inline_data, p); - ::decode(inode.inline_version, p); } } else { bool replica_dirty; @@ -2842,7 +2835,7 @@ int CInode::get_caps_allowed_for_client(client_t client) const } else { allowed = get_caps_allowed_by_type(CAP_ANY); } - if (inode.inline_version != CEPH_INLINE_NONE && + if (inode.inline_data.version != CEPH_INLINE_NONE && !mdcache->mds->get_session(client)->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR); return allowed; @@ -3056,13 +3049,14 @@ int CInode::encode_inodestat(bufferlist& bl, Session *session, // inline data version_t inline_version = 0; 
bufferlist inline_data; - if (i->inline_version == CEPH_INLINE_NONE) { + if (i->inline_data.version == CEPH_INLINE_NONE) { inline_version = CEPH_INLINE_NONE; } else if ((!cap && !no_caps) || - (cap && cap->client_inline_version < i->inline_version) || + (cap && cap->client_inline_version < i->inline_data.version) || (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data - inline_version = i->inline_version; - inline_data = i->inline_data; + inline_version = i->inline_data.version; + if (i->inline_data.length() > 0) + inline_data = i->inline_data.get_data(); } // nest (do same as file... :/) @@ -3257,9 +3251,10 @@ void CInode::encode_cap_message(MClientCaps *m, Capability *cap) i->atime.encode_timeval(&m->head.atime); m->head.time_warp_seq = i->time_warp_seq; - if (cap->client_inline_version < i->inline_version) { - m->inline_version = cap->client_inline_version = i->inline_version; - m->inline_data = i->inline_data; + if (cap->client_inline_version < i->inline_data.version) { + m->inline_version = cap->client_inline_version = i->inline_data.version; + if (i->inline_data.length() > 0) + m->inline_data = i->inline_data.get_data(); } else { m->inline_version = 0; } diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index d5dd693c3db..4ea960fa491 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -1888,7 +1888,7 @@ bool Locker::issue_caps(CInode *in, Capability *only_cap) allowed |= xlocker_allowed & in->get_xlocker_mask(it->first); Session *session = mds->get_session(it->first); - if (in->inode.inline_version != CEPH_INLINE_NONE && + if (in->inode.inline_data.version != CEPH_INLINE_NONE && !(session && session->connection && session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))) allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR); @@ -2918,9 +2918,12 @@ void Locker::_update_cap_fields(CInode *in, int dirty, MClientCaps *m, inode_t * } if (in->inode.is_file() && (dirty & CEPH_CAP_FILE_WR) && - inline_version > pi->inline_version) { - 
pi->inline_version = inline_version; - pi->inline_data = m->inline_data; + inline_version > pi->inline_data.version) { + pi->inline_data.version = inline_version; + if (inline_version != CEPH_INLINE_NONE && m->inline_data.length() > 0) + pi->inline_data.get_data() = m->inline_data; + else + pi->inline_data.free_data(); } if ((dirty & CEPH_CAP_FILE_EXCL) && atime != pi->atime) { dout(7) << " atime " << pi->atime << " -> " << atime diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 385c0f9fc45..2d43317cc1f 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -9243,7 +9243,7 @@ void MDCache::purge_stray(CDentry *dn) NULL, gather.new_sub()); } // remove old backtrace objects - for (vector::iterator p = pi->old_pools.begin(); + for (compact_set::iterator p = pi->old_pools.begin(); p != pi->old_pools.end(); ++p) { object_locator_t oloc(*p); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index edc94cb3006..39b7bd7a22b 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2214,7 +2214,7 @@ CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino if (!mds->mdsmap->get_inline_data_enabled() || !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) - in->inode.inline_version = CEPH_INLINE_NONE; + in->inode.inline_data.version = CEPH_INLINE_NONE; mdcache->add_inode(in); // add dout(10) << "prepare_new_inode " << *in << dendl; @@ -2750,7 +2750,7 @@ void Server::handle_client_open(MDRequestRef& mdr) return; } - if (cur->inode.inline_version != CEPH_INLINE_NONE && + if (cur->inode.inline_data.version != CEPH_INLINE_NONE && !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) { dout(7) << "old client cannot open inline data file " << *cur << dendl; respond_to_request(mdr, -EPERM); diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index ffb5086f3a3..df2fa8bfea1 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -229,6 +229,27 @@ ostream& operator<<(ostream& out, const 
client_writeable_range_t& r) return out << r.range.first << '-' << r.range.last << "@" << r.follows; } +/* + * inline_data_t + */ +void inline_data_t::encode(bufferlist &bl) const +{ + ::encode(version, bl); + if (blp) + ::encode(*blp, bl); + else + ::encode(bufferlist(), bl); +} +void inline_data_t::decode(bufferlist::iterator &p) +{ + ::decode(version, p); + uint32_t inline_len; + ::decode(inline_len, p); + if (inline_len > 0) + ::decode_nohead(inline_len, get_data(), p); + else + free_data(); +} /* * inode_t @@ -274,9 +295,7 @@ void inode_t::encode(bufferlist &bl) const ::encode(backtrace_version, bl); ::encode(old_pools, bl); ::encode(max_size_ever, bl); - ::encode(inline_version, bl); ::encode(inline_data, bl); - ::encode(quota, bl); ENCODE_FINISH(bl); @@ -340,10 +359,9 @@ void inode_t::decode(bufferlist::iterator &p) if (struct_v >= 8) ::decode(max_size_ever, p); if (struct_v >= 9) { - ::decode(inline_version, p); ::decode(inline_data, p); } else { - inline_version = CEPH_INLINE_NONE; + inline_data.version = CEPH_INLINE_NONE; } if (struct_v < 10) backtrace_version = 0; // force update backtrace @@ -372,10 +390,10 @@ void inode_t::dump(Formatter *f) const f->close_section(); f->open_array_section("old_pools"); - vector::const_iterator i = old_pools.begin(); - while(i != old_pools.end()) { + for (compact_set::const_iterator i = old_pools.begin(); + i != old_pools.end(); + ++i) f->dump_int("pool", *i); - } f->close_section(); f->dump_unsigned("size", size); @@ -444,9 +462,7 @@ int inode_t::compare(const inode_t &other, bool *divergent) const mtime != other.mtime || atime != other.atime || time_warp_seq != other.time_warp_seq || - !(*const_cast(&inline_data) == - *const_cast(&other.inline_data)) || - inline_version != other.inline_version || + inline_data != other.inline_data || client_ranges != other.client_ranges || !(dirstat == other.dirstat) || !(rstat == other.rstat) || @@ -472,7 +488,7 @@ bool inode_t::older_is_consistent(const inode_t &other) const if 
(max_size_ever < other.max_size_ever || truncate_seq < other.truncate_seq || time_warp_seq < other.time_warp_seq || - inline_version < other.inline_version || + inline_data.version < other.inline_data.version || dirstat.version < other.dirstat.version || rstat.version < other.rstat.version || accounted_rstat.version < other.accounted_rstat.version || diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index e633fb58a24..975f9004059 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -19,6 +19,7 @@ #include "include/xlist.h" #include "include/interval_set.h" #include "include/compact_map.h" +#include "include/compact_set.h" #include "inode_backtrace.h" @@ -357,6 +358,51 @@ inline bool operator==(const client_writeable_range_t& l, l.follows == r.follows; } +struct inline_data_t { +private: + bufferlist *blp; +public: + version_t version; + + void free_data() { + delete blp; + blp = NULL; + } + bufferlist& get_data() { + if (!blp) + blp = new bufferlist; + return *blp; + } + size_t length() const { return blp ? 
blp->length() : 0; } + + inline_data_t() : blp(0), version(1) {} + inline_data_t(const inline_data_t& o) : blp(0), version(o.version) { + if (o.blp) + get_data() = *o.blp; + } + ~inline_data_t() { + free_data(); + } + inline_data_t& operator=(const inline_data_t& o) { + version = o.version; + if (o.blp) + get_data() = *o.blp; + else + free_data(); + return *this; + } + bool operator==(const inline_data_t& o) const { + return length() == o.length() && + (length() == 0 || + (*const_cast(blp) == *const_cast(o.blp))); + } + bool operator!=(const inline_data_t& o) const { + return !(*this == o); + } + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator& bl); +}; +WRITE_CLASS_ENCODER(inline_data_t) /* * inode_t @@ -385,7 +431,7 @@ struct inode_t { // file (data access) ceph_dir_layout dir_layout; // [dir only] ceph_file_layout layout; - vector old_pools; + compact_set old_pools; uint64_t size; // on directory, # dentries uint64_t max_size_ever; // max size the file has ever been uint32_t truncate_seq; @@ -394,8 +440,7 @@ struct inode_t { utime_t mtime; // file data modify time. utime_t atime; // file data access time. uint32_t time_warp_seq; // count of (potential) mtime/atime timewarps (i.e., utimes()) - bufferlist inline_data; - version_t inline_version; + inline_data_t inline_data; std::map client_ranges; // client(s) can write to these ranges @@ -421,7 +466,6 @@ struct inode_t { truncate_seq(0), truncate_size(0), truncate_from(0), truncate_pending(0), time_warp_seq(0), - inline_version(1), version(0), file_data_version(0), xattr_version(0), backtrace_version(0) { clear_layout(); memset(&dir_layout, 0, sizeof(dir_layout)); @@ -504,7 +548,7 @@ struct inode_t { void add_old_pool(int64_t l) { backtrace_version = version; - old_pools.push_back(l); + old_pools.insert(l); } void encode(bufferlist &bl) const;