From d1c78fcbfeb386a24e5cb92f6d8c80dfe584bb0e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 14 May 2010 11:32:51 -0700 Subject: [PATCH] mds: allow readdir result limit in bytes This will allow the client to bound the size of the reply it gets --- src/include/ceph_fs.h | 1 + src/mds/CInode.cc | 57 +++++++++++++++++++++++++++++++------------ src/mds/CInode.h | 4 +-- src/mds/Server.cc | 44 +++++++++++++++++++++++++++------ 4 files changed, 81 insertions(+), 25 deletions(-) diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 93487b26c677c..1f9eb2707815a 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -360,6 +360,7 @@ union ceph_mds_request_args { struct { __le32 frag; /* which dir fragment */ __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; } __attribute__ ((packed)) readdir; struct { __le32 mode; diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 612eb0e28b12e..6238e4bbfedf1 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -1601,9 +1601,9 @@ void CInode::decode_snap_blob(bufferlist& snapbl) } -bool CInode::encode_inodestat(bufferlist& bl, Session *session, +int CInode::encode_inodestat(bufferlist& bl, Session *session, SnapRealm *realm, - snapid_t snapid) + snapid_t snapid, unsigned max_bytes) { int client = session->inst.name.num(); assert(snapid); @@ -1709,7 +1709,40 @@ bool CInode::encode_inodestat(bufferlist& bl, Session *session, i = pxattr ? pi:oi; bool had_latest_xattrs = cap && (cap->issued() & CEPH_CAP_XATTR_SHARED) && cap->client_xattr_version == i->xattr_version; + + // xattr + bufferlist xbl; + e.xattr_version = i->xattr_version; + if (!had_latest_xattrs && cap) { + if (!pxattrs) + pxattrs = pxattr ? get_projected_xattrs() : &xattrs; + ::encode(*pxattrs, xbl); + } + bufferlist splits; + for (map::iterator p = dirfragtree._splits.begin(); + p != dirfragtree._splits.end(); + p++) { + ::encode(p->first, bl); + ::encode(p->second, bl); + } + + // do we have room? + if (max_bytes) { + unsigned bytes = sizeof(e); + bytes += sizeof(__u32); + for (map::iterator p = dirfragtree._splits.begin(); + p != dirfragtree._splits.end(); + p++) + bytes += sizeof(p->first) + sizeof(p->second); + bytes += sizeof(__u32) + symlink.length(); + bytes += sizeof(__u32) + xbl.length(); + + if (bytes > max_bytes) + return -ENOSPC; + } + + // encode caps if (snapid != CEPH_NOSNAP) { /* @@ -1768,20 +1801,14 @@ bool CInode::encode_inodestat(bufferlist& bl, Session *session, << " seq " << e.cap.seq << " mseq " << e.cap.mseq << dendl; - // xattr - bufferlist xbl; - e.xattr_version = i->xattr_version; - if (!had_latest_xattrs && - cap && - (cap->pending() & CEPH_CAP_XATTR_SHARED)) { - - if (!pxattrs) - pxattrs = pxattr ? get_projected_xattrs() : &xattrs; - - ::encode(*pxattrs, xbl); - if (cap) + // include those xattrs? + if (xbl.length()) { + if (cap && (cap->pending() & CEPH_CAP_XATTR_SHARED)) { + dout(10) << "including xattrs version " << i->xattr_version << dendl; cap->client_xattr_version = i->xattr_version; - dout(10) << "including xattrs version " << i->xattr_version << dendl; + } else { + xbl.clear(); // no xattrs + } } // encode diff --git a/src/mds/CInode.h b/src/mds/CInode.h index ccb5ad0a9bafc..3b59e47d11557 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -485,8 +485,8 @@ private: // for giving to clients - bool encode_inodestat(bufferlist& bl, Session *session, SnapRealm *realm, - snapid_t snapid=CEPH_NOSNAP); + int encode_inodestat(bufferlist& bl, Session *session, SnapRealm *realm, + snapid_t snapid=CEPH_NOSNAP, unsigned max_bytes=0); void encode_cap_message(MClientCaps *m, Capability *cap); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 21186ec3fef44..075a2bc103307 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2371,7 +2371,19 @@ void Server::handle_client_readdir(MDRequest *mdr) unsigned max = req->head.args.readdir.max_entries; if (!max) max = dir->get_num_any(); // whatever, something big. + unsigned max_bytes = req->head.args.readdir.max_bytes; + if (!max_bytes) + max_bytes = 512 << 10; // 512 KB? + // start final blob + bufferlist dirbl; + dir->encode_dirstat(dirbl, mds->get_nodeid()); + + // count bytes available. + // this isn't perfect, but we should capture the main variable/unbounded size items! + int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2; + int bytes_left = max_bytes - front_bytes; + bytes_left -= realm->get_snap_trace().length(); __u32 numfiles = 0; while (it != dir->end() && numfiles < max) { @@ -2423,6 +2435,13 @@ void Server::handle_client_readdir(MDRequest *mdr) } assert(in); + if ((int)(dnbl.length() + dn->name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) { + dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl; + break; + } + + unsigned start_len = dnbl.length(); + // dentry dout(12) << "including dn " << *dn << dendl; ::encode(dn->name, dnbl); @@ -2430,8 +2449,16 @@ void Server::handle_client_readdir(MDRequest *mdr) // inode dout(12) << "including inode " << *in << dendl; - bool valid = in->encode_inodestat(dnbl, mdr->session, realm, snapid); - assert(valid); + int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length()); + if (r < 0) { + // chop off dn->name, lease + dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl; + bufferlist keep; + keep.substr_of(dnbl, 0, start_len); + dnbl.swap(keep); + break; + } + assert(r >= 0); numfiles++; // touch dn @@ -2440,10 +2467,8 @@ void Server::handle_client_readdir(MDRequest *mdr) __u8 end = (it == dir->end()); __u8 complete = (end && !offset); // FIXME: what purpose does this serve - - // final blob - bufferlist dirbl; - dir->encode_dirstat(dirbl, mds->get_nodeid()); + + // finish final blob ::encode(numfiles, dirbl); ::encode(end, dirbl); ::encode(complete, dirbl); @@ -2453,10 +2478,13 @@ void Server::handle_client_readdir(MDRequest *mdr) dir->log_mark_dirty(); // yay, reply + dout(10) << "reply to " << *req << " readdir num=" << numfiles + << " bytes=" << dirbl.length() + << " end=" << (int)end + << " complete=" << (int)complete + << dendl; MClientReply *reply = new MClientReply(req, 0); reply->set_dir_bl(dirbl); - dout(10) << "reply to " << *req << " readdir num=" << numfiles << " end=" << (int)end - << " complete=" << (int)complete << dendl; // bump popularity. NOTE: this doesn't quite capture it. mds->balancer->hit_dir(g_clock.now(), dir, META_POP_IRD, -1, numfiles); -- 2.39.5