From: Yan, Zheng Date: Mon, 25 Apr 2016 07:31:27 +0000 (+0800) Subject: client: using hash value to compose dentry offset X-Git-Tag: v10.2.2~8^2~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3fe5a09f3e7aa9fc89cd47a261785110c2bf38a2;p=ceph.git client: using hash value to compose dentry offset If MDS sorts dentries in dirfrag in hash order, we use hash value to compose dentry offset. dentry offset is: (0xff << 52) | ((24 bits hash) << 28) | (the nth entry has hash collision) This offset is stable across directory fragmentation. Signed-off-by: Yan, Zheng (cherry picked from commit 680766ec131b95271e320f54dfe6d69ea8d4fbb3) Signed-off-by: Greg Farnum head.args.readdir.frag; - uint64_t readdir_offset = dirp->next_offset; + unsigned readdir_offset = dirp->next_offset; string readdir_start = dirp->last_name; + unsigned last_hash = 0; + if (!readdir_start.empty()) + last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start)); + if (fg != dst.frag) { ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl; fg = dst.frag; readdir_offset = 2; readdir_start.clear(); - dirp->offset = dir_result_t::make_fpos(fg, readdir_offset); + if (!hash_order) + dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false); } ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end - << ", offset " << readdir_offset + << ", hash_order=" << hash_order << ", offset " << readdir_offset << ", readdir_start " << readdir_start << dendl; dirp->buffer_frag = fg; @@ -1162,12 +1168,19 @@ void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, // new dn dn = link(dir, dname, in, NULL); } - update_dentry_lease(dn, &dlease, request->sent_stamp, session); - dn->offset = dir_result_t::make_fpos(fg, readdir_offset++); + update_dentry_lease(dn, &dlease, request->sent_stamp, session); + if (hash_order) { + unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname)); + if (hash != 
last_hash) + readdir_offset = 2; + last_hash = hash; + dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true); + } else { + dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false); + } // add to cached result list dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in)); - ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl; } @@ -6795,7 +6808,6 @@ int Client::_opendir(Inode *in, dir_result_t **dirpp, int uid, int gid) (*dirpp)->start_shared_gen = in->shared_gen; (*dirpp)->owner_uid = uid; (*dirpp)->owner_gid = gid; - ldout(cct, 10) << "_opendir " << in->ino << ", our cache says the first dirfrag is " << (*dirpp)->frag() << dendl; ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl; return 0; } @@ -6846,25 +6858,28 @@ void Client::seekdir(dir_result_t *dirp, loff_t offset) Mutex::Locker lock(client_lock); ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl; - dir_result_t *d = static_cast(dirp); - if (offset == 0 || - dir_result_t::fpos_frag(offset) != d->frag() || - dir_result_t::fpos_off(offset) < d->fragpos()) { - _readdir_drop_dirp_buffer(d); - d->reset(); + if (dirp->hash_order()) { + if (dirp->offset > offset) { + _readdir_drop_dirp_buffer(dirp); + dirp->reset(); + } + } else { + if (offset == 0 || + dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) || + dirp->offset_low() > dir_result_t::fpos_low(offset)) { + _readdir_drop_dirp_buffer(dirp); + dirp->reset(); + } } - if (offset > d->offset) - d->release_count--; // bump if we do a forward seek + if (offset > dirp->offset) + dirp->release_count--; // bump if we do a forward seek - d->offset = offset; + dirp->offset = offset; } - - - //struct dirent { // ino_t d_ino; /* inode number */ // off_t d_off; /* offset to the next dirent */ @@ -6890,14 +6905,24 @@ void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t void 
Client::_readdir_next_frag(dir_result_t *dirp) { - frag_t fg = dirp->frag(); + frag_t fg = dirp->buffer_frag; - // advance - dirp->next_frag(); - if (dirp->at_end()) { + if (fg.is_rightmost()) { ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl; + dirp->set_end(); + return; + } + + // advance + fg = fg.next(); + ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl; + + if (dirp->hash_order()) { + int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true); + if (dirp->offset < new_offset) // don't decrease offset + dirp->offset = new_offset; } else { - ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to " << dirp->frag() << dendl; + dirp->offset = dir_result_t::make_fpos(fg, 2, false); _readdir_rechoose_frag(dirp); } } @@ -6905,11 +6930,17 @@ void Client::_readdir_next_frag(dir_result_t *dirp) void Client::_readdir_rechoose_frag(dir_result_t *dirp) { assert(dirp->inode); - frag_t cur = dirp->frag(); - frag_t f = dirp->inode->dirfragtree[cur.value()]; - if (f != cur) { - ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << f << dendl; - dirp->set_frag(f); + + if (dirp->hash_order()) + return; + + frag_t cur = frag_t(dirp->offset_high()); + frag_t fg = dirp->inode->dirfragtree[cur.value()]; + if (fg != cur) { + ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl; + dirp->offset = dir_result_t::make_fpos(fg, 2, false); + dirp->last_name.clear(); + dirp->next_offset = 2; } } @@ -6925,11 +6956,14 @@ int Client::_readdir_get_frag(dir_result_t *dirp) assert(dirp->inode); // get the current frag. 
- frag_t fg = dirp->frag(); + frag_t fg; + if (dirp->hash_order()) + fg = dirp->inode->dirfragtree[dirp->offset_high()]; + else + fg = frag_t(dirp->offset_high()); ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg - << " next_offset " << dirp->next_offset - << dendl; + << " offset " << hex << dirp->offset << dendl; int op = CEPH_MDS_OP_READDIR; if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR) @@ -7032,8 +7066,7 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p) int r = cb(p, &de, &st, stmask, next_off); // _next_ offset client_lock.Lock(); ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec - << " = " << r - << dendl; + << " = " << r << dendl; if (r < 0) { return r; } @@ -7042,7 +7075,7 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p) if (dirp->at_end()) dirp->next_offset = 2; else - dirp->next_offset = dirp->fragpos(); + dirp->next_offset = dirp->offset_low(); dirp->at_cache_name = dn_name; // we successfully returned this one; update! 
if (r > 0) return r; @@ -7059,18 +7092,15 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) dir_result_t *dirp = static_cast(d); - ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset << dec - << " frag " << dirp->frag() << " fragpos " << hex << dirp->fragpos() << dec - << " at_end=" << dirp->at_end() - << dendl; + ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset + << dec << " at_end=" << dirp->at_end() + << " hash_order=" << dirp->hash_order() << dendl; struct dirent de; struct stat st; memset(&de, 0, sizeof(de)); memset(&st, 0, sizeof(st)); - frag_t fg = dirp->frag(); - InodeRef& diri = dirp->inode; if (dirp->at_end()) @@ -7142,14 +7172,14 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) if (dirp->at_end()) return 0; - if (dirp->buffer_frag != dirp->frag() || dirp->buffer.empty()) { + if (!dirp->is_cached()) { int r = _readdir_get_frag(dirp); if (r) return r; // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is // different than the requested one. (our dirfragtree was outdated) - fg = dirp->buffer_frag; } + frag_t fg = dirp->buffer_frag; ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size() << " offset " << hex << dirp->offset << dendl; @@ -7187,8 +7217,6 @@ int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p) if (!fg.is_rightmost()) { // next frag! 
_readdir_next_frag(dirp); - ldout(cct, 10) << " advancing to next frag: " << fg << " -> " << dirp->frag() << dendl; - fg = dirp->frag(); continue; } diff --git a/src/client/Client.h b/src/client/Client.h index df8ed79aa11..8ca18069fb5 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -161,15 +161,24 @@ struct client_callback_args { struct dir_result_t { static const int SHIFT = 28; static const int64_t MASK = (1 << SHIFT) - 1; + static const int64_t HASH = 0xFFULL << (SHIFT + 24); // impossible frag bits static const loff_t END = 1ULL << (SHIFT + 32); - static uint64_t make_fpos(unsigned frag, unsigned off) { - return ((uint64_t)frag << SHIFT) | (uint64_t)off; + static uint64_t make_fpos(unsigned h, unsigned l, bool hash) { + uint64_t v = ((uint64_t)h<< SHIFT) | (uint64_t)l; + if (hash) + v |= HASH; + else + assert((v & HASH) != HASH); + return v; } - static unsigned fpos_frag(uint64_t p) { - return (p & ~END) >> SHIFT; + static unsigned fpos_high(uint64_t p) { + unsigned v = (p & (END-1)) >> SHIFT; + if ((p & HASH) == HASH) + return ceph_frag_value(v); + return v; } - static unsigned fpos_off(uint64_t p) { + static unsigned fpos_low(uint64_t p) { return p & MASK; } static int fpos_cmp(uint64_t l, uint64_t r) { @@ -181,12 +190,15 @@ struct dir_result_t { return fpos_low(l) < fpos_low(r) ? 
-1 : 1; } - InodeRef inode; int owner_uid; int owner_gid; - int64_t offset; // high bits: frag_t, low bits: an offset + int64_t offset; // hash order: + // (0xff << 52) | ((24 bits hash) << 28) | + // (the nth entry has hash collision); + // frag+name order; + // ((frag value) << 28) | (the nth entry in frag); unsigned next_offset; // offset of next chunk (last_name's + 1) string last_name; // last entry in previous chunk @@ -216,23 +228,25 @@ struct dir_result_t { explicit dir_result_t(Inode *in); - frag_t frag() { return frag_t(offset >> SHIFT); } - unsigned fragpos() { return offset & MASK; } + unsigned offset_high() { return fpos_high(offset); } + unsigned offset_low() { return fpos_low(offset); } - void next_frag() { - frag_t fg = offset >> SHIFT; - if (fg.is_rightmost()) - set_end(); - else - set_frag(fg.next()); - } - void set_frag(frag_t f) { - offset = (uint64_t)f << SHIFT; - assert(sizeof(offset) == 8); - } void set_end() { offset |= END; } bool at_end() { return (offset & END); } + void set_hash_order() { offset |= HASH; } + bool hash_order() { return (offset & HASH) == HASH; } + + bool is_cached() { + if (buffer.empty()) + return false; + if (hash_order()) { + return buffer_frag.contains(offset_high()); + } else { + return buffer_frag == frag_t(offset_high()); + } + } + void reset() { last_name.clear(); at_cache_name.clear(); diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index fb426bf5e80..e8f5f2f5d57 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -397,6 +397,7 @@ extern const char *ceph_mds_op_name(int op); */ #define CEPH_READDIR_FRAG_END (1<<0) #define CEPH_READDIR_FRAG_COMPLETE (1<<8) +#define CEPH_READDIR_HASH_ORDER (1<<9) union ceph_mds_request_args { struct { diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 0fb53e3b70f..514f4d958e9 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -3446,6 +3446,7 @@ void Server::handle_client_readdir(MDRequestRef& mdr) } // client only understand END and COMPLETE 
flags ? if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) { + flags |= CEPH_READDIR_HASH_ORDER; } // finish final blob