::decode(flags, p);
bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
+ bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
frag_t fg = (unsigned)request->head.args.readdir.frag;
- uint64_t readdir_offset = dirp->next_offset;
+ unsigned readdir_offset = dirp->next_offset;
string readdir_start = dirp->last_name;
+ unsigned last_hash = 0;
+ if (!readdir_start.empty())
+ last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
+
if (fg != dst.frag) {
ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
fg = dst.frag;
readdir_offset = 2;
readdir_start.clear();
- dirp->offset = dir_result_t::make_fpos(fg, readdir_offset);
+ if (!hash_order)
+ dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
}
ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
- << ", offset " << readdir_offset
+ << ", hash_order=" << hash_order << ", offset " << readdir_offset
<< ", readdir_start " << readdir_start << dendl;
dirp->buffer_frag = fg;
// new dn
dn = link(dir, dname, in, NULL);
}
- update_dentry_lease(dn, &dlease, request->sent_stamp, session);
- dn->offset = dir_result_t::make_fpos(fg, readdir_offset++);
+ update_dentry_lease(dn, &dlease, request->sent_stamp, session);
+ if (hash_order) {
+ unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
+ if (hash != last_hash)
+ readdir_offset = 2;
+ last_hash = hash;
+ dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
+ } else {
+ dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
+ }
// add to cached result list
dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
-
ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
}
(*dirpp)->start_shared_gen = in->shared_gen;
(*dirpp)->owner_uid = uid;
(*dirpp)->owner_gid = gid;
- ldout(cct, 10) << "_opendir " << in->ino << ", our cache says the first dirfrag is " << (*dirpp)->frag() << dendl;
ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
return 0;
}
Mutex::Locker lock(client_lock);
ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;
- dir_result_t *d = static_cast<dir_result_t*>(dirp);
- if (offset == 0 ||
- dir_result_t::fpos_frag(offset) != d->frag() ||
- dir_result_t::fpos_off(offset) < d->fragpos()) {
- _readdir_drop_dirp_buffer(d);
- d->reset();
+ if (dirp->hash_order()) {
+ if (dirp->offset > offset) {
+ _readdir_drop_dirp_buffer(dirp);
+ dirp->reset();
+ }
+ } else {
+ if (offset == 0 ||
+ dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
+ dirp->offset_low() > dir_result_t::fpos_low(offset)) {
+ _readdir_drop_dirp_buffer(dirp);
+ dirp->reset();
+ }
}
- if (offset > d->offset)
- d->release_count--; // bump if we do a forward seek
+ if (offset > dirp->offset)
+ dirp->release_count--; // bump if we do a forward seek
- d->offset = offset;
+ dirp->offset = offset;
}
-
-
-
//struct dirent {
// ino_t d_ino; /* inode number */
// off_t d_off; /* offset to the next dirent */
// Advance dirp from the frag currently held in its buffer (buffer_frag) to the
// next frag. If buffer_frag is the rightmost frag, the position is marked END.
//   hash order:      offset moves to low offset 2 of the next frag's value but
//                    is never moved backwards; last_name is left untouched.
//   frag+name order: offset restarts at low offset 2 of the next frag, the
//                    chunk continuation state (last_name, next_offset) is reset
//                    with it, and _readdir_rechoose_frag() is then called.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
- frag_t fg = dirp->frag();
+ frag_t fg = dirp->buffer_frag;
- // advance
- dirp->next_frag();
- if (dirp->at_end()) {
+ if (fg.is_rightmost()) {
ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
+ dirp->set_end();
+ return;
+ }
+
+ // advance
+ fg = fg.next();
+ ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;
+
+ if (dirp->hash_order()) {
+ int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
+ if (dirp->offset < new_offset) // don't decrease offset
+ dirp->offset = new_offset;
} else {
- ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to " << dirp->frag() << dendl;
// last_name / next_offset describe the previous chunk, i.e. the frag we are
// leaving. Reset them together with the offset (as _readdir_rechoose_frag
// does when it moves the position) so the new frag is read from its start.
+ dirp->last_name.clear();
+ dirp->offset = dir_result_t::make_fpos(fg, 2, false);
+ dirp->next_offset = 2;
_readdir_rechoose_frag(dirp);
}
}
// Re-resolve the frag encoded in dirp->offset against the inode's dirfragtree.
// Returns immediately in hash order. In frag+name order, if the dirfragtree
// maps the current frag's value to a different frag, the position is reset to
// low offset 2 of that frag and last_name / next_offset are reset as well.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
assert(dirp->inode);
- frag_t cur = dirp->frag();
- frag_t f = dirp->inode->dirfragtree[cur.value()];
- if (f != cur) {
- ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << f << dendl;
- dirp->set_frag(f);
+
// Nothing to re-resolve here when the offset is in hash order.
+ if (dirp->hash_order())
+ return;
+
// The high part of the offset is interpreted as a frag in frag+name order.
+ frag_t cur = frag_t(dirp->offset_high());
+ frag_t fg = dirp->inode->dirfragtree[cur.value()];
+ if (fg != cur) {
+ ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
// Restart at the beginning of the frag the tree maps to.
+ dirp->offset = dir_result_t::make_fpos(fg, 2, false);
+ dirp->last_name.clear();
+ dirp->next_offset = 2;
}
}
assert(dirp->inode);
// get the current frag.
- frag_t fg = dirp->frag();
+ frag_t fg;
+ if (dirp->hash_order())
+ fg = dirp->inode->dirfragtree[dirp->offset_high()];
+ else
+ fg = frag_t(dirp->offset_high());
ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
- << " next_offset " << dirp->next_offset
- << dendl;
+ << " offset " << hex << dirp->offset << dendl;
int op = CEPH_MDS_OP_READDIR;
if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
int r = cb(p, &de, &st, stmask, next_off); // _next_ offset
client_lock.Lock();
ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
- << " = " << r
- << dendl;
+ << " = " << r << dendl;
if (r < 0) {
return r;
}
if (dirp->at_end())
dirp->next_offset = 2;
else
- dirp->next_offset = dirp->fragpos();
+ dirp->next_offset = dirp->offset_low();
dirp->at_cache_name = dn_name; // we successfully returned this one; update!
if (r > 0)
return r;
dir_result_t *dirp = static_cast<dir_result_t*>(d);
- ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset << dec
- << " frag " << dirp->frag() << " fragpos " << hex << dirp->fragpos() << dec
- << " at_end=" << dirp->at_end()
- << dendl;
+ ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
+ << dec << " at_end=" << dirp->at_end()
+ << " hash_order=" << dirp->hash_order() << dendl;
struct dirent de;
struct stat st;
memset(&de, 0, sizeof(de));
memset(&st, 0, sizeof(st));
- frag_t fg = dirp->frag();
-
InodeRef& diri = dirp->inode;
if (dirp->at_end())
if (dirp->at_end())
return 0;
- if (dirp->buffer_frag != dirp->frag() || dirp->buffer.empty()) {
+ if (!dirp->is_cached()) {
int r = _readdir_get_frag(dirp);
if (r)
return r;
// _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
// different than the requested one. (our dirfragtree was outdated)
- fg = dirp->buffer_frag;
}
+ frag_t fg = dirp->buffer_frag;
ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
<< " offset " << hex << dirp->offset << dendl;
if (!fg.is_rightmost()) {
// next frag!
_readdir_next_frag(dirp);
- ldout(cct, 10) << " advancing to next frag: " << fg << " -> " << dirp->frag() << dendl;
- fg = dirp->frag();
continue;
}
struct dir_result_t {
static const int SHIFT = 28;
static const int64_t MASK = (1 << SHIFT) - 1;
+ static const int64_t HASH = 0xFFULL << (SHIFT + 24); // impossible frag bits
static const loff_t END = 1ULL << (SHIFT + 32);
// Pack a high part (h, shifted left by SHIFT) and a low part (l) into one
// 64-bit position. With hash == true the HASH marker bits are OR-ed in;
// otherwise this asserts that the packed value does not have all HASH bits set.
- static uint64_t make_fpos(unsigned frag, unsigned off) {
- return ((uint64_t)frag << SHIFT) | (uint64_t)off;
+ static uint64_t make_fpos(unsigned h, unsigned l, bool hash) {
+ uint64_t v = ((uint64_t)h<< SHIFT) | (uint64_t)l;
+ if (hash)
+ v |= HASH;
+ else
// A frag+name position must not be mistakable for a hash-order one.
+ assert((v & HASH) != HASH);
+ return v;
}
// Extract the high part of position p: the bits below END, shifted right by
// SHIFT. When all HASH bits are set in p, the value is passed through
// ceph_frag_value() before being returned.
// NOTE(review): presumably ceph_frag_value() drops the marker bits and leaves
// the 24-bit hash -- confirm against its definition.
- static unsigned fpos_frag(uint64_t p) {
- return (p & ~END) >> SHIFT;
+ static unsigned fpos_high(uint64_t p) {
+ unsigned v = (p & (END-1)) >> SHIFT;
+ if ((p & HASH) == HASH)
+ return ceph_frag_value(v);
+ return v;
}
// Extract the low part of position p (p masked with MASK, i.e. the low SHIFT bits).
- static unsigned fpos_off(uint64_t p) {
+ static unsigned fpos_low(uint64_t p) {
return p & MASK;
}
static int fpos_cmp(uint64_t l, uint64_t r) {
return fpos_low(l) < fpos_low(r) ? -1 : 1;
}
-
InodeRef inode;
int owner_uid;
int owner_gid;
- int64_t offset; // high bits: frag_t, low bits: an offset
+ int64_t offset; // hash order:
+ //   (0xff << 52) | ((24 bits hash) << 28) |
+ //   (n, for the nth entry sharing that hash);
+ // frag+name order:
+ //   ((frag value) << 28) | (n, for the nth entry in frag);
unsigned next_offset; // offset of next chunk (last_name's + 1)
string last_name; // last entry in previous chunk
explicit dir_result_t(Inode *in);
- frag_t frag() { return frag_t(offset >> SHIFT); }
- unsigned fragpos() { return offset & MASK; }
// High part of the current offset, as returned by fpos_high(offset).
+ unsigned offset_high() { return fpos_high(offset); }
// Low part of the current offset, as returned by fpos_low(offset).
+ unsigned offset_low() { return fpos_low(offset); }
- void next_frag() {
- frag_t fg = offset >> SHIFT;
- if (fg.is_rightmost())
- set_end();
- else
- set_frag(fg.next());
- }
- void set_frag(frag_t f) {
- offset = (uint64_t)f << SHIFT;
- assert(sizeof(offset) == 8);
- }
// Mark the position as end-of-directory by OR-ing the END bit into offset.
void set_end() { offset |= END; }
// True when the END bit is set in offset.
bool at_end() { return (offset & END); }
// Switch the offset to hash order by OR-ing the HASH marker bits into it.
+ void set_hash_order() { offset |= HASH; }
// True only when every HASH marker bit is set in offset.
+ bool hash_order() { return (offset & HASH) == HASH; }
+
// Report whether the buffered entries can serve the current offset.
// An empty buffer never qualifies. In hash order the buffered frag must
// contain the offset's high part; in frag+name order the buffered frag must
// equal the frag built from the offset's high part.
+ bool is_cached() {
+ if (buffer.empty())
+ return false;
+ if (hash_order()) {
+ return buffer_frag.contains(offset_high());
+ } else {
+ return buffer_frag == frag_t(offset_high());
+ }
+ }
+
void reset() {
last_name.clear();
at_cache_name.clear();