#include <vector>
#include <list>
#include <set>
+#include <map>
using namespace std;
-#include <ext/rope>
-using namespace __gnu_cxx;
-
/*
* some system constants
/** OSDExtent
* for mapping (ino, offset, len) to a (list of) byte extents in objects on osds
*/
-struct OSDExtent {
+class OSDExtent {
+ public:
int osd; // (acting) primary osd that requests for this extent are addressed to
object_t oid; // id of the object this extent lives in
repgroup_t rg; // replica group the object belongs to
size_t offset, len; // byte range (start offset, length) of the extent within the object
+ map<size_t, size_t> extents_in_buffer; // buffer offset -> length: the pieces of the caller's buffer that map onto this object extent, in ascending buffer-offset order (there may be several, because striping can scatter one object's bytes across the buffer)
+
+ OSDExtent() : osd(0), oid(0), rg(0), offset(0), len(0) { } // zero every scalar field; extents_in_buffer starts empty. The extent mapper asserts offset == 0 and len == 0 before first filling in an extent.
};
size_t len,
size_t offset,
list<OSDExtent>& extents) {
+ /* we want only one extent per object!
+ * this means that each extent we read may map into different bits of the
+ * final read buffer.. hence OSDExtent.extents_in_buffer
+ */
+ map< object_t, OSDExtent > object_extents;
+
// layout constant
size_t stripes_per_object = layout.object_size / layout.stripe_size;
size_t cur = offset;
size_t left = len;
while (left > 0) {
- OSDExtent ex;
-
// layout into objects
size_t blockno = cur / layout.stripe_size;
size_t stripeno = blockno / layout.stripe_count;
size_t objectsetno = stripeno / stripes_per_object;
size_t objectno = objectsetno * layout.stripe_count + stripepos;
- // find oid, osds
- ex.oid = file_to_object( ino, objectno );
- ex.rg = file_to_repgroup( ino, objectno );
- ex.osd = get_rg_acting_primary( ex.rg );
-
+ // find oid, extent
+ OSDExtent *ex = 0;
+ object_t oid = file_to_object( ino, objectno );
+ if (object_extents.count(oid))
+ ex = &object_extents[oid];
+ else {
+ ex = &object_extents[oid];
+ ex->oid = oid;
+ ex->rg = file_to_repgroup( ino, objectno );
+ ex->osd = get_rg_acting_primary( ex->rg );
+ }
+
// map range into object
size_t block_start = (stripeno % stripes_per_object)*layout.stripe_size;
size_t block_off = cur % layout.stripe_size;
size_t max = layout.stripe_size - block_off;
- ex.offset = block_start + block_off;
+ size_t x_offset = block_start + block_off;
+ size_t x_len;
if (left > max)
- ex.len = max;
+ x_len = max;
else
- ex.len = left;
-
+ x_len = left;
+
+ if (ex->offset + ex->len == x_offset) {
+ // add to extent
+ ex->len += x_len;
+ } else {
+ // new extent
+ assert(ex->len == 0);
+ assert(ex->offset == 0);
+ ex->offset = x_offset;
+ ex->len = x_len;
+ }
+ ex->extents_in_buffer[cur-offset] = x_len;
+
//cout << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... left " << left << endl;
- left -= ex.len;
- cur += ex.len;
+ left -= x_len;
+ cur += x_len;
+ }
- // add it
- extents.push_back(ex);
+ // make final list
+ for (map<object_t, OSDExtent>::iterator it = object_extents.begin();
+ it != object_extents.end();
+ it++) {
+ extents.push_back(it->second);
}
}
p->onfinish = onfinish;
// find data
- list<OSDExtent> extents;
- osdcluster->file_to_extents(ino, layout, len, offset, extents);
+ osdcluster->file_to_extents(ino, layout, len, offset, p->extents);
- dout(7) << "osd read ino " << ino << " len " << len << " off " << offset << " in " << extents.size() << " extents" << endl;
+ dout(7) << "osd read ino " << ino << " len " << len << " off " << offset << " in " << p->extents.size() << " object extents" << endl;
int nfrag = 0;
off_t off = 0;
- for (list<OSDExtent>::iterator it = extents.begin();
- it != extents.end();
+ for (list<OSDExtent>::iterator it = p->extents.begin();
+ it != p->extents.end();
it++) {
int r = 0; // pick a replica
last_tid++;
dout(15) << " read on " << last_tid << endl;
messenger->send_message(m, MSG_ADDR_OSD(it->osd), 0);
- // note offset into read buffer
- p->read_off[last_tid] = off;
- off += it->len;
-
// add to gather set
p->outstanding_ops.insert(last_tid);
op_reads[last_tid] = p;
p->outstanding_ops.erase(tid);
// what buffer offset are we?
- size_t off = p->read_off[tid];
- dout(7) << "got frag at " << off << " len " << m->get_length() << ", still have " << p->outstanding_ops.size() << " more ops" << endl;
+ dout(7) << "got frag from " << m->get_oid() << " len " << m->get_length() << ", still have " << p->outstanding_ops.size() << " more ops" << endl;
if (p->outstanding_ops.empty()) {
// all done
/** BUG this doesn't handle holes properly **/
// we have other fragments, assemble them all... blech!
- p->read_data[off] = new bufferlist;
- p->read_data[off]->claim( m->get_data() );
+ p->read_data[m->get_oid()] = new bufferlist;
+ p->read_data[m->get_oid()]->claim( m->get_data() );
- // sort and string them together
- for (map<off_t, bufferlist*>::iterator it = p->read_data.begin();
- it != p->read_data.end();
+ // map extents back into buffer
+ map<off_t, bufferlist*> by_off; // buffer offset -> bufferlist
+
+ for (list<OSDExtent>::iterator eit = p->extents.begin();
+ eit != p->extents.end();
+ eit++) {
+ bufferlist *ox_buf = p->read_data[eit->oid];
+ int ox_off = 0;
+
+ for (map<size_t,size_t>::iterator bit = eit->extents_in_buffer.begin();
+ bit != eit->extents_in_buffer.end();
+ bit++) {
+ dout(10) << "object " << eit->oid << " extent (...) : offset " << ox_off << " -> buffer " << bit->first << " len " << bit->second << endl;
+ by_off[bit->first] = new bufferlist;
+ by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second);
+ ox_off += bit->second;
+ }
+ }
+
+ // sort and string bits together
+ for (map<off_t, bufferlist*>::iterator it = by_off.begin();
+ it != by_off.end();
it++) {
- dout(10) << " frag off " << it->first << " len " << it->second->length() << endl;
+ dout(10) << " frag buffer off " << it->first << " len " << it->second->length() << endl;
// FIXME: pad if hole?
if (it->second->length())
p->read_result->claim_append(*(it->second));
delete it->second;
}
+
+ // hose p->read_data
+ for (map<object_t, bufferlist*>::iterator it = p->read_data.begin();
+ it != p->read_data.end();
+ it++) {
+ delete it->second;
+ }
} else {
dout(15) << " only one frag" << endl;
// only one fragment, easy
}
} else {
// store my bufferlist for later assembling
- p->read_data[off] = new bufferlist;
- p->read_data[off]->claim( m->get_data() );
+ p->read_data[m->get_oid()] = new bufferlist;
+ p->read_data[m->get_oid()]->claim( m->get_data() );
}
delete m;
m->set_length(it->len);
m->set_offset(it->offset);
+ // map buffer segments into this extent
+ // (may be fragmented bc of striping)
bufferlist cur;
- cur.substr_of(bl, off, it->len);
+ for (map<size_t,size_t>::iterator bit = it->extents_in_buffer.begin();
+ bit != it->extents_in_buffer.end();
+ bit++) {
+ bufferlist thisbit;
+ thisbit.substr_of(bl, bit->first, bit->second);
+ cur.claim_append(thisbit);
+ }
+ assert(cur.length() == it->len);
m->set_data(cur);
off += it->len;