From e54aee1b5958337e574715931df47d2068aef29e Mon Sep 17 00:00:00 2001 From: sage Date: Fri, 1 Jul 2005 16:47:14 +0000 Subject: [PATCH] more efficient reads for striping git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@375 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/config.cc | 1 + ceph/osd/OSDMap.h | 68 ++++++++++++++++++++++++++++++++------------- ceph/osdc/Filer.cc | 69 ++++++++++++++++++++++++++++++++-------------- ceph/osdc/Filer.h | 5 ++-- 4 files changed, 101 insertions(+), 42 deletions(-) diff --git a/ceph/config.cc b/ceph/config.cc index fba8bf5897157..2f2f8c3b7af3b 100644 --- a/ceph/config.cc +++ b/ceph/config.cc @@ -27,6 +27,7 @@ OSDFileLayout g_OSD_MDDirLayout( 1<<14, 1<<2, 1<<19 ); // stripe mds log over 128 byte bits (see mds_log_pad_entry below to match!) OSDFileLayout g_OSD_MDLogLayout( 1<<7, 32, 1<<20 ); +//OSDFileLayout g_OSD_MDLogLayout( 127, 32, 1<<20 ); // pathological case to test striping buffer mapping //OSDFileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20 ); diff --git a/ceph/osd/OSDMap.h b/ceph/osd/OSDMap.h index ff98a5516e935..74e9a22e7d255 100644 --- a/ceph/osd/OSDMap.h +++ b/ceph/osd/OSDMap.h @@ -16,11 +16,9 @@ #include #include #include +#include using namespace std; -#include -using namespace __gnu_cxx; - /* * some system constants @@ -84,11 +82,15 @@ class OSDGroup { /** OSDExtent * for mapping (ino, offset, len) to a (list of) byte extents in objects on osds */ -struct OSDExtent { +class OSDExtent { + public: int osd; // (acting) primary osd object_t oid; // object id repgroup_t rg; // replica group size_t offset, len; // extent within the object + map extents_in_buffer; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) + + OSDExtent() : osd(0), oid(0), rg(0), offset(0), len(0) { } }; @@ -247,14 +249,18 @@ class OSDCluster { size_t len, size_t offset, list& extents) { + /* we want only one extent per object! + * this means that each extent we read may map into different bits of the + * final read buffer.. hence OSDExtent.extents_in_buffer + */ + map< object_t, OSDExtent > object_extents; + // layout constant size_t stripes_per_object = layout.object_size / layout.stripe_size; size_t cur = offset; size_t left = len; while (left > 0) { - OSDExtent ex; - // layout into objects size_t blockno = cur / layout.stripe_size; size_t stripeno = blockno / layout.stripe_count; @@ -262,29 +268,53 @@ class OSDCluster { size_t objectsetno = stripeno / stripes_per_object; size_t objectno = objectsetno * layout.stripe_count + stripepos; - // find oid, osds - ex.oid = file_to_object( ino, objectno ); - ex.rg = file_to_repgroup( ino, objectno ); - ex.osd = get_rg_acting_primary( ex.rg ); - + // find oid, extent + OSDExtent *ex = 0; + object_t oid = file_to_object( ino, objectno ); + if (object_extents.count(oid)) + ex = &object_extents[oid]; + else { + ex = &object_extents[oid]; + ex->oid = oid; + ex->rg = file_to_repgroup( ino, objectno ); + ex->osd = get_rg_acting_primary( ex->rg ); + } + // map range into object size_t block_start = (stripeno % stripes_per_object)*layout.stripe_size; size_t block_off = cur % layout.stripe_size; size_t max = layout.stripe_size - block_off; - ex.offset = block_start + block_off; + size_t x_offset = block_start + block_off; + size_t x_len; if (left > max) - ex.len = max; + x_len = max; else - ex.len = left; - + x_len = left; + + if (ex->offset + ex->len == x_offset) { + // add to extent + ex->len += x_len; + } else { + // new extent + assert(ex->len == 0); + assert(ex->offset == 0); + ex->offset = x_offset; + ex->len = x_len; + } + ex->extents_in_buffer[cur-offset] = x_len; + //cout << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... left " << left << endl; - left -= ex.len; - cur += ex.len; + left -= x_len; + cur += x_len; + } - // add it - extents.push_back(ex); + // make final list + for (map::iterator it = object_extents.begin(); + it != object_extents.end(); + it++) { + extents.push_back(it->second); } } diff --git a/ceph/osdc/Filer.cc b/ceph/osdc/Filer.cc index 571a784f10c96..f70ef3ff776c3 100644 --- a/ceph/osdc/Filer.cc +++ b/ceph/osdc/Filer.cc @@ -84,16 +84,15 @@ Filer::read(inodeno_t ino, p->onfinish = onfinish; // find data - list extents; - osdcluster->file_to_extents(ino, layout, len, offset, extents); + osdcluster->file_to_extents(ino, layout, len, offset, p->extents); - dout(7) << "osd read ino " << ino << " len " << len << " off " << offset << " in " << extents.size() << " extents" << endl; + dout(7) << "osd read ino " << ino << " len " << len << " off " << offset << " in " << p->extents.size() << " object extents" << endl; int nfrag = 0; off_t off = 0; - for (list::iterator it = extents.begin(); - it != extents.end(); + for (list::iterator it = p->extents.begin(); + it != p->extents.end(); it++) { int r = 0; // pick a replica last_tid++; @@ -107,10 +106,6 @@ Filer::read(inodeno_t ino, dout(15) << " read on " << last_tid << endl; messenger->send_message(m, MSG_ADDR_OSD(it->osd), 0); - // note offset into read buffer - p->read_off[last_tid] = off; - off += it->len; - // add to gather set p->outstanding_ops.insert(last_tid); op_reads[last_tid] = p; @@ -136,8 +131,7 @@ Filer::handle_osd_read_reply(MOSDOpReply *m) p->outstanding_ops.erase(tid); // what buffer offset are we? - size_t off = p->read_off[tid]; - dout(7) << "got frag at " << off << " len " << m->get_length() << ", still have " << p->outstanding_ops.size() << " more ops" << endl; + dout(7) << "got frag from " << m->get_oid() << " len " << m->get_length() << ", still have " << p->outstanding_ops.size() << " more ops" << endl; if (p->outstanding_ops.empty()) { // all done @@ -148,19 +142,45 @@ Filer::handle_osd_read_reply(MOSDOpReply *m) /** BUG this doesn't handle holes properly **/ // we have other fragments, assemble them all... blech! - p->read_data[off] = new bufferlist; - p->read_data[off]->claim( m->get_data() ); + p->read_data[m->get_oid()] = new bufferlist; + p->read_data[m->get_oid()]->claim( m->get_data() ); - // sort and string them together - for (map::iterator it = p->read_data.begin(); - it != p->read_data.end(); + // map extents back into buffer + map by_off; // buffer offset -> bufferlist + + for (list::iterator eit = p->extents.begin(); + eit != p->extents.end(); + eit++) { + bufferlist *ox_buf = p->read_data[eit->oid]; + int ox_off = 0; + + for (map::iterator bit = eit->extents_in_buffer.begin(); + bit != eit->extents_in_buffer.end(); + bit++) { + dout(10) << "object " << eit->oid << " extent (...) : offset " << ox_off << " -> buffer " << bit->first << " len " << bit->second << endl; + by_off[bit->first] = new bufferlist; + by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second); + ox_off += bit->second; + } + } + + // sort and string bits together + for (map::iterator it = by_off.begin(); + it != by_off.end(); it++) { - dout(10) << " frag off " << it->first << " len " << it->second->length() << endl; + dout(10) << " frag buffer off " << it->first << " len " << it->second->length() << endl; // FIXME: pad if hole? if (it->second->length()) p->read_result->claim_append(*(it->second)); delete it->second; } + + // hose p->read_data + for (map::iterator it = p->read_data.begin(); + it != p->read_data.end(); + it++) { + delete it->second; + } } else { dout(15) << " only one frag" << endl; // only one fragment, easy @@ -181,8 +201,8 @@ Filer::handle_osd_read_reply(MOSDOpReply *m) } } else { // store my bufferlist for later assembling - p->read_data[off] = new bufferlist; - p->read_data[off]->claim( m->get_data() ); + p->read_data[m->get_oid()] = new bufferlist; + p->read_data[m->get_oid()]->claim( m->get_data() ); } delete m; @@ -228,8 +248,17 @@ Filer::write(inodeno_t ino, m->set_length(it->len); m->set_offset(it->offset); + // map buffer segments into this extent + // (may be fragmented bc of striping) bufferlist cur; - cur.substr_of(bl, off, it->len); + for (map::iterator bit = it->extents_in_buffer.begin(); + bit != it->extents_in_buffer.end(); + bit++) { + bufferlist thisbit; + thisbit.substr_of(bl, bit->first, bit->second); + cur.claim_append(thisbit); + } + assert(cur.length() == it->len); m->set_data(cur); off += it->len; diff --git a/ceph/osdc/Filer.h b/ceph/osdc/Filer.h index 57663df15a195..38e73b51f1199 100644 --- a/ceph/osdc/Filer.h +++ b/ceph/osdc/Filer.h @@ -40,9 +40,8 @@ typedef __uint64_t tid_t; typedef struct { set outstanding_ops; size_t orig_offset; - - map read_off; - map read_data; // bits go here as they come back + list extents; + map read_data; // bits of data as they come back bufferlist *read_result; // eventaully condensed into here. -- 2.39.5