From: sage Date: Tue, 28 Jun 2005 05:50:23 +0000 (+0000) Subject: file layout policies X-Git-Tag: v0.1~2030 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1084b9d2b14e765ba66cf6531e00765905df16c9;p=ceph.git file layout policies git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@351 29311d96-e01e-0410-9327-a35deaab8ce9 --- diff --git a/ceph/client/Client.cc b/ceph/client/Client.cc index b9dccc094ab..ebb761c4565 100644 --- a/ceph/client/Client.cc +++ b/ceph/client/Client.cc @@ -24,6 +24,10 @@ + + + + // cons/des Client::Client(MDCluster *mdc, int id, Messenger *m) @@ -1101,7 +1105,7 @@ int Client::read(fileh_t fh, char *buf, size_t size, off_t offset) C_Client_Cond *onfinish = new C_Client_Cond(&cond, &client_lock, &rvalue); - filer->read(in->inode.ino, size, offset, &blist, onfinish); + filer->read(in->inode.ino, g_OSD_FileLayout, size, offset, &blist, onfinish); cond.Wait(client_lock); @@ -1181,7 +1185,7 @@ int Client::write(fileh_t fh, const char *buf, size_t size, off_t offset) in->inflight_buffers.insert(blist); Context *onfinish = new C_Client_WriteBuffer( in, blist ); - filer->write(in->inode.ino, size, offset, *blist, 0, onfinish); + filer->write(in->inode.ino, g_OSD_FileLayout, size, offset, *blist, 0, onfinish); } else { // synchronous write @@ -1196,7 +1200,7 @@ int Client::write(fileh_t fh, const char *buf, size_t size, off_t offset) int rvalue; C_Client_Cond *onfinish = new C_Client_Cond(&cond, &client_lock, &rvalue); - filer->write(in->inode.ino, size, offset, blist, 0, onfinish); + filer->write(in->inode.ino, g_OSD_FileLayout, size, offset, blist, 0, onfinish); cond.Wait(client_lock); } diff --git a/ceph/common/Logger.cc b/ceph/common/Logger.cc index 5e06a851211..918b16b693a 100644 --- a/ceph/common/Logger.cc +++ b/ceph/common/Logger.cc @@ -124,7 +124,7 @@ void Logger::flush(bool force) // reset the counters for (vector::iterator it = type->inc_keys.begin(); it != type->inc_keys.end(); it++) - set(*it, 0); + this->vals[*it] = 0; } lock.Unlock(); diff --git a/ceph/config.cc b/ceph/config.cc index a162ad17910..0731857335f 100644 --- a/ceph/config.cc +++ b/ceph/config.cc @@ -1,5 +1,7 @@ #include "include/config.h" +#include "osd/OSDCluster.h" + //#define MDS_CACHE_SIZE 4*10000 -> <20mb //#define MDS_CACHE_SIZE 80000 62mb @@ -15,6 +17,16 @@ long buffer_total_alloc = 0; +//OSDFileLayout g_OSD_FileLayout( 1<<20, 1, 1<<20 ); // stripe files over whole objects +OSDFileLayout g_OSD_FileLayout( 1<<17, 4, 1<<20 ); // 128k stripes over sets of 4 + +// ?? +OSDFileLayout g_OSD_MDDirLayout( 1<<14, 1<<2, 1<<19 ); + +// stripe mds log over 128 byte bits (see mds_log_pad_entry below to match!) +OSDFileLayout g_OSD_MDLogLayout( 1<<7, 1<<3, 1<<20 ); + + md_config_t g_conf = { num_mds: 2, @@ -50,6 +62,7 @@ md_config_t g_conf = { mds_log_max_len: MDS_CACHE_SIZE / 3, mds_log_max_trimming: 256, mds_log_read_inc: 65536, + mds_log_pad_entry: 64, mds_log_before_reply: true, mds_log_flush_on_shutdown: true, diff --git a/ceph/config.h b/ceph/config.h index f192128727d..efec3620370 100644 --- a/ceph/config.h +++ b/ceph/config.h @@ -1,6 +1,10 @@ #ifndef __CONFIG_H #define __CONFIG_H +extern class OSDFileLayout g_OSD_FileLayout; +extern class OSDFileLayout g_OSD_MDDirLayout; +extern class OSDFileLayout g_OSD_MDLogLayout; + struct md_config_t { int num_mds; int num_osd; @@ -36,6 +40,7 @@ struct md_config_t { int mds_log_max_len; int mds_log_max_trimming; int mds_log_read_inc; + int mds_log_pad_entry; bool mds_log_before_reply; bool mds_log_flush_on_shutdown; diff --git a/ceph/mds/AnchorTable.cc b/ceph/mds/AnchorTable.cc index 94e41e9cf7e..a5bd7eec8f2 100644 --- a/ceph/mds/AnchorTable.cc +++ b/ceph/mds/AnchorTable.cc @@ -351,6 +351,7 @@ void AnchorTable::save(Context *onfinish) bufferlist bl; bl.append(tab.c_str(), tab.length()); mds->filer->write(MDS_INO_ANCHORTABLE+mds->get_nodeid(), + g_OSD_FileLayout, bl.length(), 0, bl, 0, onfinish); @@ -394,6 +395,7 @@ public: if (r > 0 && size > 0) { C_AT_Load *c = new C_AT_Load(size, at, onfinish); mds->filer->read(MDS_INO_ANCHORTABLE+mds->get_nodeid(), + g_OSD_FileLayout, size, sizeof(size), &c->bl, c); @@ -413,6 +415,7 @@ void AnchorTable::load(Context *onfinish) C_AT_LoadSize *c = new C_AT_LoadSize(this, mds, onfinish); mds->filer->read(MDS_INO_ANCHORTABLE+mds->get_nodeid(), + g_OSD_FileLayout, sizeof(size_t), 0, &c->bl, c); diff --git a/ceph/mds/IdAllocator.cc b/ceph/mds/IdAllocator.cc index c609cb62646..3fcce8515e5 100644 --- a/ceph/mds/IdAllocator.cc +++ b/ceph/mds/IdAllocator.cc @@ -89,6 +89,7 @@ void IdAllocator::save(Context *onfinish) // write (async) mds->filer->write(MDS_INO_IDS_OFFSET + mds->get_nodeid(), + g_OSD_FileLayout, data.length(), 0, bl, @@ -138,7 +139,8 @@ void IdAllocator::load(Context *onfinish) opening = true; mds->filer->read(MDS_INO_IDS_OFFSET + mds->get_nodeid(), - FILE_OBJECT_SIZE, + g_OSD_FileLayout, + g_OSD_FileLayout.stripe_size, 0, &c->bl, c); diff --git a/ceph/mds/LogEvent.h b/ceph/mds/LogEvent.h index fe4173e8a02..371b350821d 100644 --- a/ceph/mds/LogEvent.h +++ b/ceph/mds/LogEvent.h @@ -12,6 +12,9 @@ using namespace std; #define EVENT_UNLINK 3 #define EVENT_ALLOC 4 +#include "include/config.h" + + // generic log event class LogEvent { private: @@ -37,7 +40,21 @@ class LogEvent { // payload encode_payload(bl); + + // HACK: pad payload to match md log layout? + int elen = bl.length() - off + sizeof(_type); + if (elen % g_conf.mds_log_pad_entry > 0) { + int add = g_conf.mds_log_pad_entry - (elen % g_conf.mds_log_pad_entry); + //cout << "elen " << elen << " adding " << add << endl; + buffer *b = new buffer(add); + memset(b->c_str(), 0, add); + b->set_length(add); + bufferptr bp(b); + bl.append(bp); + } + len = bl.length() - off - sizeof(len); + bl.copy_in(off, sizeof(len), (char*)&len); } diff --git a/ceph/mds/LogStream.cc b/ceph/mds/LogStream.cc index aafaeabed12..5a0863bd95f 100644 --- a/ceph/mds/LogStream.cc +++ b/ceph/mds/LogStream.cc @@ -18,6 +18,10 @@ using namespace std; #define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds_log) cout << "mds" << mds->get_nodeid() << ".logstream " + + + + // ---------------------------- // writing @@ -40,6 +44,7 @@ off_t LogStream::append(LogEvent *e) // serialize FIXME ******** bufferlist bl; e->encode(bl); + size_t elen = bl.length(); // append @@ -47,6 +52,7 @@ off_t LogStream::append(LogEvent *e) off_t off = append_pos; append_pos += elen; + //dout(15) << "write buf was " << write_buf.length() << " bl " << write_buf << endl; write_buf.claim_append(bl); //dout(15) << "write buf now " << write_buf.length() << " bl " << write_buf << endl; @@ -56,7 +62,7 @@ off_t LogStream::append(LogEvent *e) void LogStream::_append_2(off_t off) { - dout(15) << "sync_pos now " << off << endl; + dout(15) << "sync_pos now " << off << " skew " << off % g_conf.mds_log_pad_entry << endl; sync_pos = off; // discard written bufferlist @@ -107,6 +113,7 @@ void LogStream::flush() // write it mds->filer->write(log_ino, + g_OSD_MDLogLayout, writing_buffers[flush_pos]->length(), flush_pos, *writing_buffers[flush_pos], 0, @@ -174,6 +181,7 @@ LogEvent *LogStream::get_next_event() // decode le->decode_payload(read_buf, off); + off = sizeof(type) + sizeof(length) + length; // advance past any padding that wasn't decoded.. // discard front of read_buf read_pos += off; @@ -218,6 +226,7 @@ void LogStream::wait_for_next_event(Context *c) dout(15) << "wait_for_next_event reading from pos " << tail << " len " << size << endl; C_LS_ReadChunk *readc = new C_LS_ReadChunk(this); mds->filer->read(log_ino, + g_OSD_MDLogLayout, g_conf.mds_log_read_inc, tail, &readc->bl, readc); diff --git a/ceph/mds/MDCache.cc b/ceph/mds/MDCache.cc index 4fdc1605c1d..ea5a94dfaf1 100644 --- a/ceph/mds/MDCache.cc +++ b/ceph/mds/MDCache.cc @@ -483,7 +483,9 @@ bool MDCache::trim(__int32_t max) { // last link? if (in->inode.nlink == 0) { dout(7) << "last link, destroying inode " << *in << endl; // FIXME THIS IS WRONG PLACE FOR THIS! - mds->filer->remove(in->ino(), in->inode.size, + mds->filer->remove(in->ino(), + g_OSD_FileLayout, + in->inode.size, NULL); // FIXME } diff --git a/ceph/mds/MDStore.cc b/ceph/mds/MDStore.cc index 4e46214b78b..872059b9753 100644 --- a/ceph/mds/MDStore.cc +++ b/ceph/mds/MDStore.cc @@ -23,6 +23,8 @@ using namespace std; + + void MDStore::proc_message(Message *m) { switch (m->get_type()) { @@ -409,6 +411,7 @@ void MDStore::do_commit_dir( CDir *dir, // submit to osd mds->filer->write( dir->ino(), + g_OSD_MDDirLayout, bl.length(), 0, bl, 0, //OSD_OP_FLAGS_TRUNCATE, // truncate file/object after end of this write @@ -544,6 +547,7 @@ class MDDoFetchDirContext : public Context { MDDoFetchDirContext *fin = new MDDoFetchDirContext( mds, ino, context, hashcode ); fin->bl.claim( bl ); mds->filer->read(ino, + g_OSD_MDDirLayout, size - got, bl.length(), &fin->bl2, fin ); @@ -565,8 +569,10 @@ void MDStore::do_fetch_dir( CDir *dir, // read first bit mds->filer->read(dir->ino(), - FILE_OBJECT_SIZE, 0, // get first object's bit + g_OSD_MDDirLayout, + //FILE_OBJECT_SIZE, 0, // get first object's bit //16, 0, // just get front bit + g_OSD_MDDirLayout.stripe_size, 0, // grab first stripe bit (better be more than 16 bytes!) &fin->bl, fin ); } diff --git a/ceph/osd/OSDMap.h b/ceph/osd/OSDMap.h index d16a6ab9a5d..ff98a5516e9 100644 --- a/ceph/osd/OSDMap.h +++ b/ceph/osd/OSDMap.h @@ -29,12 +29,28 @@ using namespace __gnu_cxx; #define NUM_RUSH_REPLICAS 4 // this should be big enough to cope w/ failing disks. #define MAX_REPLICAS 3 -#define FILE_OBJECT_SIZE (1<<20) // 1 MB object size +//#define FILE_OBJECT_SIZE (1<<20) // 1 MB object size -#define OID_BLOCK_BITS 30 // 1mb * 10^9 = 1 petabyte files +#define OID_ONO_BITS 30 // 1mb * 10^9 = 1 petabyte files #define OID_INO_BITS (64-30) // 2^34 =~ 16 billion files -#define MAX_FILE_SIZE (FILE_OBJECT_SIZE << OID_BLOCK_BITS) // 1 PB +//#define MAX_FILE_SIZE (FILE_OBJECT_SIZE << OID_ONO_BITS) // 1 PB + + +/** OSDFileLayout + * specifies a striping strategy + */ + +class OSDFileLayout { + public: + int stripe_size; // stripe unit, in bytes + int stripe_count; // over this many objects + int object_size; // until objects are this big, then use a new set of objects. + + OSDFileLayout(int ss, int sc, int os) : + stripe_size(ss), stripe_count(sc), object_size(os) { } +}; + /** OSDGroup @@ -155,10 +171,10 @@ class OSDCluster { /* map (ino, blockno) into a replica group */ repgroup_t file_to_repgroup(inodeno_t ino, - size_t blockno) { + size_t ono) { // something simple for now - // hash this eventually - return (ino+blockno) % NUM_REPLICA_GROUPS; + // hash this eventually! + return (ino+ono) % NUM_REPLICA_GROUPS; } @@ -174,13 +190,13 @@ class OSDCluster { } - /* map (ino, block) to an object name + /* map (ino, ono) to an object name (to be used on any osd in the proper replica group) */ object_t file_to_object(inodeno_t ino, - size_t blockno) { + size_t ono) { assert(ino < (1LL<& extents) { + // layout constant + size_t stripes_per_object = layout.object_size / layout.stripe_size; + size_t cur = offset; size_t left = len; while (left > 0) { OSDExtent ex; - + + // layout into objects + size_t blockno = cur / layout.stripe_size; + size_t stripeno = blockno / layout.stripe_count; + size_t stripepos = blockno % layout.stripe_count; + size_t objectsetno = stripeno / stripes_per_object; + size_t objectno = objectsetno * layout.stripe_count + stripepos; + // find oid, osds - size_t blockno = cur / FILE_OBJECT_SIZE; - ex.oid = file_to_object( ino, blockno ); - ex.rg = file_to_repgroup(ino, blockno ); + ex.oid = file_to_object( ino, objectno ); + ex.rg = file_to_repgroup( ino, objectno ); ex.osd = get_rg_acting_primary( ex.rg ); - + // map range into object - ex.offset = cur % FILE_OBJECT_SIZE; - if (left + ex.offset > FILE_OBJECT_SIZE) - ex.len = FILE_OBJECT_SIZE - ex.offset; // doesn't fully fit + size_t block_start = (stripeno % stripes_per_object)*layout.stripe_size; + size_t block_off = cur % layout.stripe_size; + size_t max = layout.stripe_size - block_off; + + ex.offset = block_start + block_off; + if (left > max) + ex.len = max; else - ex.len = left; // fits! + ex.len = left; + + //cout << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... left " << left << endl; left -= ex.len; cur += ex.len; diff --git a/ceph/osdc/Filer.cc b/ceph/osdc/Filer.cc index 25a80cfe3bc..9d41b971ae0 100644 --- a/ceph/osdc/Filer.cc +++ b/ceph/osdc/Filer.cc @@ -70,6 +70,7 @@ void Filer::send_outgoing() int Filer::read(inodeno_t ino, + OSDFileLayout& layout, size_t len, size_t offset, bufferlist *bl, @@ -84,7 +85,7 @@ Filer::read(inodeno_t ino, // find data list extents; - osdcluster->file_to_extents(ino, len, offset, extents); + osdcluster->file_to_extents(ino, layout, len, offset, extents); dout(7) << "osd read ino " << ino << " len " << len << " off " << offset << " in " << extents.size() << " extents" << endl; @@ -193,6 +194,7 @@ Filer::handle_osd_read_reply(MOSDOpReply *m) int Filer::write(inodeno_t ino, + OSDFileLayout& layout, size_t len, size_t offset, bufferlist& bl, @@ -207,7 +209,7 @@ Filer::write(inodeno_t ino, // find data list extents; - osdcluster->file_to_extents(ino, len, offset, extents); + osdcluster->file_to_extents(ino, layout, len, offset, extents); dout(7) << "osd write ino " << ino << " len " << len << " off " << offset << " in " << extents.size() << " extents" << endl; @@ -341,7 +343,10 @@ Filer::handle_osd_op_reply(MOSDOpReply *m) } -int Filer::remove(inodeno_t ino, size_t size, Context *onfinish) +int Filer::remove(inodeno_t ino, + OSDFileLayout& layout, + size_t size, + Context *onfinish) { // pending write record PendingOSDOp_t *p = new PendingOSDOp_t; @@ -349,7 +354,7 @@ int Filer::remove(inodeno_t ino, size_t size, Context *onfinish) // find data list extents; - osdcluster->file_to_extents(ino, size, 0, extents); + osdcluster->file_to_extents(ino, layout, size, 0, extents); dout(7) << "osd remove ino " << ino << " size " << size << " in " << extents.size() << " extents" << endl; @@ -384,7 +389,10 @@ int Filer::remove(inodeno_t ino, size_t size, Context *onfinish) } -int Filer::probe_size(inodeno_t ino, size_t *size, Context *onfinish) +int Filer::probe_size(inodeno_t ino, + OSDFileLayout& layout, + size_t *size, + Context *onfinish) { PendingOSDProbe_t *p = new PendingOSDProbe_t; p->final_size = size; diff --git a/ceph/osdc/Filer.h b/ceph/osdc/Filer.h index edda0b7a851..57663df15a1 100644 --- a/ceph/osdc/Filer.h +++ b/ceph/osdc/Filer.h @@ -24,6 +24,7 @@ using namespace __gnu_cxx; #include "include/types.h" #include "msg/Dispatcher.h" +#include "OSDCluster.h" class Context; class Messenger; @@ -92,20 +93,26 @@ class Filer : public Dispatcher { // osd fun int read(inodeno_t ino, + OSDFileLayout& layout, size_t len, size_t offset, bufferlist *bl, // ptr to data Context *c); int write(inodeno_t ino, + OSDFileLayout& layout, size_t len, size_t offset, bufferlist& bl, int flags, Context *c); - int probe_size(inodeno_t ino, size_t *size, Context *c); - int remove(inodeno_t ino, size_t size, Context *c); + int probe_size(inodeno_t ino, + OSDFileLayout& layout, + size_t *size, Context *c); + int remove(inodeno_t ino, + OSDFileLayout& layout, + size_t size, Context *c); //int zero(inodeno_t ino, size_t len, size_t offset, Context *c);