git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
file layout policies
author sage <sage@29311d96-e01e-0410-9327-a35deaab8ce9>
Tue, 28 Jun 2005 05:50:23 +0000 (05:50 +0000)
committer sage <sage@29311d96-e01e-0410-9327-a35deaab8ce9>
Tue, 28 Jun 2005 05:50:23 +0000 (05:50 +0000)
git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@351 29311d96-e01e-0410-9327-a35deaab8ce9

13 files changed:
ceph/client/Client.cc
ceph/common/Logger.cc
ceph/config.cc
ceph/config.h
ceph/mds/AnchorTable.cc
ceph/mds/IdAllocator.cc
ceph/mds/LogEvent.h
ceph/mds/LogStream.cc
ceph/mds/MDCache.cc
ceph/mds/MDStore.cc
ceph/osd/OSDMap.h
ceph/osdc/Filer.cc
ceph/osdc/Filer.h

index b9dccc094ab660ca76a956ea260a9b8e3bdac088..ebb761c45652edf5e4f41564cf3b425b2c93a59e 100644 (file)
 
 
 
+
+
+
+
 // cons/des
 
 Client::Client(MDCluster *mdc, int id, Messenger *m)
@@ -1101,7 +1105,7 @@ int Client::read(fileh_t fh, char *buf, size_t size, off_t offset)
        
        C_Client_Cond *onfinish = new C_Client_Cond(&cond, &client_lock, &rvalue);
        
-       filer->read(in->inode.ino, size, offset, &blist, onfinish);
+       filer->read(in->inode.ino, g_OSD_FileLayout, size, offset, &blist, onfinish);
        
        cond.Wait(client_lock);
 
@@ -1181,7 +1185,7 @@ int Client::write(fileh_t fh, const char *buf, size_t size, off_t offset)
        in->inflight_buffers.insert(blist);
 
        Context *onfinish = new C_Client_WriteBuffer( in, blist );
-       filer->write(in->inode.ino, size, offset, *blist, 0, onfinish);
+       filer->write(in->inode.ino, g_OSD_FileLayout, size, offset, *blist, 0, onfinish);
 
   } else {
        // synchronous write
@@ -1196,7 +1200,7 @@ int Client::write(fileh_t fh, const char *buf, size_t size, off_t offset)
        int rvalue;
        
        C_Client_Cond *onfinish = new C_Client_Cond(&cond, &client_lock, &rvalue);
-       filer->write(in->inode.ino, size, offset, blist, 0, onfinish);
+       filer->write(in->inode.ino, g_OSD_FileLayout, size, offset, blist, 0, onfinish);
        
        cond.Wait(client_lock);
   }
index 5e06a8512112ccf79427fe7da642f07fdf1ae283..918b16b693a151023e8bcd1942edac13b868eec8 100644 (file)
@@ -124,7 +124,7 @@ void Logger::flush(bool force)
 
        // reset the counters
        for (vector<string>::iterator it = type->inc_keys.begin(); it != type->inc_keys.end(); it++) 
-         set(*it, 0);
+         this->vals[*it] = 0;
   }
 
   lock.Unlock();
index a162ad17910a5407bce75141be88d578921499dc..0731857335f5e2ceb83c66d5edd69d20a8512eac 100644 (file)
@@ -1,5 +1,7 @@
 
 #include "include/config.h"
+#include "osd/OSDCluster.h"
+
 
 //#define MDS_CACHE_SIZE        4*10000   -> <20mb
 //#define MDS_CACHE_SIZE        80000         62mb
 long buffer_total_alloc = 0;
 
 
+//OSDFileLayout g_OSD_FileLayout( 1<<20, 1, 1<<20 );   // stripe files over whole objects
+OSDFileLayout g_OSD_FileLayout( 1<<17, 4, 1<<20 );   // 128k stripes over sets of 4
+
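+// mds directory objects: 16k stripes over sets of 4 objects, 512k per object (values look provisional; original note was "??")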
+OSDFileLayout g_OSD_MDDirLayout( 1<<14, 1<<2, 1<<19 );
+
+// stripe mds log over 128-byte units (keep mds_log_pad_entry below compatible; NOTE(review): it is set to 64 here, not 128 -- confirm)
+OSDFileLayout g_OSD_MDLogLayout( 1<<7, 1<<3, 1<<20 );
+
+
 
 md_config_t g_conf = {
   num_mds: 2,
@@ -50,6 +62,7 @@ md_config_t g_conf = {
   mds_log_max_len:  MDS_CACHE_SIZE / 3,
   mds_log_max_trimming: 256,
   mds_log_read_inc: 65536,
+  mds_log_pad_entry: 64,
   mds_log_before_reply: true,
   mds_log_flush_on_shutdown: true,
 
index f192128727dc7db726bfb7da54eebbb14d451fdc..efec3620370849b58f095e5b1e8e4e51e6447a51 100644 (file)
@@ -1,6 +1,10 @@
 #ifndef __CONFIG_H
 #define __CONFIG_H
 
+extern class OSDFileLayout g_OSD_FileLayout;
+extern class OSDFileLayout g_OSD_MDDirLayout;
+extern class OSDFileLayout g_OSD_MDLogLayout;
+
 struct md_config_t {
   int  num_mds;
   int  num_osd;
@@ -36,6 +40,7 @@ struct md_config_t {
   int mds_log_max_len;
   int mds_log_max_trimming;
   int mds_log_read_inc;
+  int mds_log_pad_entry;
   bool  mds_log_before_reply;
   bool  mds_log_flush_on_shutdown;
   
index 94e41e9cf7e9bfb2392546c6d229a6f4bff1bf9b..a5bd7eec8f2e0770064101e1ba6df3caad201e09 100644 (file)
@@ -351,6 +351,7 @@ void AnchorTable::save(Context *onfinish)
   bufferlist bl;
   bl.append(tab.c_str(), tab.length());
   mds->filer->write(MDS_INO_ANCHORTABLE+mds->get_nodeid(),
+                                       g_OSD_FileLayout,
                                        bl.length(), 0,
                                        bl, 0, 
                                        onfinish);
@@ -394,6 +395,7 @@ public:
        if (r > 0 && size > 0) {
          C_AT_Load *c = new C_AT_Load(size, at, onfinish);
          mds->filer->read(MDS_INO_ANCHORTABLE+mds->get_nodeid(),
+                                          g_OSD_FileLayout,
                                           size, sizeof(size),
                                           &c->bl,
                                           c);
@@ -413,6 +415,7 @@ void AnchorTable::load(Context *onfinish)
   
   C_AT_LoadSize *c = new C_AT_LoadSize(this, mds, onfinish);
   mds->filer->read(MDS_INO_ANCHORTABLE+mds->get_nodeid(),
+                                  g_OSD_FileLayout,
                                   sizeof(size_t), 0,
                                   &c->bl,
                                   c);
index c609cb62646b4273042d4e460b72832444e283b8..3fcce8515e5e29062e8127127042c98f0bd38283 100644 (file)
@@ -89,6 +89,7 @@ void IdAllocator::save(Context *onfinish)
 
   // write (async)
   mds->filer->write(MDS_INO_IDS_OFFSET + mds->get_nodeid(),
+                                       g_OSD_FileLayout,
                                        data.length(),
                                        0,
                                        bl,
@@ -138,7 +139,8 @@ void IdAllocator::load(Context *onfinish)
   opening = true;
 
   mds->filer->read(MDS_INO_IDS_OFFSET + mds->get_nodeid(),
-                                  FILE_OBJECT_SIZE,
+                                  g_OSD_FileLayout,
+                                  g_OSD_FileLayout.stripe_size,
                                   0,
                                   &c->bl,
                                   c);
index fe4173e8a021d7b4dda5db2f821c855b9f0883d0..371b350821d1cab960d28d72bedf7fd05e3b2b3e 100644 (file)
@@ -12,6 +12,9 @@ using namespace std;
 #define EVENT_UNLINK       3
 #define EVENT_ALLOC        4
 
+#include "include/config.h"
+
+
 // generic log event
 class LogEvent {
  private:
@@ -37,7 +40,21 @@ class LogEvent {
 
        // payload
        encode_payload(bl);
+
+       // HACK: pad payload to match md log layout?
+       int elen = bl.length() - off + sizeof(_type);
+       if (elen % g_conf.mds_log_pad_entry > 0) {
+         int add = g_conf.mds_log_pad_entry - (elen % g_conf.mds_log_pad_entry);
+         //cout << "elen " << elen << "  adding " << add << endl;
+         buffer *b = new buffer(add);
+         memset(b->c_str(), 0, add);
+         b->set_length(add);
+         bufferptr bp(b);
+         bl.append(bp);
+       } 
+
        len = bl.length() - off - sizeof(len);
+
        bl.copy_in(off, sizeof(len), (char*)&len);
   }
   
index aafaeabed12178a0e78e0c9bb7a8d2bb99f6a5d7..5a0863bd95f9a93c00f2fd534f58e4a916f13fd3 100644 (file)
@@ -18,6 +18,10 @@ using namespace std;
 #define  dout(l)    if (l<=g_conf.debug || l<=g_conf.debug_mds_log) cout << "mds" << mds->get_nodeid() << ".logstream "
 
 
+
+
+
+
 // ----------------------------
 // writing
 
@@ -40,6 +44,7 @@ off_t LogStream::append(LogEvent *e)
   // serialize FIXME  ********
   bufferlist bl;
   e->encode(bl);
+
   size_t elen = bl.length();
   
   // append
@@ -47,6 +52,7 @@ off_t LogStream::append(LogEvent *e)
   
   off_t off = append_pos;
   append_pos += elen;
+  
   //dout(15) << "write buf was " << write_buf.length() << " bl " << write_buf << endl;
   write_buf.claim_append(bl);
   //dout(15) << "write buf now " << write_buf.length() << " bl " << write_buf << endl;
@@ -56,7 +62,7 @@ off_t LogStream::append(LogEvent *e)
 
 void LogStream::_append_2(off_t off)
 {
-  dout(15) << "sync_pos now " << off << endl;
+  dout(15) << "sync_pos now " << off << " skew " << off % g_conf.mds_log_pad_entry << endl;
   sync_pos = off;
 
   // discard written bufferlist
@@ -107,6 +113,7 @@ void LogStream::flush()
 
        // write it
        mds->filer->write(log_ino, 
+                                         g_OSD_MDLogLayout,
                                          writing_buffers[flush_pos]->length(), flush_pos,
                                          *writing_buffers[flush_pos],
                                          0,
@@ -174,6 +181,7 @@ LogEvent *LogStream::get_next_event()
 
   // decode
   le->decode_payload(read_buf, off);
+  off = sizeof(type) + sizeof(length) + length;  // advance past any padding that wasn't decoded..
 
   // discard front of read_buf
   read_pos += off;
@@ -218,6 +226,7 @@ void LogStream::wait_for_next_event(Context *c)
   dout(15) << "wait_for_next_event reading from pos " << tail << " len " << size << endl;
   C_LS_ReadChunk *readc = new C_LS_ReadChunk(this);
   mds->filer->read(log_ino,  
+                                  g_OSD_MDLogLayout,
                                   g_conf.mds_log_read_inc, tail,
                                   &readc->bl,
                                   readc);
index 4fdc1605c1dddf784ed54af82b28a5ae5e0a3be8..ea5a94dfaf15084c76ee8094d0cd808b5285bbfd 100644 (file)
@@ -483,7 +483,9 @@ bool MDCache::trim(__int32_t max) {
        // last link?
        if (in->inode.nlink == 0) {
          dout(7) << "last link, destroying inode " << *in << endl;             // FIXME THIS IS WRONG PLACE FOR THIS!
-         mds->filer->remove(in->ino(), in->inode.size, 
+         mds->filer->remove(in->ino(), 
+                                                g_OSD_FileLayout,
+                                                in->inode.size, 
                                                 NULL);   // FIXME
        }
 
index 4e46214b78ba70749551e592482db77af1c5997f..872059b97536d6336e98eb3af3daf544777b0d85 100644 (file)
@@ -23,6 +23,8 @@ using namespace std;
 
 
 
+
+
 void MDStore::proc_message(Message *m)
 {
   switch (m->get_type()) {
@@ -409,6 +411,7 @@ void MDStore::do_commit_dir( CDir *dir,
   
   // submit to osd
   mds->filer->write( dir->ino(),
+                                        g_OSD_MDDirLayout,
                                         bl.length(), 0,
                                         bl,
                                         0, //OSD_OP_FLAGS_TRUNCATE, // truncate file/object after end of this write
@@ -544,6 +547,7 @@ class MDDoFetchDirContext : public Context {
          MDDoFetchDirContext *fin = new MDDoFetchDirContext( mds, ino, context, hashcode );
          fin->bl.claim( bl );
          mds->filer->read(ino,
+                                          g_OSD_MDDirLayout,
                                           size - got, bl.length(),
                                           &fin->bl2,
                                           fin );
@@ -565,8 +569,10 @@ void MDStore::do_fetch_dir( CDir *dir,
 
   // read first bit
   mds->filer->read(dir->ino(),
-                                  FILE_OBJECT_SIZE, 0,  // get first object's bit
+                                  g_OSD_MDDirLayout,
+                                  //FILE_OBJECT_SIZE, 0,  // get first object's bit
                                   //16, 0,  // just get front bit
+                                  g_OSD_MDDirLayout.stripe_size, 0,  // grab first stripe bit (better be more than 16 bytes!)
                                   &fin->bl,
                                   fin );
 }
index d16a6ab9a5da4527db452ac6a74e8a442f6f816b..ff98a5516e935197039de6c8c7e997b067e1ad73 100644 (file)
@@ -29,12 +29,28 @@ using namespace __gnu_cxx;
 #define NUM_RUSH_REPLICAS         4   // this should be big enough to cope w/ failing disks.
 #define MAX_REPLICAS              3
 
-#define FILE_OBJECT_SIZE     (1<<20)  // 1 MB object size
+//#define FILE_OBJECT_SIZE     (1<<20)  // 1 MB object size
 
-#define OID_BLOCK_BITS     30       // 1mb * 10^9 = 1 petabyte files
+#define OID_ONO_BITS       30       // 1mb * 10^9 = 1 petabyte files
 #define OID_INO_BITS       (64-30)  // 2^34 =~ 16 billion files
 
-#define MAX_FILE_SIZE      (FILE_OBJECT_SIZE << OID_BLOCK_BITS)  // 1 PB
+//#define MAX_FILE_SIZE      (FILE_OBJECT_SIZE << OID_ONO_BITS)  // 1 PB
+
+
+/** OSDFileLayout 
+ * specifies a striping strategy: OSDCluster::file_to_extents cuts a file into stripe_size-byte units, deals them round-robin over stripe_count objects, and moves to a fresh set of objects once each holds object_size / stripe_size units.
+ */
+
+class OSDFileLayout {
+ public:
+  int stripe_size;     // stripe unit, in bytes; used as a divisor by file_to_extents, so must be non-zero
+  int stripe_count;    // number of objects each stripe is spread over; also a divisor, so must be non-zero
+  int object_size;     // bytes per object before a new set of stripe_count objects is started
+
+  OSDFileLayout(int ss, int sc, int os) :   // ss = stripe_size, sc = stripe_count, os = object_size; stored as given, no validation
+       stripe_size(ss), stripe_count(sc), object_size(os) { }
+};
+
 
 
 /** OSDGroup
@@ -155,10 +171,10 @@ class OSDCluster {
 
   /* map (ino, blockno) into a replica group */
   repgroup_t file_to_repgroup(inodeno_t ino, 
-                                                         size_t blockno) {
+                                                         size_t ono) {   // ono: object number within the file (file_to_extents passes its objectno)
        // something simple for now
-       // hash this eventually
+       // hash this eventually!
-       return (ino+blockno) % NUM_REPLICA_GROUPS;
+       return (ino+ono) % NUM_REPLICA_GROUPS;   // plain modulo placement: consecutive objects of one file land in consecutive groups
   }
 
 
@@ -174,13 +190,13 @@ class OSDCluster {
   }
 
 
-  /* map (ino, block) to an object name
+  /* map (ino, ono) to an object name
         (to be used on any osd in the proper replica group) */
   object_t file_to_object(inodeno_t ino,
-                                                 size_t    blockno) {  
+                                                 size_t    ono) {  
        assert(ino < (1LL<<OID_INO_BITS));       // legal ino can't be too big
-       assert(blockno < (1LL<<OID_BLOCK_BITS));
-       return (ino << OID_INO_BITS) + blockno;
+       assert(ono < (1LL<<OID_ONO_BITS));   // object number must fit in the low OID_ONO_BITS bits of the id
+       return (ino << OID_INO_BITS) + ono;   // NOTE(review): ono occupies the low OID_ONO_BITS (30) bits, so the shift looks like it should be OID_ONO_BITS; shifting by OID_INO_BITS (34) would overflow a 64-bit id for any ino >= 2^30 -- confirm
   }
 
   
@@ -227,26 +243,42 @@ class OSDCluster {
   /* map (ino, offset, len) to a (list of) OSDExtents 
         (byte ranges in objects on osds) */
   void file_to_extents(inodeno_t ino,
+                                          OSDFileLayout& layout,
                                           size_t len,
                                           size_t offset,
                                           list<OSDExtent>& extents) {
+       // layout constant
+       size_t stripes_per_object = layout.object_size / layout.stripe_size;
+
        size_t cur = offset;
        size_t left = len;
        while (left > 0) {
          OSDExtent ex;
-         
+
+         // layout into objects
+         size_t blockno = cur / layout.stripe_size;
+         size_t stripeno = blockno / layout.stripe_count;
+         size_t stripepos = blockno % layout.stripe_count;
+         size_t objectsetno = stripeno / stripes_per_object;
+         size_t objectno = objectsetno * layout.stripe_count + stripepos;
+
          // find oid, osds
-         size_t blockno = cur / FILE_OBJECT_SIZE;
-         ex.oid = file_to_object( ino, blockno );
-         ex.rg = file_to_repgroup(ino, blockno );
+         ex.oid = file_to_object( ino, objectno );
+         ex.rg = file_to_repgroup( ino, objectno );
          ex.osd = get_rg_acting_primary( ex.rg );
-
+         
          // map range into object
-         ex.offset = cur % FILE_OBJECT_SIZE;
-         if (left + ex.offset > FILE_OBJECT_SIZE) 
-               ex.len = FILE_OBJECT_SIZE - ex.offset;   // doesn't fully fit
+         size_t block_start = (stripeno % stripes_per_object)*layout.stripe_size;
+         size_t block_off = cur % layout.stripe_size;
+         size_t max = layout.stripe_size - block_off;
+
+         ex.offset = block_start + block_off;
+         if (left > max)
+               ex.len = max;
          else
-               ex.len = left;                               // fits!
+               ex.len = left;
+
+         //cout << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... left " << left << endl;
 
          left -= ex.len;
          cur += ex.len;
index 25a80cfe3bc2e4fa43ad48704cc6bc495179dfeb..9d41b971ae0fdb804ad9f97c00a0ba0df5f1c161 100644 (file)
@@ -70,6 +70,7 @@ void Filer::send_outgoing()
 
 int
 Filer::read(inodeno_t ino,
+                       OSDFileLayout& layout,
                        size_t len, 
                        size_t offset, 
                        bufferlist *bl,
@@ -84,7 +85,7 @@ Filer::read(inodeno_t ino,
 
   // find data
   list<OSDExtent> extents;
-  osdcluster->file_to_extents(ino, len, offset, extents);
+  osdcluster->file_to_extents(ino, layout, len, offset, extents);
 
   dout(7) << "osd read ino " << ino << " len " << len << " off " << offset << " in " << extents.size() << " extents" << endl;
 
@@ -193,6 +194,7 @@ Filer::handle_osd_read_reply(MOSDOpReply *m)
 
 int 
 Filer::write(inodeno_t ino,
+                        OSDFileLayout& layout,
                         size_t len, 
                         size_t offset, 
                         bufferlist& bl,
@@ -207,7 +209,7 @@ Filer::write(inodeno_t ino,
   
   // find data
   list<OSDExtent> extents;
-  osdcluster->file_to_extents(ino, len, offset, extents);
+  osdcluster->file_to_extents(ino, layout, len, offset, extents);
 
   dout(7) << "osd write ino " << ino << " len " << len << " off " << offset << " in " << extents.size() << " extents" << endl;
 
@@ -341,7 +343,10 @@ Filer::handle_osd_op_reply(MOSDOpReply *m)
 }
 
 
-int Filer::remove(inodeno_t ino, size_t size, Context *onfinish)
+int Filer::remove(inodeno_t ino, 
+                                 OSDFileLayout& layout,
+                                 size_t size, 
+                                 Context *onfinish)
 {
   // pending write record
   PendingOSDOp_t *p = new PendingOSDOp_t;
@@ -349,7 +354,7 @@ int Filer::remove(inodeno_t ino, size_t size, Context *onfinish)
   
   // find data
   list<OSDExtent> extents;
-  osdcluster->file_to_extents(ino, size, 0, extents);
+  osdcluster->file_to_extents(ino, layout, size, 0, extents);
 
   dout(7) << "osd remove ino " << ino << " size " << size << " in " << extents.size() << " extents" << endl;
 
@@ -384,7 +389,10 @@ int Filer::remove(inodeno_t ino, size_t size, Context *onfinish)
 }
 
 
-int Filer::probe_size(inodeno_t ino, size_t *size, Context *onfinish)
+int Filer::probe_size(inodeno_t ino, 
+                                         OSDFileLayout& layout,
+                                         size_t *size, 
+                                         Context *onfinish)
 {
   PendingOSDProbe_t *p = new PendingOSDProbe_t;
   p->final_size = size;
index edda0b7a851b198431b8fd0b0e849bbf97f3bc7a..57663df15a195c1c4829c955417cb54a73b166cb 100644 (file)
@@ -24,6 +24,7 @@ using namespace __gnu_cxx;
 
 #include "include/types.h"
 #include "msg/Dispatcher.h"
+#include "OSDCluster.h"
 
 class Context;
 class Messenger;
@@ -92,20 +93,26 @@ class Filer : public Dispatcher {
 
   // osd fun
   int read(inodeno_t ino,
+                  OSDFileLayout& layout,
                   size_t len, 
                   size_t offset, 
                   bufferlist *bl,   // ptr to data
                   Context *c);
 
   int write(inodeno_t ino,
+                       OSDFileLayout& layout,
                        size_t len, 
                        size_t offset, 
                        bufferlist& bl,
                        int flags, 
                        Context *c);
 
-  int probe_size(inodeno_t ino, size_t *size, Context *c);
-  int remove(inodeno_t ino, size_t size, Context *c);
+  int probe_size(inodeno_t ino, 
+                                OSDFileLayout& layout,
+                                size_t *size, Context *c);
+  int remove(inodeno_t ino, 
+                        OSDFileLayout& layout,
+                        size_t size, Context *c);
 
   //int zero(inodeno_t ino, size_t len, size_t offset, Context *c);