+
+
+
+
// cons/des
Client::Client(MDCluster *mdc, int id, Messenger *m)
C_Client_Cond *onfinish = new C_Client_Cond(&cond, &client_lock, &rvalue);
- filer->read(in->inode.ino, size, offset, &blist, onfinish);
+ filer->read(in->inode.ino, g_OSD_FileLayout, size, offset, &blist, onfinish);
cond.Wait(client_lock);
in->inflight_buffers.insert(blist);
Context *onfinish = new C_Client_WriteBuffer( in, blist );
- filer->write(in->inode.ino, size, offset, *blist, 0, onfinish);
+ filer->write(in->inode.ino, g_OSD_FileLayout, size, offset, *blist, 0, onfinish);
} else {
// synchronous write
int rvalue;
C_Client_Cond *onfinish = new C_Client_Cond(&cond, &client_lock, &rvalue);
- filer->write(in->inode.ino, size, offset, blist, 0, onfinish);
+ filer->write(in->inode.ino, g_OSD_FileLayout, size, offset, blist, 0, onfinish);
cond.Wait(client_lock);
}
// reset the counters
for (vector<string>::iterator it = type->inc_keys.begin(); it != type->inc_keys.end(); it++)
- set(*it, 0);
+ this->vals[*it] = 0;
}
lock.Unlock();
#include "include/config.h"
+#include "osd/OSDCluster.h"
+
//#define MDS_CACHE_SIZE 4*10000 -> <20mb
//#define MDS_CACHE_SIZE 80000 62mb
long buffer_total_alloc = 0;
+//OSDFileLayout g_OSD_FileLayout( 1<<20, 1, 1<<20 ); // stripe files over whole objects
+OSDFileLayout g_OSD_FileLayout( 1<<17, 4, 1<<20 ); // 128k stripes over sets of 4
+
+// mds directory object layout: 16k stripes over sets of 4, 512k objects (values provisional ??)
+OSDFileLayout g_OSD_MDDirLayout( 1<<14, 1<<2, 1<<19 );
+
+// stripe mds log over 128-byte units (see mds_log_pad_entry below to match!)
+OSDFileLayout g_OSD_MDLogLayout( 1<<7, 1<<3, 1<<20 );
+
+
md_config_t g_conf = {
num_mds: 2,
mds_log_max_len: MDS_CACHE_SIZE / 3,
mds_log_max_trimming: 256,
mds_log_read_inc: 65536,
+ mds_log_pad_entry: 64,
mds_log_before_reply: true,
mds_log_flush_on_shutdown: true,
#ifndef __CONFIG_H
#define __CONFIG_H
+extern class OSDFileLayout g_OSD_FileLayout;
+extern class OSDFileLayout g_OSD_MDDirLayout;
+extern class OSDFileLayout g_OSD_MDLogLayout;
+
struct md_config_t {
int num_mds;
int num_osd;
int mds_log_max_len;
int mds_log_max_trimming;
int mds_log_read_inc;
+ int mds_log_pad_entry;
bool mds_log_before_reply;
bool mds_log_flush_on_shutdown;
bufferlist bl;
bl.append(tab.c_str(), tab.length());
mds->filer->write(MDS_INO_ANCHORTABLE+mds->get_nodeid(),
+ g_OSD_FileLayout,
bl.length(), 0,
bl, 0,
onfinish);
if (r > 0 && size > 0) {
C_AT_Load *c = new C_AT_Load(size, at, onfinish);
mds->filer->read(MDS_INO_ANCHORTABLE+mds->get_nodeid(),
+ g_OSD_FileLayout,
size, sizeof(size),
&c->bl,
c);
C_AT_LoadSize *c = new C_AT_LoadSize(this, mds, onfinish);
mds->filer->read(MDS_INO_ANCHORTABLE+mds->get_nodeid(),
+ g_OSD_FileLayout,
sizeof(size_t), 0,
&c->bl,
c);
// write (async)
mds->filer->write(MDS_INO_IDS_OFFSET + mds->get_nodeid(),
+ g_OSD_FileLayout,
data.length(),
0,
bl,
opening = true;
mds->filer->read(MDS_INO_IDS_OFFSET + mds->get_nodeid(),
- FILE_OBJECT_SIZE,
+ g_OSD_FileLayout,
+ g_OSD_FileLayout.stripe_size,
0,
&c->bl,
c);
#define EVENT_UNLINK 3
#define EVENT_ALLOC 4
+#include "include/config.h"
+
+
// generic log event
class LogEvent {
private:
// payload
encode_payload(bl);
+
+ // HACK: pad payload to match md log layout?
+ int elen = bl.length() - off + sizeof(_type);
+ if (elen % g_conf.mds_log_pad_entry > 0) {
+ int add = g_conf.mds_log_pad_entry - (elen % g_conf.mds_log_pad_entry);
+ //cout << "elen " << elen << " adding " << add << endl;
+ buffer *b = new buffer(add);
+ memset(b->c_str(), 0, add);
+ b->set_length(add);
+ bufferptr bp(b);
+ bl.append(bp);
+ }
+
len = bl.length() - off - sizeof(len);
+
bl.copy_in(off, sizeof(len), (char*)&len);
}
#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds_log) cout << "mds" << mds->get_nodeid() << ".logstream "
+
+
+
+
// ----------------------------
// writing
// serialize FIXME ********
bufferlist bl;
e->encode(bl);
+
size_t elen = bl.length();
// append
off_t off = append_pos;
append_pos += elen;
+
//dout(15) << "write buf was " << write_buf.length() << " bl " << write_buf << endl;
write_buf.claim_append(bl);
//dout(15) << "write buf now " << write_buf.length() << " bl " << write_buf << endl;
void LogStream::_append_2(off_t off)
{
- dout(15) << "sync_pos now " << off << endl;
+ dout(15) << "sync_pos now " << off << " skew " << off % g_conf.mds_log_pad_entry << endl;
sync_pos = off;
// discard written bufferlist
// write it
mds->filer->write(log_ino,
+ g_OSD_MDLogLayout,
writing_buffers[flush_pos]->length(), flush_pos,
*writing_buffers[flush_pos],
0,
// decode
le->decode_payload(read_buf, off);
+ off = sizeof(type) + sizeof(length) + length; // advance past any padding that wasn't decoded..
// discard front of read_buf
read_pos += off;
dout(15) << "wait_for_next_event reading from pos " << tail << " len " << size << endl;
C_LS_ReadChunk *readc = new C_LS_ReadChunk(this);
mds->filer->read(log_ino,
+ g_OSD_MDLogLayout,
g_conf.mds_log_read_inc, tail,
&readc->bl,
readc);
// last link?
if (in->inode.nlink == 0) {
dout(7) << "last link, destroying inode " << *in << endl; // FIXME THIS IS WRONG PLACE FOR THIS!
- mds->filer->remove(in->ino(), in->inode.size,
+ mds->filer->remove(in->ino(),
+ g_OSD_FileLayout,
+ in->inode.size,
NULL); // FIXME
}
+
+
void MDStore::proc_message(Message *m)
{
switch (m->get_type()) {
// submit to osd
mds->filer->write( dir->ino(),
+ g_OSD_MDDirLayout,
bl.length(), 0,
bl,
0, //OSD_OP_FLAGS_TRUNCATE, // truncate file/object after end of this write
MDDoFetchDirContext *fin = new MDDoFetchDirContext( mds, ino, context, hashcode );
fin->bl.claim( bl );
mds->filer->read(ino,
+ g_OSD_MDDirLayout,
size - got, bl.length(),
&fin->bl2,
fin );
// read first bit
mds->filer->read(dir->ino(),
- FILE_OBJECT_SIZE, 0, // get first object's bit
+ g_OSD_MDDirLayout,
+ //FILE_OBJECT_SIZE, 0, // get first object's bit
//16, 0, // just get front bit
+ g_OSD_MDDirLayout.stripe_size, 0, // grab first stripe bit (better be more than 16 bytes!)
&fin->bl,
fin );
}
#define NUM_RUSH_REPLICAS 4 // this should be big enough to cope w/ failing disks.
#define MAX_REPLICAS 3
-#define FILE_OBJECT_SIZE (1<<20) // 1 MB object size
+//#define FILE_OBJECT_SIZE (1<<20) // 1 MB object size
-#define OID_BLOCK_BITS 30 // 1mb * 10^9 = 1 petabyte files
+#define OID_ONO_BITS 30 // 1mb * 10^9 = 1 petabyte files
#define OID_INO_BITS (64-30) // 2^34 =~ 16 billion files
-#define MAX_FILE_SIZE (FILE_OBJECT_SIZE << OID_BLOCK_BITS) // 1 PB
+//#define MAX_FILE_SIZE (FILE_OBJECT_SIZE << OID_ONO_BITS) // 1 PB
+
+
+/** OSDFileLayout
+ * specifies a striping strategy: how a file's byte range is spread
+ * across objects.
+ *
+ * Data is cut into stripe units of stripe_size bytes, which are placed
+ * round-robin across a set of stripe_count objects.  Once the objects
+ * in a set reach object_size bytes, a new set of objects is used.
+ *
+ * The constructor stores its arguments without validation.
+ * file_to_extents() divides by stripe_size, by stripe_count, and by
+ * (object_size / stripe_size), so all three values must be positive
+ * and object_size must be at least stripe_size.
+ */
+
+class OSDFileLayout {
+ public:
+ int stripe_size; // stripe unit, in bytes (size of one contiguous piece written to a single object)
+ int stripe_count; // over this many objects (consecutive stripe units go to consecutive objects in the set)
+ int object_size; // until objects are this big (in bytes), then use a new set of objects.
+
+ OSDFileLayout(int ss, int sc, int os) : // ss = stripe_size, sc = stripe_count, os = object_size
+ stripe_size(ss), stripe_count(sc), object_size(os) { }
+};
+
/** OSDGroup
/* map (ino, ono) into a replica group.
 ino is the file's inode number and ono the object number within that
 file; the result is (ino + ono) modulo NUM_REPLICA_GROUPS. */
repgroup_t file_to_repgroup(inodeno_t ino,
- size_t blockno) {
+ size_t ono) {
// something simple for now
- // hash this eventually
- return (ino+blockno) % NUM_REPLICA_GROUPS;
+ // hash this eventually!
+ return (ino+ono) % NUM_REPLICA_GROUPS;
}
}
- /* map (ino, block) to an object name
+ /* map (ino, ono) to an object name
(to be used on any osd in the proper replica group).
the name packs ino into the high OID_INO_BITS bits and ono into the
low OID_ONO_BITS bits, so ino is shifted left by OID_ONO_BITS. */
object_t file_to_object(inodeno_t ino,
- size_t blockno) {
+ size_t ono) {
assert(ino < (1LL<<OID_INO_BITS)); // legal ino can't be too big
- assert(blockno < (1LL<<OID_BLOCK_BITS));
- return (ino << OID_INO_BITS) + blockno;
+ assert(ono < (1LL<<OID_ONO_BITS)); // ono must fit in the low bits of the name
+ return (ino << OID_ONO_BITS) + ono; // shift past ono's bits; shifting by OID_INO_BITS would push the top bits of a large ino out of the 64-bit name
}
/* map (ino, offset, len) to a (list of) OSDExtents
(byte ranges in objects on osds) */
void file_to_extents(inodeno_t ino,
+ OSDFileLayout& layout,
size_t len,
size_t offset,
list<OSDExtent>& extents) {
+ // layout constant
+ size_t stripes_per_object = layout.object_size / layout.stripe_size;
+
size_t cur = offset;
size_t left = len;
while (left > 0) {
OSDExtent ex;
-
+
+ // layout into objects
+ size_t blockno = cur / layout.stripe_size;
+ size_t stripeno = blockno / layout.stripe_count;
+ size_t stripepos = blockno % layout.stripe_count;
+ size_t objectsetno = stripeno / stripes_per_object;
+ size_t objectno = objectsetno * layout.stripe_count + stripepos;
+
// find oid, osds
- size_t blockno = cur / FILE_OBJECT_SIZE;
- ex.oid = file_to_object( ino, blockno );
- ex.rg = file_to_repgroup(ino, blockno );
+ ex.oid = file_to_object( ino, objectno );
+ ex.rg = file_to_repgroup( ino, objectno );
ex.osd = get_rg_acting_primary( ex.rg );
-
+
// map range into object
- ex.offset = cur % FILE_OBJECT_SIZE;
- if (left + ex.offset > FILE_OBJECT_SIZE)
- ex.len = FILE_OBJECT_SIZE - ex.offset; // doesn't fully fit
+ size_t block_start = (stripeno % stripes_per_object)*layout.stripe_size;
+ size_t block_off = cur % layout.stripe_size;
+ size_t max = layout.stripe_size - block_off;
+
+ ex.offset = block_start + block_off;
+ if (left > max)
+ ex.len = max;
else
- ex.len = left; // fits!
+ ex.len = left;
+
+ //cout << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... left " << left << endl;
left -= ex.len;
cur += ex.len;
int
Filer::read(inodeno_t ino,
+ OSDFileLayout& layout,
size_t len,
size_t offset,
bufferlist *bl,
// find data
list<OSDExtent> extents;
- osdcluster->file_to_extents(ino, len, offset, extents);
+ osdcluster->file_to_extents(ino, layout, len, offset, extents);
dout(7) << "osd read ino " << ino << " len " << len << " off " << offset << " in " << extents.size() << " extents" << endl;
int
Filer::write(inodeno_t ino,
+ OSDFileLayout& layout,
size_t len,
size_t offset,
bufferlist& bl,
// find data
list<OSDExtent> extents;
- osdcluster->file_to_extents(ino, len, offset, extents);
+ osdcluster->file_to_extents(ino, layout, len, offset, extents);
dout(7) << "osd write ino " << ino << " len " << len << " off " << offset << " in " << extents.size() << " extents" << endl;
}
-int Filer::remove(inodeno_t ino, size_t size, Context *onfinish)
+int Filer::remove(inodeno_t ino,
+ OSDFileLayout& layout,
+ size_t size,
+ Context *onfinish)
{
// pending write record
PendingOSDOp_t *p = new PendingOSDOp_t;
// find data
list<OSDExtent> extents;
- osdcluster->file_to_extents(ino, size, 0, extents);
+ osdcluster->file_to_extents(ino, layout, size, 0, extents);
dout(7) << "osd remove ino " << ino << " size " << size << " in " << extents.size() << " extents" << endl;
}
-int Filer::probe_size(inodeno_t ino, size_t *size, Context *onfinish)
+int Filer::probe_size(inodeno_t ino,
+ OSDFileLayout& layout,
+ size_t *size,
+ Context *onfinish)
{
PendingOSDProbe_t *p = new PendingOSDProbe_t;
p->final_size = size;
#include "include/types.h"
#include "msg/Dispatcher.h"
+#include "OSDCluster.h"
class Context;
class Messenger;
// osd fun
int read(inodeno_t ino,
+ OSDFileLayout& layout,
size_t len,
size_t offset,
bufferlist *bl, // ptr to data
Context *c);
int write(inodeno_t ino,
+ OSDFileLayout& layout,
size_t len,
size_t offset,
bufferlist& bl,
int flags,
Context *c);
- int probe_size(inodeno_t ino, size_t *size, Context *c);
- int remove(inodeno_t ino, size_t size, Context *c);
+ int probe_size(inodeno_t ino,
+ OSDFileLayout& layout,
+ size_t *size, Context *c);
+ int remove(inodeno_t ino,
+ OSDFileLayout& layout,
+ size_t size, Context *c);
//int zero(inodeno_t ino, size_t len, size_t offset, Context *c);