]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
merged r1937:1957 from trunk into branches/sage/mds
authorsageweil <sageweil@29311d96-e01e-0410-9327-a35deaab8ce9>
Wed, 17 Oct 2007 18:37:06 +0000 (18:37 +0000)
committersageweil <sageweil@29311d96-e01e-0410-9327-a35deaab8ce9>
Wed, 17 Oct 2007 18:37:06 +0000 (18:37 +0000)
git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1958 29311d96-e01e-0410-9327-a35deaab8ce9

40 files changed:
branches/sage/mds/client/Client.cc
branches/sage/mds/client/SyntheticClient.cc
branches/sage/mds/config.cc
branches/sage/mds/config.h
branches/sage/mds/crush2/Makefile [new file with mode: 0644]
branches/sage/mds/crush2/buckets.c [new file with mode: 0644]
branches/sage/mds/crush2/buckets.h [new file with mode: 0644]
branches/sage/mds/crush2/crush.c [new file with mode: 0644]
branches/sage/mds/crush2/crush.h [new file with mode: 0644]
branches/sage/mds/crush2/hash.h [new file with mode: 0644]
branches/sage/mds/crush2/types.h [new file with mode: 0644]
branches/sage/mds/include/ceph_fs.h [new file with mode: 0644]
branches/sage/mds/include/ceph_inttypes.h [deleted file]
branches/sage/mds/include/object.h
branches/sage/mds/include/types.h
branches/sage/mds/jobs/runjobsample [new file with mode: 0644]
branches/sage/mds/kernel/bufferlist.h [new file with mode: 0644]
branches/sage/mds/kernel/ceph_fs.h [deleted file]
branches/sage/mds/kernel/inode.c
branches/sage/mds/kernel/kmsg.h [new file with mode: 0644]
branches/sage/mds/kernel/kmsgbits.h [new file with mode: 0644]
branches/sage/mds/kernel/mds_client.h [new file with mode: 0644]
branches/sage/mds/kernel/mdsmap.h
branches/sage/mds/kernel/monmap.h
branches/sage/mds/kernel/osd_client.h [new file with mode: 0644]
branches/sage/mds/kernel/super.h [new file with mode: 0644]
branches/sage/mds/mds/ClientMap.cc
branches/sage/mds/mds/IdAllocator.cc
branches/sage/mds/mds/MDLog.cc
branches/sage/mds/msg/Message.h
branches/sage/mds/msg/ceph_msg_types.h [deleted file]
branches/sage/mds/msg/msg_types.h
branches/sage/mds/osd/OSDMap.h
branches/sage/mds/osd/PG.h
branches/sage/mds/osd/osd_types.h
branches/sage/mds/osdc/Filer.cc
branches/sage/mds/osdc/Journaler.cc
branches/sage/mds/osdc/Journaler.h
branches/sage/mds/osdc/Objecter.cc
branches/sage/mds/osdc/Objecter.h

index 4a7e6baacaf5ac06b4fe86a06fc9f484ecd84368..67c5af7101ed5fe1b49405cc11c7f588697beb0d 100644 (file)
@@ -3847,21 +3847,21 @@ int Client::get_stripe_unit(int fd)
 {
   FileLayout layout;
   describe_layout(fd, &layout);
-  return layout.stripe_unit;
+  return layout.fl_stripe_unit;
 }
 
 int Client::get_stripe_width(int fd)
 {
   FileLayout layout;
   describe_layout(fd, &layout);
-  return layout.stripe_width();
+  return ceph_file_layout_stripe_width(layout);
 }
 
 int Client::get_stripe_period(int fd)
 {
   FileLayout layout;
   describe_layout(fd, &layout);
-  return layout.period();
+  return ceph_file_layout_period(layout);
 }
 
 int Client::enumerate_layout(int fd, list<ObjectExtent>& result,
index 931ea790625bb81c5174c91f0d0b472ef3e8fe0e..1695631b8b8cb8035128da8f4b0f582d69bf105f 100644 (file)
@@ -1683,7 +1683,7 @@ int SyntheticClient::write_file(string& fn, int size, int wrsize)   // size is i
     // = 128 bits (16 bytes)
     uint64_t *p = (uint64_t*)buf;
     while ((char*)p < buf + wrsize) {
-      *p = i*wrsize + (char*)p - buf;      
+      *p = (uint64_t)i*(uint64_t)wrsize + (uint64_t)((char*)p - buf);      
       p++;
       *p = client->get_nodeid();
       p++;
@@ -1729,11 +1729,12 @@ int SyntheticClient::read_file(string& fn, int size, int rdsize, bool ignoreprin
  
     // verify fingerprint
     int bad = 0;
-    int64_t *p = (int64_t*)buf;
-    int64_t readoff, readclient;
+    uint64_t *p = (uint64_t*)buf;
+    uint64_t readoff;
+    int64_t readclient;
     while ((char*)p + 32 < buf + rdsize) {
       readoff = *p;
-      int64_t wantoff = i*rdsize + (int64_t)((char*)p - buf);
+      uint64_t wantoff = (uint64_t)i*(uint64_t)rdsize + (uint64_t)((char*)p - buf);
       p++;
       readclient = *p;
       p++;
@@ -1817,7 +1818,7 @@ int SyntheticClient::create_objects(int nobj, int osize, int inflight)
     if (time_to_stop()) break;
 
     object_t oid(0x1000, i);
-    ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.pg_size);
+    ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size);
     
     if (i % inflight == 0) {
       dout(6) << "create_objects " << i << "/" << (nobj+1) << dendl;
@@ -1919,7 +1920,7 @@ int SyntheticClient::object_rw(int nobj, int osize, int wrpc,
     }
     object_t oid(0x1000, o);
 
-    ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.pg_size);
+    ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size);
     
     client->client_lock.Lock();
     utime_t start = g_clock.now();
index f9dea43f893a13582372c7bbdaf5f91b153f87b7..f037fe728dfe46b0483ae0ef628b16ef4430ccfa 100644 (file)
@@ -41,10 +41,45 @@ ostream *_dout = &std::cout;
 ostream *_derr = &std::cerr;
 
 // file layouts
-FileLayout g_OSD_FileLayout( 1<<22, 1, 1<<22, pg_t::TYPE_REP, 2 );   // 4M objects, 2x replication
-FileLayout g_OSD_MDDirLayout( 1<<22, 1, 1<<22, pg_t::TYPE_REP, 2 );  // 4M objects, 2x replication.  (a lie, just object layout policy)
-FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20, pg_t::TYPE_REP, 2 );  // 1M objects
-FileLayout g_OSD_MDAnchorTableLayout( 1<<20, 1, 1<<20, pg_t::TYPE_REP, 2 );  // 1M objects.  (a lie, just object layout policy)
+struct ceph_file_layout g_OSD_FileLayout = {   // 4M objects, 2x replication; adjustable via the --file_layout_* options
+ fl_stripe_unit: 1<<22,        // stripe unit in bytes (4 MB)
+ fl_stripe_count: 1,           // striped over a single object
+ fl_object_size: 1<<22,        // object size in bytes (4 MB)
+ fl_object_stripe_unit: 0,     // NOTE(review): the removed FileLayout constructor defaulted this to the stripe unit -- confirm 0 is intended
+ fl_pg_preferred: -1,          // presumably "no preferred primary" -- confirm
+ fl_pg_type: CEPH_PG_TYPE_REP,
+ fl_pg_size: 2
+};
+
+struct ceph_file_layout g_OSD_MDDirLayout = {  // 4M objects, 2x replication.  (a lie, just object layout policy); see --meta_dir_layout_*
+ fl_stripe_unit: 1<<22,        // stripe unit in bytes (4 MB)
+ fl_stripe_count: 1,           // striped over a single object
+ fl_object_size: 1<<22,        // object size in bytes (4 MB)
+ fl_object_stripe_unit: 0,
+ fl_pg_preferred: -1,          // presumably "no preferred primary" -- confirm
+ fl_pg_type: CEPH_PG_TYPE_REP,
+ fl_pg_size: 2
+};
+
+struct ceph_file_layout g_OSD_MDLogLayout = {  // 1M objects; see --meta_log_layout_*
+ fl_stripe_unit: 1<<20,        // stripe unit in bytes (1 MB)
+ fl_stripe_count: 1,           // striped over a single object
+ fl_object_size: 1<<20,        // object size in bytes (1 MB)
+ fl_object_stripe_unit: 0,
+ fl_pg_preferred: -1,          // presumably "no preferred primary" -- confirm
+ fl_pg_type: CEPH_PG_TYPE_REP,
+ fl_pg_size: 2                 // --meta_log_layout_pg_size 0 turns g_conf.mds_log off
+};
+
+struct ceph_file_layout g_OSD_MDAnchorTableLayout = {  // 1M objects.  (a lie, just object layout policy)
+ fl_stripe_unit: 1<<20,        // stripe unit in bytes (1 MB)
+ fl_stripe_count: 1,           // striped over a single object
+ fl_object_size: 1<<20,        // object size in bytes (1 MB)
+ fl_object_stripe_unit: 0,
+ fl_pg_preferred: -1,          // presumably "no preferred primary" -- confirm
+ fl_pg_type: CEPH_PG_TYPE_REP,
+ fl_pg_size: 2
+};
 
 #include <msg/msg_types.h>
 
@@ -266,8 +301,8 @@ md_config_t g_conf = {
   osd_stat_refresh_interval: .5,
 
   osd_pg_bits: 4,  // bits per osd
-  osd_object_layout: OBJECT_LAYOUT_HASHINO,//LINEAR,//HASHINO,
-  osd_pg_layout: PG_LAYOUT_CRUSH,//LINEAR,//CRUSH,
+  osd_object_layout: CEPH_OBJECT_LAYOUT_HASHINO,//LINEAR,//HASHINO,
+  osd_pg_layout: CEPH_PG_LAYOUT_CRUSH,//LINEAR,//CRUSH,
   osd_max_rep: 4,
   osd_min_raid_width: 4,
   osd_max_raid_width: 3, //6, 
@@ -896,18 +931,18 @@ void parse_config_options(std::vector<char*>& args)
 
     else if (strcmp(args[i], "--osd_object_layout") == 0) {
       i++;
-      if (strcmp(args[i], "linear") == 0) g_conf.osd_object_layout = OBJECT_LAYOUT_LINEAR;
-      else if (strcmp(args[i], "hashino") == 0) g_conf.osd_object_layout = OBJECT_LAYOUT_HASHINO;
-      else if (strcmp(args[i], "hash") == 0) g_conf.osd_object_layout = OBJECT_LAYOUT_HASH;
+      if (strcmp(args[i], "linear") == 0) g_conf.osd_object_layout = CEPH_OBJECT_LAYOUT_LINEAR;
+      else if (strcmp(args[i], "hashino") == 0) g_conf.osd_object_layout = CEPH_OBJECT_LAYOUT_HASHINO;
+      else if (strcmp(args[i], "hash") == 0) g_conf.osd_object_layout = CEPH_OBJECT_LAYOUT_HASH;
       else assert(0);
     }
     
     else if (strcmp(args[i], "--osd_pg_layout") == 0) {
       i++;
-      if (strcmp(args[i], "linear") == 0) g_conf.osd_pg_layout = PG_LAYOUT_LINEAR;
-      else if (strcmp(args[i], "hash") == 0) g_conf.osd_pg_layout = PG_LAYOUT_HASH;
-      else if (strcmp(args[i], "hybrid") == 0) g_conf.osd_pg_layout = PG_LAYOUT_HYBRID;
-      else if (strcmp(args[i], "crush") == 0) g_conf.osd_pg_layout = PG_LAYOUT_CRUSH;
+      if (strcmp(args[i], "linear") == 0) g_conf.osd_pg_layout = CEPH_PG_LAYOUT_LINEAR;
+      else if (strcmp(args[i], "hash") == 0) g_conf.osd_pg_layout = CEPH_PG_LAYOUT_HASH;
+      else if (strcmp(args[i], "hybrid") == 0) g_conf.osd_pg_layout = CEPH_PG_LAYOUT_HYBRID;
+      else if (strcmp(args[i], "crush") == 0) g_conf.osd_pg_layout = CEPH_PG_LAYOUT_CRUSH;
       else assert(0);
     }
     
@@ -917,38 +952,38 @@ void parse_config_options(std::vector<char*>& args)
       g_conf.tick = atoi(args[++i]);
 
     else if (strcmp(args[i], "--file_layout_unit") == 0) 
-      g_OSD_FileLayout.stripe_unit = atoi(args[++i]);
+      g_OSD_FileLayout.fl_stripe_unit = atoi(args[++i]);
     else if (strcmp(args[i], "--file_layout_count") == 0) 
-      g_OSD_FileLayout.stripe_count = atoi(args[++i]);
+      g_OSD_FileLayout.fl_stripe_count = atoi(args[++i]);
     else if (strcmp(args[i], "--file_layout_osize") == 0) 
-      g_OSD_FileLayout.object_size = atoi(args[++i]);
+      g_OSD_FileLayout.fl_object_size = atoi(args[++i]);
     else if (strcmp(args[i], "--file_layout_pg_type") == 0) 
-      g_OSD_FileLayout.pg_type = atoi(args[++i]);
+      g_OSD_FileLayout.fl_pg_type = atoi(args[++i]);
     else if (strcmp(args[i], "--file_layout_pg_size") == 0) 
-      g_OSD_FileLayout.pg_size = atoi(args[++i]);
+      g_OSD_FileLayout.fl_pg_size = atoi(args[++i]);
 
     else if (strcmp(args[i], "--meta_dir_layout_unit") == 0) 
-      g_OSD_MDDirLayout.stripe_unit = atoi(args[++i]);
+      g_OSD_MDDirLayout.fl_stripe_unit = atoi(args[++i]);
     else if (strcmp(args[i], "--meta_dir_layout_scount") == 0) 
-      g_OSD_MDDirLayout.stripe_count = atoi(args[++i]);
+      g_OSD_MDDirLayout.fl_stripe_count = atoi(args[++i]);
     else if (strcmp(args[i], "--meta_dir_layout_osize") == 0) 
-      g_OSD_MDDirLayout.object_size = atoi(args[++i]);
+      g_OSD_MDDirLayout.fl_object_size = atoi(args[++i]);
     else if (strcmp(args[i], "--meta_dir_layout_pg_type") == 0) 
-      g_OSD_MDDirLayout.pg_type = atoi(args[++i]);
+      g_OSD_MDDirLayout.fl_pg_type = atoi(args[++i]);
     else if (strcmp(args[i], "--meta_dir_layout_pg_size") == 0) 
-      g_OSD_MDDirLayout.pg_size = atoi(args[++i]);
+      g_OSD_MDDirLayout.fl_pg_size = atoi(args[++i]);
 
     else if (strcmp(args[i], "--meta_log_layout_unit") == 0) 
-      g_OSD_MDLogLayout.stripe_unit = atoi(args[++i]);
+      g_OSD_MDLogLayout.fl_stripe_unit = atoi(args[++i]);
     else if (strcmp(args[i], "--meta_log_layout_scount") == 0) 
-      g_OSD_MDLogLayout.stripe_count = atoi(args[++i]);
+      g_OSD_MDLogLayout.fl_stripe_count = atoi(args[++i]);
     else if (strcmp(args[i], "--meta_log_layout_osize") == 0) 
-      g_OSD_MDLogLayout.object_size = atoi(args[++i]);
+      g_OSD_MDLogLayout.fl_object_size = atoi(args[++i]);
     else if (strcmp(args[i], "--meta_log_layout_pg_type") == 0) 
-      g_OSD_MDLogLayout.pg_type = atoi(args[++i]);
+      g_OSD_MDLogLayout.fl_pg_type = atoi(args[++i]);
     else if (strcmp(args[i], "--meta_log_layout_pg_size") == 0) {
-      g_OSD_MDLogLayout.pg_size = atoi(args[++i]);
-      if (!g_OSD_MDLogLayout.pg_size)
+      g_OSD_MDLogLayout.fl_pg_size = atoi(args[++i]);
+      if (!g_OSD_MDLogLayout.fl_pg_size)
         g_conf.mds_log = false;
     }
 
index 3c56f6af2094187efc9a0e3313c7480bb07d1073..b5cdf6cbd586d8478d2d9088337be6a273c9caf9 100644 (file)
 #ifndef __CONFIG_H
 #define __CONFIG_H
 
-extern class FileLayout g_OSD_FileLayout;
-extern class FileLayout g_OSD_MDDirLayout;
-extern class FileLayout g_OSD_MDLogLayout;
-extern class FileLayout g_OSD_MDAnchorTableLayout;
+extern struct ceph_file_layout g_OSD_FileLayout;
+extern struct ceph_file_layout g_OSD_MDDirLayout;
+extern struct ceph_file_layout g_OSD_MDLogLayout;
+extern struct ceph_file_layout g_OSD_MDAnchorTableLayout;
 
 #include <vector>
 #include <map>
diff --git a/branches/sage/mds/crush2/Makefile b/branches/sage/mds/crush2/Makefile
new file mode 100644 (file)
index 0000000..7db40f7
--- /dev/null
@@ -0,0 +1,26 @@
+# Builds libcrush.o by linking crush.o and buckets.o into a single
+# relocatable object (ld -i).
+CC = gcc
+CFLAGS = -Wall
+CFLAGS += -O3 -g
+LD = ld
+RM = rm
+
+# Default target: regenerate the dependency list, then build the object.
+all: depend libcrush.o
+
+clean:
+       rm -f *.o libcrush.o
+
+%.o: %.c
+       ${CC} ${CFLAGS} -c $< -o $@
+
+libcrush.o: crush.o buckets.o
+       $(LD) -i -o $@ $^
+
+# The 'include .depend' at the bottom needs the file to exist; this rule
+# creates an empty one when it is missing.
+.depend:
+       touch .depend
+
+# Rebuild .depend from all C sources with makedepend.
+depend:
+       $(RM) .depend
+       makedepend -f- -- $(CFLAGS) -- *.c > .depend 2>/dev/null
+
+include .depend
diff --git a/branches/sage/mds/crush2/buckets.c b/branches/sage/mds/crush2/buckets.c
new file mode 100644 (file)
index 0000000..2a2e170
--- /dev/null
@@ -0,0 +1,56 @@
+
+#include "hash.h"
+#include "buckets.h"
+/*
+ * Pick one item from a uniform bucket for input x and replica rank r.
+ *
+ * An offset o and a stride p (looked up in bucket->primes) are both
+ * derived from hashes of x and the bucket id; the chosen slot is
+ * (x + o + (r+1)*p) modulo the bucket size.  Returns the item id stored
+ * in that slot.
+ *
+ * NOTE(review): a bucket with h.size == 0 would divide by zero here;
+ * presumably callers never pass an empty bucket -- confirm.
+ */
+int 
+crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r)
+{
+  unsigned o, p, s;
+  o = crush_hash32_2(x, bucket->h.id);
+  p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size];
+  s = (x + o + (r+1)*p) % bucket->h.size;
+  return bucket->h.items[s];
+}
+/*
+ * Pick one item from a list bucket for input x and replica rank r.
+ *
+ * Walks the items in index order.  For each one it takes the low 16 bits
+ * of a hash of (x, item, r, bucket id), scales that by sum_weights[i],
+ * and returns the item when the scaled value is below item_weights[i].
+ * Trips BUG_ON if the loop ends without selecting anything.
+ *
+ * NOTE(review): the hash term is below 2^16, so whenever sum_weights[i]
+ * is at most 0x10000 the product is below 2^32 and the ">> 32" yields 0;
+ * the first item with a nonzero weight would then always be returned.
+ * Confirm the intended shift for 16-bit fixed-point weights.
+ */
+int 
+crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r)
+{
+  int i;
+  __u64 w;
+
+  for (i=0; i<bucket->h.size; i++) {
+    w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id) & 0xffff;
+    w = (w * bucket->sum_weights[i]) >> 32;
+    if (w < bucket->item_weights[i])
+      return bucket->h.items[i];
+  }
+
+  BUG_ON(1);
+  return 0;
+}
+/*
+ * Tree bucket selection is not implemented: this stub ignores all of its
+ * arguments and always returns 0.
+ */
+int 
+crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r)
+{
+  return 0;
+}
+
+/*
+ * Pick one item from a straw bucket for input x and replica rank r.
+ *
+ * Every item draws a straw: the low 16 bits of a hash of (x, item, r)
+ * multiplied by that item's straws[] scaling factor.  The item with the
+ * longest straw wins; ties go to the lowest index.
+ *
+ * Returns the winning item id (bucket->h.items[...]), like the other
+ * crush_bucket_*_choose() functions, not its index.
+ */
+int 
+crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r)
+{
+  int i;
+  int high = 0;
+  __u64 high_draw = 0;
+  __u64 draw;
+
+  for (i=0; i<bucket->h.size; i++) {
+    /*
+     * Widen before multiplying so the product is computed in 64 bits, and
+     * compare the full product: a 16-bit value times a 32-bit straw that
+     * is then shifted right by 32 collapses to 0 for ordinary straw
+     * lengths, which made item 0 win unconditionally.
+     */
+    draw = (__u64)(crush_hash32_3(x, bucket->h.items[i], r) & 0xffff);
+    draw *= bucket->straws[i];
+    if (i == 0 || draw > high_draw) {
+      high = i;
+      high_draw = draw;
+    }
+  }
+
+  return bucket->h.items[high];
+}
diff --git a/branches/sage/mds/crush2/buckets.h b/branches/sage/mds/crush2/buckets.h
new file mode 100644 (file)
index 0000000..c83d522
--- /dev/null
@@ -0,0 +1,49 @@
+#ifndef _CRUSH_BUCKETS_H
+#define _CRUSH_BUCKETS_H
+
+#include "types.h"
+/*
+ * Bucket kinds.  crush_choose() switches on crush_bucket.type against
+ * these values to call the matching crush_bucket_*_choose() function.
+ */
+enum {
+  CRUSH_BUCKET_UNIFORM = 1,
+  CRUSH_BUCKET_LIST = 2,
+  CRUSH_BUCKET_TREE = 3,
+  CRUSH_BUCKET_STRAW = 4
+};
+/*
+ * Header common to every bucket kind; each specific bucket struct embeds
+ * it as its first member 'h'.
+ *
+ * NOTE(review): crush_choose() compares 'type' both with CRUSH_BUCKET_*
+ * and with the item type requested by a rule step -- confirm that one
+ * field is meant to serve both purposes.
+ */
+struct crush_bucket {
+  __u32 id;
+  __u32 type;
+  __u32 weight;    /* 16-bit fixed point */
+  __u32 size;      /* num items */
+  __s32 *items;    /* item ids; crush_choose() treats negative ids as buckets (map->buckets[-id]) */
+};
+/*
+ * Uniform bucket: one weight and one type shared by all items.
+ * crush_choose() takes the type of any chosen item from item_type, and
+ * crush_bucket_uniform_choose() reads its stride from primes[].
+ */
+struct crush_bucket_uniform {
+  struct crush_bucket h;
+  __u32 item_weight;  /* 16-bit fixed point */
+  __u32 item_type;
+  __u32 *primes;      /* indexed modulo h.size by crush_bucket_uniform_choose() */
+};
+/*
+ * List bucket: per-item weights plus a second weight array used by
+ * crush_bucket_list_choose() to scale the hash before comparing it with
+ * item_weights[i].  Both arrays are indexed like h.items.
+ */
+struct crush_bucket_list {
+  struct crush_bucket h;
+  __u32 *item_weights;  /* 16-bit fixed point */
+  __u32 *sum_weights;   /* 16-bit fixed point */
+};
+/*
+ * Tree bucket: only the common header so far; the matching
+ * crush_bucket_tree_choose() is a stub.
+ */
+struct crush_bucket_tree {
+  struct crush_bucket h;
+
+};
+/*
+ * Straw bucket: straws[i] is the factor crush_bucket_straw_choose()
+ * multiplies item i's hash by when drawing straws.
+ */
+struct crush_bucket_straw {
+  struct crush_bucket h;
+  __u32 *straws;  /* 16-bit fixed point */
+};
+/*
+ * Per-kind selection functions (buckets.c).  Each takes the bucket, the
+ * input value x and a replica rank r; crush_choose() uses the result as
+ * the chosen item.
+ */
+extern int crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r);
+extern int crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r);
+extern int crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r);
+extern int crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r);
+
+#endif
diff --git a/branches/sage/mds/crush2/crush.c b/branches/sage/mds/crush2/crush.c
new file mode 100644 (file)
index 0000000..3b420c4
--- /dev/null
@@ -0,0 +1,236 @@
+
+#include "crush.h"
+#include "hash.h"
+
+/*
+ * choose numrep distinct items of given type
+ *
+ * For each replica rank this descends from 'bucket' through intervening
+ * buckets until it reaches an item of the requested type, then rejects
+ * the item if it duplicates an earlier choice or is marked out.
+ *
+ *   map     - supplies map->buckets[], indexed by negated bucket id
+ *   bucket  - bucket every descent starts from
+ *   x       - input value being placed
+ *   numrep  - number of items wanted
+ *   type    - item type to select (items with id >= 0 have type 0)
+ *   out     - receives the chosen items
+ *   firstn  - nonzero: retries use r' = r + f_total;
+ *             zero:    retries use r' = r + n*f_local
+ *   outmap  - per-item value 0..0xffff: 0 is always rejected, 0xffff is
+ *             never rejected, anything between is rejected when the low
+ *             16 bits of hash(x, item) exceed it
+ *
+ * Returns the number of items stored in out.  That can be fewer than
+ * numrep, because a replica is given up on after 10 failed attempts.
+ */
+static int crush_choose(struct crush_map *map,
+                       struct crush_bucket *bucket,
+                       int x, int numrep, int type,
+                       int *out, int firstn,
+                       int *outmap)
+{
+  int rep;
+  int ftotal, flocal;
+  int retry_rep, skip_rep;
+  struct crush_bucket *in = bucket;
+  int r;
+  int i;
+  int item = 0;
+  int itemtype;
+  int outpos;
+  int collide, bad;
+
+  outpos = 0;
+
+  for (rep = 0; rep < numrep; rep++) {
+    /* keep trying until we get a non-out, non-colliding item */
+    ftotal = 0;
+    skip_rep = 0;
+
+    while (1) {
+      in = bucket;               /* initial bucket */
+
+      /* choose through intervening buckets */
+      flocal = 0;
+      retry_rep = 0;
+
+      while (1) {
+        r = rep;
+        if (in->type == CRUSH_BUCKET_UNIFORM) {
+          /* be careful */
+          if (firstn || numrep >= in->size) {
+            r += ftotal;           /* r' = r + f_total */
+          } else {
+            r += numrep * flocal;  /* r' = r + n*f_local */
+            /* make sure numrep is not a multiple of bucket size */
+            if (in->size % numrep == 0)
+              /* shift seq once per pass through the bucket */
+              r += numrep * flocal / in->size;
+          }
+        } else {
+          if (firstn)
+            r += ftotal;           /* r' = r + f_total */
+          else
+            r += numrep * flocal;  /* r' = r + n*f_local */
+        }
+
+        /* bucket choose */
+        switch (in->type) {
+        case CRUSH_BUCKET_UNIFORM:
+          item = crush_bucket_uniform_choose((struct crush_bucket_uniform*)in, x, r);
+          break;
+        case CRUSH_BUCKET_LIST:
+          item = crush_bucket_list_choose((struct crush_bucket_list*)in, x, r);
+          break;
+        case CRUSH_BUCKET_TREE:
+          item = crush_bucket_tree_choose((struct crush_bucket_tree*)in, x, r);
+          break;
+        case CRUSH_BUCKET_STRAW:
+          item = crush_bucket_straw_choose((struct crush_bucket_straw*)in, x, r);
+          break;
+        default:
+          BUG_ON(1);
+        }
+
+        /* desired type? */
+        if (in->type == CRUSH_BUCKET_UNIFORM)
+          itemtype = ((struct crush_bucket_uniform*)in)->item_type;
+        else if (item < 0)
+          itemtype = map->buckets[-item].type;
+        else
+          itemtype = 0;
+
+        /* keep going? */
+        if (itemtype != type) {
+          /* only buckets (negative ids) can be descended into */
+          BUG_ON(item >= 0);
+          in = &map->buckets[-item];
+          continue;
+        }
+
+        /*
+         * collision?  Only out[0..outpos-1] has been written; after a
+         * skipped replica rep is larger than outpos.
+         */
+        collide = 0;
+        for (i=0; i<outpos; i++) {
+          if (out[i] == item) {
+            collide = 1;
+            break;
+          }
+        }
+
+        /* bad (out)?  consult the caller's outmap, not the result array */
+        bad = 0;
+        if (itemtype == 0 && outmap[item] != 0xffff) {
+          if (outmap[item] == 0)
+            bad = 1;
+          else if ((crush_hash32_2(x, item) & 0xffff) > outmap[item])
+            bad = 1;
+        }
+
+        if (bad || collide) {
+          ftotal++;
+          flocal++;
+
+          if (collide && flocal < 3)
+            continue;   /* locally a few times */
+          if (ftotal >= 10) {
+            /* give up, ignore dup, fixme */
+            skip_rep = 1;
+            break;
+          }
+          retry_rep = 1;
+        }
+        break;
+      }
+
+      if (retry_rep) continue;
+      /* item accepted, or replica given up on: stop retrying */
+      break;
+    }
+
+    if (skip_rep) continue;
+
+    out[outpos] = item;
+    outpos++;
+  }
+
+  return outpos;
+}
+
+
+/*
+ * Run rule number 'ruleno' of the map for input x.
+ *
+ * The rule's steps are executed in order against a working set w:
+ *   TAKE           - w becomes the single item arg1
+ *   CHOOSE_FIRSTN,
+ *   CHOOSE_INDEP   - for every bucket in w, choose arg1 items of type
+ *                    arg2 via crush_choose(); the results become the
+ *                    new w
+ *   EMIT           - append w to result (up to result_max entries) and
+ *                    clear w
+ *
+ * If forcefeed >= 0, the chain of parents of that item is looked up in
+ * the map's parent maps and fed in, outermost first, as the first choice
+ * of the TAKE and CHOOSE steps.
+ *
+ * Returns the number of entries written to result.
+ */
+int crush_do_rule(struct crush_map *map,
+                 int ruleno,
+                 int x, int *result, int result_max,
+                 int *outmap,      /* array of size max_devices, values 0...0xffff */
+                 int forcefeed)    /* -1 for none */
+{
+  int result_len;
+  int force_stack[CRUSH_MAX_DEPTH];
+  int force_pos = -1;
+  int a[CRUSH_MAX_SET];
+  int b[CRUSH_MAX_SET];
+  int *w;
+  int wsize = 0;
+  int *o;
+  int osize;
+  int *tmp;
+  struct crush_rule *rule;
+  int step;
+  int i;
+  int numrep;
+
+  rule = &map->rules[ruleno];
+  result_len = 0;
+  w = a;
+  o = b;
+
+  /* determine hierarchical context of forcefeed, if any */
+  if (forcefeed >= 0) {
+    while (1) {
+      force_stack[++force_pos] = forcefeed;
+      if (forcefeed >= 0)
+        forcefeed = map->device_parent_map[forcefeed];
+      else
+        forcefeed = map->bucket_parent_map[-forcefeed];
+      if (forcefeed == 0) break;
+    }
+  }
+
+  for (step = 0; step < rule->len; step++) {
+    switch (rule->steps[step].op) {
+    case CRUSH_RULE_TAKE:
+      if (force_pos >= 0) {
+        w[0] = force_stack[force_pos];
+        force_pos--;
+        BUG_ON(w[0] != rule->steps[step].arg1);
+      } else {
+        w[0] = rule->steps[step].arg1;
+      }
+      wsize = 1;
+      break;
+
+    case CRUSH_RULE_CHOOSE_FIRSTN:
+    case CRUSH_RULE_CHOOSE_INDEP:
+      BUG_ON(wsize == 0);
+
+      /* reset output */
+      osize = 0;
+
+      for (i = 0; i < wsize; i++) {
+        numrep = rule->steps[step].arg1;
+
+        if (force_pos >= 0) {
+          o[osize++] = force_stack[force_pos];
+          force_pos--;
+          numrep--;
+        }
+        if (numrep)
+          /* count what was chosen, otherwise the output set stays empty */
+          osize += crush_choose(map,
+                                &map->buckets[-w[i]],
+                                x, numrep, rule->steps[step].arg2,
+                                o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN,
+                                outmap);
+      }
+
+      /* swap o and w arrays: this step's output is the next step's input */
+      tmp = o;
+      o = w;
+      w = tmp;
+      wsize = osize;
+      break;
+
+
+    case CRUSH_RULE_EMIT:
+      for (i=0; i<wsize && result_max; i++) {
+        result[result_len] = w[i];
+        result_len++;
+        result_max--;
+      }
+      wsize = 0;
+      break;
+
+    default:
+      BUG_ON(1);
+    }
+  }
+
+  return result_len;
+}
+
diff --git a/branches/sage/mds/crush2/crush.h b/branches/sage/mds/crush2/crush.h
new file mode 100644 (file)
index 0000000..21b42e5
--- /dev/null
@@ -0,0 +1,49 @@
+#ifndef _CRUSH_CRUSH_H
+#define _CRUSH_CRUSH_H
+
+#include "types.h"
+#include "buckets.h"
+/*
+ * Rule step opcodes, as interpreted by crush_do_rule():
+ *   TAKE          - start the working set with the single item arg1
+ *   CHOOSE_FIRSTN - choose arg1 items of type arg2 (crush_choose with firstn set)
+ *   CHOOSE_INDEP  - same, with firstn clear
+ *   EMIT          - append the working set to the result
+ */
+enum {
+  CRUSH_RULE_TAKE,
+  CRUSH_RULE_CHOOSE_FIRSTN,
+  CRUSH_RULE_CHOOSE_INDEP,
+  CRUSH_RULE_EMIT
+};
+/*
+ * Fixed sizes of the on-stack arrays in crush_do_rule(): the forcefeed
+ * parent stack (CRUSH_MAX_DEPTH) and the two working sets (CRUSH_MAX_SET).
+ */
+#define CRUSH_MAX_DEPTH 10
+#define CRUSH_MAX_SET   10
+/*
+ * One step of a rule: an opcode (CRUSH_RULE_*) and two arguments whose
+ * meaning depends on the opcode -- see crush_do_rule().
+ */
+struct crush_rule_step {
+  __u32 op;
+  __s32 arg1;
+  __s32 arg2;
+};
+/*
+ * A rule: 'len' steps, executed in order by crush_do_rule().
+ */
+struct crush_rule {
+  __u32 len;
+  struct crush_rule_step *steps;
+};
+/*
+ * The complete map: buckets, rules and parent pointers.
+ *
+ * crush.c indexes buckets[] and bucket_parent_map[] by negated bucket id,
+ * device_parent_map[] by device id, and rules[] by rule number.
+ *
+ * NOTE(review): buckets[] is an array of the common header struct, yet
+ * crush_choose() casts its elements to the larger crush_bucket_*
+ * structs -- confirm how the per-kind data is meant to be stored.
+ * NOTE(review): the parent maps are unsigned but crush_do_rule() feeds
+ * their values back in as (possibly negative) ids -- confirm.
+ */
+struct crush_map {
+  struct crush_bucket *buckets;
+  struct crush_rule *rules;
+
+  /* parent pointers */
+  __u32 *bucket_parent_map;
+  __u32 *device_parent_map;
+
+  __u32 max_buckets;
+  __u32 max_rules;
+  __u32 max_devices;
+};
+/*
+ * Execute rule 'ruleno' for input x, writing at most result_max items to
+ * result; returns the number written.  Defined in crush.c.
+ */
+extern int crush_do_rule(struct crush_map *map,
+                        int ruleno,
+                        int x, int *result, int result_max,
+                        int *outmap,    /* array of size max_devices, values 0...0xffff */
+                        int forcefeed); /* -1 for none */
+
+/*extern int crush_decode(struct crush_map *map, struct ceph_bufferlist *bl);*/
+
+#endif
diff --git a/branches/sage/mds/crush2/hash.h b/branches/sage/mds/crush2/hash.h
new file mode 100644 (file)
index 0000000..0a3370b
--- /dev/null
@@ -0,0 +1,80 @@
+#ifndef _CRUSH_HASH_H
+#define _CRUSH_HASH_H
+/*
+ * hashmix() assigns to all three of its arguments, so each one must be a
+ * modifiable lvalue.  It expands to a bare sequence of statements, not a
+ * single expression.
+ */
+// Robert Jenkins' function for mixing 32-bit values
+// http://burtleburtle.net/bob/hash/evahash.html
+// a, b = random bits, c = input and output
+#define hashmix(a,b,c) \
+        a=a-b;  a=a-c;  a=a^(c>>13); \
+        b=b-c;  b=b-a;  b=b^(a<<8);  \
+        c=c-a;  c=c-b;  c=c^(b>>13); \
+        a=a-b;  a=a-c;  a=a^(c>>12); \
+        b=b-c;  b=b-a;  b=b^(a<<16); \
+        c=c-a;  c=c-b;  c=c^(b>>5);  \
+        a=a-b;  a=a-c;  a=a^(c>>3); \
+        b=b-c;  b=b-a;  b=b^(a<<10); \
+        c=c-a;  c=c-b;  c=c^(b>>15); 
+
+#define crush_hash_seed 1315423911  /* xor'ed into the initial value of every crush_hash32*() */
+/*
+ * Hash one 32-bit value.  Starts from crush_hash_seed ^ a and runs two
+ * hashmix() rounds against a copy of the input and two fixed constants.
+ */
+static __inline__ unsigned crush_hash32(unsigned a) {
+      unsigned hash = crush_hash_seed ^ a;
+      unsigned b = a;
+      unsigned x = 231232;
+      unsigned y = 1232;
+      hashmix(b, x, hash);
+      hashmix(y, a, hash);
+      return (hash & 0xFFFFFFFF);
+}
+/*
+ * Hash two 32-bit values.  Starts from crush_hash_seed ^ a ^ b and runs
+ * three hashmix() rounds over the inputs and two fixed constants.  The
+ * argument order matters: the rounds treat a and b differently.
+ */
+static __inline__ unsigned crush_hash32_2(unsigned a, unsigned b) {
+      unsigned hash = crush_hash_seed ^ a ^ b;
+      unsigned x = 231232;
+      unsigned y = 1232;
+      hashmix(a, b, hash);
+      hashmix(x, a, hash);
+      hashmix(b, y, hash);
+      return (hash & 0xFFFFFFFF);
+}
+/*
+ * Hash three 32-bit values.  Starts from crush_hash_seed ^ a ^ b ^ c and
+ * runs five hashmix() rounds over the inputs and two fixed constants.
+ */
+static __inline__ unsigned crush_hash32_3(unsigned a, unsigned b, unsigned c) {
+      unsigned int hash = crush_hash_seed ^ a ^ b ^ c;
+      unsigned x = 231232;
+      unsigned y = 1232;
+      hashmix(a, b, hash);
+      hashmix(c, x, hash);
+      hashmix(y, a, hash);
+      hashmix(b, x, hash);
+      hashmix(y, c, hash);
+      return (hash & 0xFFFFFFFF);
+}
+/*
+ * Hash four 32-bit values.  Starts from the seed xor'ed with all four
+ * inputs and runs six hashmix() rounds over the inputs and two fixed
+ * constants.
+ */
+static __inline__ unsigned crush_hash32_4(unsigned a, unsigned b, unsigned c, unsigned d) {
+      unsigned int hash = crush_hash_seed ^a ^ b ^ c ^ d;
+      unsigned x = 231232;
+      unsigned y = 1232;
+      hashmix(a, b, hash);
+      hashmix(c, d, hash);
+      hashmix(a, x, hash);
+      hashmix(y, b, hash);
+      hashmix(c, x, hash);
+      hashmix(y, d, hash);
+      return (hash & 0xFFFFFFFF);
+}
+/*
+ * Hash five 32-bit values.  Starts from the seed xor'ed with all five
+ * inputs and runs eight hashmix() rounds over the inputs and two fixed
+ * constants.
+ */
+static __inline__ unsigned crush_hash32_5(unsigned a, unsigned b, unsigned c, unsigned d, unsigned e) {
+      unsigned int hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+      unsigned x = 231232;
+      unsigned y = 1232;
+      hashmix(a, b, hash);
+      hashmix(c, d, hash);
+      hashmix(e, x, hash);
+      hashmix(y, a, hash);
+      hashmix(b, x, hash);
+      hashmix(y, c, hash);
+      hashmix(d, x, hash);
+      hashmix(y, e, hash);
+      return (hash & 0xFFFFFFFF);
+}
+
+#endif
diff --git a/branches/sage/mds/crush2/types.h b/branches/sage/mds/crush2/types.h
new file mode 100644 (file)
index 0000000..ea68240
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef _CRUSH_TYPES_H
+#define _CRUSH_TYPES_H
+
+#include <linux/types.h>  /* just for int types */
+/*
+ * If nothing has defined BUG_ON yet, map it onto assert(): BUG_ON(x)
+ * fails the assertion when x is true.  With NDEBUG defined assert()
+ * expands to nothing, so the check disappears.
+ */
+#ifndef BUG_ON
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#endif
diff --git a/branches/sage/mds/include/ceph_fs.h b/branches/sage/mds/include/ceph_fs.h
new file mode 100644 (file)
index 0000000..ede0663
--- /dev/null
@@ -0,0 +1,163 @@
+/* ceph_fs.h
+ *
+ * C data types to share between kernel and userspace
+ */
+
+#ifndef _FS_CEPH_CEPH_FS_H
+#define _FS_CEPH_CEPH_FS_H
+
+#include <linux/types.h>
+
+
+typedef __u64 ceph_ino_t;
+
+
+/**
+ * object id
+ *
+ * An object is named by the inode it belongs to, a block number within
+ * that inode, and a revision.
+ */
+struct ceph_object {
+       ceph_ino_t ino;  /* inode "file" identifier */
+       __u32 bno;  /* "block" (object) in that "file" */
+       __u32 rev;  /* revision.  normally ctime (as epoch). */
+};
+typedef struct ceph_object ceph_object_t;
+
+
+
+
+/** object layout
+ * how objects are mapped into PGs
+ *
+ * Values of g_conf.osd_object_layout; config.cc selects them with
+ * --osd_object_layout hash|linear|hashino.
+ */
+#define CEPH_OBJECT_LAYOUT_HASH     1
+#define CEPH_OBJECT_LAYOUT_LINEAR   2
+#define CEPH_OBJECT_LAYOUT_HASHINO  3
+
+/**
+ * pg layout -- how PGs are mapped into (sets of) OSDs
+ *
+ * Values of g_conf.osd_pg_layout; config.cc selects them with
+ * --osd_pg_layout crush|hash|linear|hybrid.
+ */
+#define CEPH_PG_LAYOUT_CRUSH  0   
+#define CEPH_PG_LAYOUT_HASH   1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+
+/**
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+       /* file -> object mapping */
+       __u32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple of page size. */
+       __u32 fl_stripe_count;    /* over this many objects */
+       __u32 fl_object_size;     /* until objects are this big, then move to new objects */
+       
+       /* pg -> disk layout */
+       __u32 fl_object_stripe_unit;   /* for per-object raid */
+
+       /* object -> pg layout */
+       __s32 fl_pg_preferred; /* preferred primary for pg */
+       __u8  fl_pg_type;      /* pg type; see PG_TYPE_* */
+       __u8  fl_pg_size;      /* pg size (num replicas, raid stripe width, etc. */
+};
+
+#define ceph_file_layout_stripe_width(l) (l.fl_stripe_unit * l.fl_stripe_count)
+
+/* period = bytes before i start on a new set of objects */
+#define ceph_file_layout_period(l) (l.fl_object_size * l.fl_stripe_count)
+
+
+
+/**
+ * placement group id
+ *
+ * The four fields of 'pg' add up to 8 bytes and are overlaid on pg64,
+ * so a pg id can also be handled as one 64-bit value.
+ */
+#define CEPH_PG_TYPE_REP   1
+#define CEPH_PG_TYPE_RAID4 2
+
+union ceph_pg {
+       __u64 pg64;
+       struct {
+               __s32 preferred; /* preferred primary osd */
+               __u16 ps;        /* placement seed */
+               __u8 type;       /* CEPH_PG_TYPE_* */
+               __u8 size;
+       } pg;
+};
+typedef union ceph_pg ceph_pg_t;
+
+/*
+ * Type tests on a ceph_pg_t value.  The macro parameter must not be
+ * called 'pg': the preprocessor would then also replace the member name
+ * and turn ceph_pg_is_rep(x) into x.x.type.
+ */
+#define ceph_pg_is_rep(p) ((p).pg.type == CEPH_PG_TYPE_REP)
+#define ceph_pg_is_raid4(p) ((p).pg.type == CEPH_PG_TYPE_RAID4)
+
+/**
+ * object layout
+ *
+ * describe how a given object should be stored: the placement group id
+ * plus a stripe unit.
+ */
+struct ceph_object_layout {
+       ceph_pg_t ol_pgid;
+       __u32 ol_stripe_unit;  /* presumably the per-object raid stripe unit (cf. fl_object_stripe_unit) -- confirm */
+};
+
+
+
+/**
+ * object extent
+ *
+ * A range within one object, together with that object's layout.
+ */
+struct ceph_object_extent {
+       ceph_object_t oe_oid;
+       __u64 oe_start;     /* presumably a byte offset within the object -- confirm */
+       __u64 oe_length;    /* presumably a length in bytes -- confirm */
+       struct ceph_object_layout oe_object_layout;
+       
+       /* buffer extent reverse mapping? */
+};
+
+
+
+
+
+/*********************************************
+ * message types
+ */
+
+/*
+ * entity_name
+ *
+ * Names an entity by a type and a number.
+ */
+struct ceph_entity_name {
+       __u32 type;  /* presumably one of CEPH_ENTITY_TYPE_* below -- confirm */
+       __u32 num;
+};
+
+#define CEPH_ENTITY_TYPE_MON    1
+#define CEPH_ENTITY_TYPE_MDS    2
+#define CEPH_ENTITY_TYPE_OSD    3
+#define CEPH_ENTITY_TYPE_CLIENT 4
+#define CEPH_ENTITY_TYPE_ADMIN  5
+
+
+/*
+ * entity_addr
+ * ipv4 only for now
+ */
+struct ceph_entity_addr {
+       __u64 nonce;
+       __u32 port;
+       __u8  ipq[4];  /* the four bytes of the IPv4 address; byte order not shown here -- confirm */
+};
+
+/*
+ * entity_inst: an entity name paired with an address.
+ */
+struct ceph_entity_inst {
+       struct ceph_entity_name name;
+       struct ceph_entity_addr addr;
+};
+
+
+/*
+ * message header
+ *
+ * Carries the message type, the source and destination entity
+ * instances and ports, and a chunk count.
+ */
+struct ceph_message_header {
+       __u32 type;
+       struct ceph_entity_inst src, dst;
+       __u32 source_port, dest_port;
+       __u32 nchunks;  /* presumably the number of payload chunks that follow -- confirm */
+};
+
+#endif
diff --git a/branches/sage/mds/include/ceph_inttypes.h b/branches/sage/mds/include/ceph_inttypes.h
deleted file mode 100644 (file)
index c31c76a..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __CEPH_INTTYPES_H
-#define __CEPH_INTTYPES_H
-
-typedef uint32_t __u32;
-typedef uint16_t __u16;
-typedef uint8_t __u8;
-
-#endif
index a1382626f7cae5cf754835f16c104eb0817be521..3b8ac05a86b387a755b84e6de74d76defb09dd40 100644 (file)
@@ -85,17 +85,6 @@ inline ostream& operator<<(ostream& out, const object_t o) {
 
 
 namespace __gnu_cxx {
-#ifndef __LP64__
-  template<> struct hash<uint64_t> {
-    size_t operator()(uint64_t __x) const { 
-      static hash<uint32_t> H;
-      return H((__x >> 32) ^ (__x & 0xffffffff)); 
-      //static rjhash<uint64_t> H;
-      //return H(__x);
-    }
-  };
-#endif
-
   template<> struct hash<object_t> {
     size_t operator()(const object_t &r) const { 
       static rjhash<uint64_t> H;
@@ -107,15 +96,4 @@ namespace __gnu_cxx {
   };
 
 }
-
-/*
-  template<> struct rjhash<object_t> {
-    size_t operator()(const object_t &r) const { 
-      static rjhash<uint64_t> H;
-      static rjhash<uint32_t> I;
-      return H(r.ino) ^ I(r.bno) ^ I(r.rev);
-    }
-  };
-*/
-
 #endif
index 92bcb94c6dc5f5b962c11578494511c71d5e8a23..cf8374d329a774759e7b1f9e12f2bae99014e16f 100644 (file)
@@ -36,6 +36,8 @@ using namespace std;
 #include <ext/hash_map>
 using namespace __gnu_cxx;
 
+#include "ceph_fs.h"
+
 
 #include "object.h"
 #include "utime.h"
@@ -68,6 +70,12 @@ namespace __gnu_cxx {
       return H((__x >> 32) ^ (__x & 0xffffffff)); 
     }
   };
+  template<> struct hash<uint64_t> {
+    size_t operator()(uint64_t __x) const { 
+      static hash<uint32_t> H;
+      return H((__x >> 32) ^ (__x & 0xffffffff)); 
+    }
+  };
 #endif
 
 }
@@ -105,76 +113,18 @@ typedef uint64_t version_t;
 typedef uint32_t epoch_t;       // map epoch  (32bits -> 13 epochs/second for 10 years)
 
 
-// object and pg layout
-// specified in g_conf.osd_*
 
 #define O_LAZY 01000000
 
 
-/** object layout
- * how objects are mapped into PGs
- */
-#define OBJECT_LAYOUT_HASH     1
-#define OBJECT_LAYOUT_LINEAR   2
-#define OBJECT_LAYOUT_HASHINO  3
-
-/** pg layout
- * how PGs are mapped into (sets of) OSDs
- */
-#define PG_LAYOUT_CRUSH  0   
-#define PG_LAYOUT_HASH   1
-#define PG_LAYOUT_LINEAR 2
-#define PG_LAYOUT_HYBRID 3
-
-
-
-// -----------------------
-// FileLayout
-
-/** FileLayout 
- * specifies a striping and replication strategy
- */
-
-//#define FILE_LAYOUT_CRUSH    0    // stripe via crush
-//#define FILE_LAYOUT_LINEAR   1    // stripe linearly across cluster
-
-struct FileLayout {
-  // -- file -> object mapping --
-  int32_t stripe_unit;     // stripe unit, in bytes
-  int32_t stripe_count;    // over this many objects
-  int32_t object_size;     // until objects are this big, then move to new objects
-
-  int stripe_width() { return stripe_unit * stripe_count; }
-
-  // period = bytes before i start on a new set of objects.
-  int period() { return object_size * stripe_count; }
-
-  // -- object -> pg layout --
-  char pg_type;        // pg type (replicated, raid, etc.) (see pg_t::TYPE_*)
-  char pg_size;        // pg size (num replicas, or raid4 stripe width)
-  int32_t  preferred;  // preferred primary osd?
-
-  // -- pg -> disk layout --
-  int32_t  object_stripe_unit;  // for per-object raid
-
-  FileLayout() { }
-  FileLayout(int su, int sc, int os, int pgt, int pgs, int o=-1) :
-    stripe_unit(su), stripe_count(sc), object_size(os), 
-    pg_type(pgt), pg_size(pgs), preferred(o),
-    object_stripe_unit(su)   // note: bad default, we pbly want su/(pgs-1)
-  {
-    assert(object_size % stripe_unit == 0);
-  }
-
-};
-
 
+typedef ceph_file_layout FileLayout;
 
 
 // --------------------------------------
 // inode
 
-typedef uint64_t _inodeno_t;
+typedef __uint64_t _inodeno_t;
 
 struct inodeno_t {
   _inodeno_t val;
@@ -228,14 +178,6 @@ namespace __gnu_cxx {
 
 inline int DT_TO_MODE(int dt) {
   return dt << 12;
-  /*
-  switch (dt) {
-  case DT_REG: return INODE_MODE_FILE;
-  case DT_DIR: return INODE_MODE_DIR;
-  case DT_LNK: return INODE_MODE_SYMLINK;
-  default: assert(0); return 0;
-  }
-  */
 }
 
 struct inode_t {
@@ -278,13 +220,6 @@ struct inode_t {
 
 inline unsigned char MODE_TO_DT(int mode) {
   return mode >> 12;
-  /*
-  if (S_ISREG(mode)) return inode_t::DT_REG;
-  if (S_ISLNK(mode)) return inode_t::DT_LNK;
-  if (S_ISDIR(mode)) return inode_t::DT_DIR;
-  assert(0);
-  return 0;
-  */
 }
 
 
diff --git a/branches/sage/mds/jobs/runjobsample b/branches/sage/mds/jobs/runjobsample
new file mode 100644 (file)
index 0000000..590be20
--- /dev/null
@@ -0,0 +1,26 @@
+#!/usr/bin/perl
+
+# hi there
+{
+    '_sleep' => 3,
+    
+    'nummds' => 1,
+    'numosd' => 16, #[8],#10,14,16],
+    'numclient' => 32,#,4,10,20,40], #[10*16],
+    '_n' => 32,
+        
+    '_start' => 15,
+    '_end' => 45,
+    '_kill_after' => 190,
+    
+    'osd_pg_bits' => [4, 6],
+    'osd_auto_weight' => [0,1],
+    'file_layout_pg_size' => [1,2],
+
+    '_custom' => '--syn createobjects 1000000 1048576 2',
+    
+    '_comb' => {
+       'x' => 'osd_pg_bits',
+       'vars' => [ 'osd.c_wrb' ]
+       }
+};
diff --git a/branches/sage/mds/kernel/bufferlist.h b/branches/sage/mds/kernel/bufferlist.h
new file mode 100644 (file)
index 0000000..78e4c6f
--- /dev/null
@@ -0,0 +1,74 @@
+#ifndef _FS_CEPH_BUFFERLIST_H
+#define _FS_CEPH_BUFFERLIST_H
+
+
+
+#define CEPH_BUFFERLIST_START_IOVLEN  8  /* embed some statically, for fast normal case */
+
+/*
+ * A growable array of iovecs describing a payload that may be scattered
+ * over several memory regions.
+ *
+ * b_iov points either at the embedded b_iov_array or at a separately
+ * kmalloc'd array; ceph_bufferlist_append_ref() switches from the former
+ * to the latter when it needs more slots.  That function treats
+ * b_iovmax == 0 as "not set up yet", so a new bufferlist is expected to
+ * start out zeroed.
+ */
+struct ceph_bufferlist {
+       struct iovec *b_iov;   /* data payload */
+       /* embedded storage used for the first CEPH_BUFFERLIST_START_IOVLEN entries */
+       struct iovec b_iov_array[CEPH_BUFFERLIST_START_IOVLEN];  
+       int b_iovlen;          /* used/defined elements in b_iov */         
+       int b_iovmax;          /* allocated size of b_iov array */
+       /* iov_len is tested by ceph_bufferlist_append_copy() as "bytes still free" */
+       struct iovec b_append; /* preallocated memory for appending data to this bufferlist */
+};
+
+/*
+ * A position inside a ceph_bufferlist, expressed as an iovec index plus a
+ * byte offset into that iovec.  Nothing in this header uses it yet.
+ */
+struct ceph_bufferlist_iterator {
+       int i_iov;  /* which iov */
+       int i_off;  /* offset in that iov */
+};
+
+/*
+ * Append a reference to the caller's memory [p, p+len) to the bufferlist.
+ * The bytes themselves are not copied.
+ *
+ *  - If the region begins exactly where the tail iovec ends, the tail
+ *    iovec is lengthened and no new slot is consumed.
+ *  - Otherwise a new iovec is added.  The first slot request points b_iov
+ *    at the embedded b_iov_array; once every slot is in use the array is
+ *    doubled into kmalloc'd memory.
+ *
+ * Assumes *bl started out zeroed (b_iovlen == b_iovmax == 0).
+ *
+ * static inline because the definition lives in a header; a plain external
+ * definition would be emitted once per including file.
+ *
+ * NOTE(review): the function returns void, so a failed allocation while
+ * growing the array cannot be reported and the append is dropped.  Consider
+ * returning an error code once there are callers to update.
+ */
+static inline void ceph_bufferlist_append_ref(struct ceph_bufferlist *bl, void *p, int len)
+{
+       struct iovec *tail;
+       struct iovec *tmpvec;
+       int newmax;
+
+       /* contiguous with the tail iovec?  then just lengthen it. */
+       if (bl->b_iovlen) {
+               tail = &bl->b_iov[bl->b_iovlen-1];
+               if (p == (char *)tail->iov_base + tail->iov_len) {
+                       tail->iov_len += len;
+                       return;
+               }
+       }
+
+       /* a new slot is needed; make room if every slot is in use */
+       if (bl->b_iovlen == bl->b_iovmax) {
+               if (bl->b_iovmax) {
+                       newmax = bl->b_iovmax * 2;
+                       /* kmalloc takes a size in bytes plus allocation flags */
+                       tmpvec = kmalloc(sizeof(struct iovec) * newmax, GFP_KERNEL);
+                       if (!tmpvec)
+                               return;  /* see NOTE(review) above */
+                       memcpy(tmpvec, bl->b_iov, sizeof(struct iovec) * bl->b_iovlen);
+                       memset(tmpvec + bl->b_iovlen, 0,
+                              sizeof(struct iovec) * (newmax - bl->b_iovlen));
+                       /* the embedded array is part of *bl and must never be freed */
+                       if (bl->b_iov != bl->b_iov_array)
+                               kfree(bl->b_iov);
+                       bl->b_iov = tmpvec;
+                       bl->b_iovmax = newmax;
+               } else {
+                       bl->b_iovmax = CEPH_BUFFERLIST_START_IOVLEN;
+                       bl->b_iov = bl->b_iov_array;
+               }
+       }
+
+       bl->b_iov[bl->b_iovlen].iov_base = p;
+       bl->b_iov[bl->b_iovlen].iov_len = len;
+       bl->b_iovlen++;
+}
+
+/*
+ * Append a copy of [p, p+len) to the bufferlist.
+ *
+ * Bytes are copied into the b_append scratch region and that region is
+ * then added by reference.  b_append.iov_base/iov_len always describe the
+ * still-unused part of the scratch region: after each copy the base is
+ * advanced and the length reduced.  When the region is exhausted a new one
+ * is allocated, sized to the remaining length rounded up to a whole page.
+ *
+ * static inline because the definition lives in a header.
+ *
+ * NOTE(review): the function returns void, so an allocation failure cannot
+ * be reported; copying stops and the remaining bytes are not appended.
+ */
+static inline void ceph_bufferlist_append_copy(struct ceph_bufferlist *bl, void *p, int len)
+{
+       char *src = p;
+       size_t want;
+       int s;
+
+       while (len > 0) {
+               /* allocate more space? */
+               if (!bl->b_append.iov_len) {
+                       want = (len + PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
+                       bl->b_append.iov_base = kmalloc(want, GFP_KERNEL);
+                       if (!bl->b_append.iov_base)
+                               return;  /* iov_len stays 0: nothing usable */
+                       bl->b_append.iov_len = want;
+               }
+
+               /* copy what we can */
+               s = len;
+               if ((size_t)s > bl->b_append.iov_len)
+                       s = bl->b_append.iov_len;
+               memcpy(bl->b_append.iov_base, src, s);
+
+               /* reference exactly the bytes just written */
+               ceph_bufferlist_append_ref(bl, bl->b_append.iov_base, s);
+
+               /* consume the scratch space and the input */
+               bl->b_append.iov_base = (char *)bl->b_append.iov_base + s;
+               bl->b_append.iov_len -= s;
+               src += s;
+               len -= s;
+       }
+}
+
+#endif
diff --git a/branches/sage/mds/kernel/ceph_fs.h b/branches/sage/mds/kernel/ceph_fs.h
deleted file mode 100644 (file)
index 5804b49..0000000
+++ /dev/null
@@ -1,79 +0,0 @@
-/* -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*- 
- * vim: ts=8 sw=8 smarttab
- */
-
-#ifndef _FS_CEPH_CEPH_H
-#define _FS_CEPH_CEPH_H
-
-/* #include <linux/ceph_fs.h> */
-
-#include "kmsg.h"
-
-#include "mdsmap.h"
-#include "monmap.h"
-
-/* do these later
-#include "osdmap.h"
-*/
-struct ceph_osdmap;
-
-
-
-/*
- * state associated with an individual MDS<->client session
- */
-struct ceph_mds_session {
-       __u64 s_push_seq;  
-       /* wait queue? */
-};
-
-struct ceph_mds_request {
-
-};
-
-/*
- * CEPH file system in-core superblock info
- */
-struct ceph_sb_info {
-       __u32  s_whoami;               /* client number */
-       struct ceph_kmsg   *s_kmsg;    /* messenger instance */
-
-       struct ceph_monmap *s_monmap;  /* monitor map */
-       struct ceph_mdsmap *s_mdsmap;  /* mds map */
-       struct ceph_osdmap *s_osdmap;  /* osd map */
-
-       /* mds sessions */
-       struct ceph_mds_session **s_mds_sessions;     /* sparse array; elements NULL if no session */
-       int                      s_max_mds_sessions;  /* size of s_mds_sessions array */
-       
-       
-
-       /* current requests */
-       /* ... */
-       __u64 last_tid;
-};
-
-/*
- * CEPH file system in-core inode info
- */
-struct ceph_inode_info {
-       unsigned long val;  /* inode from types.h is uint64_t */
-       struct inode vfs_inode;
-};
-
-static inline struct ceph_inode_info *CEPH_I(struct inode *inode)
-{
-       return list_entry(inode, struct ceph_inode_info, vfs_inode);
-}
-
-
-/* file.c */
-extern const struct inode_operations ceph_file_inops;
-extern const struct file_operations ceph_file_operations;
-extern const struct address_space_operations ceph_aops;
-
-/* dir.c */
-extern const struct inode_operations ceph_dir_inops;
-extern const struct file_operations ceph_dir_operations;
-
-#endif /* _FS_CEPH_CEPH_H */
index 99fdaf84de4c1617d19dcfddea3fb40597d74a0d..f21fa5838693559a63edfa6c3fccd7e8f71c944f 100644 (file)
@@ -1,7 +1,3 @@
-/* -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*- 
- * vim: ts=8 sw=8 smarttab
- */
-
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/smp_lock.h>
diff --git a/branches/sage/mds/kernel/kmsg.h b/branches/sage/mds/kernel/kmsg.h
new file mode 100644 (file)
index 0000000..cc44b9f
--- /dev/null
@@ -0,0 +1,51 @@
+#ifndef __FS_CEPH_KMSG_H
+#define __FS_CEPH_KMSG_H
+
+#include <linux/uio.h>
+#include <linux/radix-tree.h>
+#include <linux/ceph_fs.h>
+#include "ceph_kthread.h"
+
+
+/*
+ * NOTE(review): this is a (tentative) definition, not a declaration, so
+ * every file that includes this header defines msg_threadpool.  It probably
+ * wants to be 'extern' here with one definition in a .c file -- confirm
+ * where the definition should live before changing it.
+ */
+struct ceph_kthreadpool *msg_threadpool;       /* thread pool */
+
+/*
+ * Messenger instance state.
+ */
+struct ceph_kmsgr {
+       /* opaque owner pointer; presumably what gets passed to the dispatch
+        * function as its first argument -- TODO confirm */
+       void *m_parent;
+       struct radix_tree_root mpipes;          /* other nodes talk to */
+       struct client_thread_info cthread;      /* listener thread info */
+};
+
+/*
+ * A message: a pointer to its header plus a kvec array for the payload.
+ * m_list_head lets the message be linked into a list (struct ceph_kmsg_pipe
+ * in this header keeps list heads for queued and sent messages).
+ */
+struct ceph_message {
+       struct ceph_message_header *msghdr;     /* header */
+       struct kvec *m_iov;                     /* data storage */
+       /* NOTE(review): the original author was unsure whether this
+        * duplicates kvec.iov_len; it is not clear from here whether it is an
+        * entry count for m_iov or a byte length -- confirm before use. */
+       size_t m_iovlen;        /* is this kvec.iov_len why need it in kvec? */
+       struct list_head m_list_head;
+};
+
+/*
+ * Per-connection state: the socket, send/receive sequence numbers, the
+ * outgoing queues and the buffer for a message that is only partly read.
+ */
+struct ceph_kmsg_pipe {
+       int p_sd;         /* socket descriptor */
+       __u64 p_out_seq;  /* last message sent */
+       __u64 p_in_seq;   /* last message received */
+
+       /* out queue */
+       struct list_head p_out_queue;
+       struct ceph_message *p_out_partial;  /* partially sent message */
+       int p_out_partial_pos;  /* presumably how far into p_out_partial we got -- TODO confirm units */
+       struct list_head p_out_sent;  /* sent but unacked; may need resend if connection drops */
+
+       /* partially read message contents */
+       struct kvec *p_in_partial_iov;   /* hrm, this probably isn't what we want */
+       size_t p_in_partial_iovlen;
+       /* NOTE(review): "parital" is a typo for "partial" in the field name;
+        * renaming it would also touch every user of the field. */
+       size_t p_in_parital_iovmax;  /* size of currently allocated m_iov array */
+       /* .. or something like that? .. */
+
+};
+
+/* 
+ * function prototypes
+ *
+ * Declarations only; the definitions are not part of this header.
+ * ceph_client_dispatch() takes an opaque fs_client pointer in addition to
+ * the message; the other three operate on a message alone.
+ */
+extern void ceph_read_message(struct ceph_message *message);
+extern void ceph_write_message(struct ceph_message *message);
+extern void ceph_client_dispatch(void *fs_client, struct ceph_message *message );
+extern void queue_message(struct ceph_message *message);
+#endif
diff --git a/branches/sage/mds/kernel/kmsgbits.h b/branches/sage/mds/kernel/kmsgbits.h
new file mode 100644 (file)
index 0000000..730ff7f
--- /dev/null
@@ -0,0 +1,50 @@
+
+
+
+/*
+ * A message: an embedded header plus an iovec array for the payload.
+ *
+ * NOTE(review): this file has no include guard, and kmsg.h declares a
+ * struct with the same tag but a different layout (header by pointer, kvec
+ * payload).  The two cannot be included together -- confirm which one is
+ * meant to survive.
+ */
+struct ceph_message {
+       struct ceph_message_header m_hdr; /* header */
+       struct iovec *m_iov;  /* payload */
+       int m_iovlen;         /* presumably the number of entries in m_iov -- TODO confirm */
+       struct list_head m_list_head;    /* i'll sit in a queue */
+};
+
+
+
+/*
+ * dispatch method type
+ *
+ * h is an opaque handle; struct ceph_kmsg stores one such function in
+ * m_dispatch and describes its m_parent field as "passed to dispatch
+ * method", so h is presumably that m_parent.
+ */
+typedef void (*ceph_kmsg_dispatch_t)(void *h, struct ceph_message *message);
+
+/*
+ * Messenger instance: where incoming messages are delivered, the thread
+ * pool, and the set of pipes to the peers this instance talks to.
+ */
+struct ceph_kmsg {
+       ceph_kmsg_dispatch_t m_dispatch; /* where incoming messages go */
+       void *m_parent;                  /* passed to dispatch method */
+       
+       struct ceph_kmsg_threadpool *m_threadpool;  /* pool of threads */
+       /* possibly shared among multiple kmsg instances? */
+
+       /* other nodes i talk to */
+       struct radix_tree_root m_pipes;   /* key: dest addr, value: ceph_kmsg_pipe */
+       
+       /* ... */
+};
+
+
+/*
+ * Per-connection state: the socket, send/receive sequence numbers, the
+ * outgoing queues and the buffer for a message that is only partly read.
+ * Instances are the values stored in ceph_kmsg.m_pipes.
+ */
+struct ceph_kmsg_pipe {
+       int p_sd;         /* socket descriptor */
+       __u64 p_out_seq;  /* last message sent */
+       __u64 p_in_seq;   /* last message received */
+
+       /* out queue */
+       struct list_head p_out_queue;
+       struct ceph_message *p_out_partial;  /* partially sent message */
+       int p_out_partial_pos;  /* presumably how far into p_out_partial we got -- TODO confirm units */
+       struct list_head p_out_sent;  /* sent but unacked; may need resend if connection drops */
+
+       /* partially read message contents */
+       struct iovec *p_in_partial_iov;   /* hrm, this probably isn't what we want */
+       int p_in_partial_iovlen;
+       /* NOTE(review): "parital" is a typo for "partial" in the field name;
+        * renaming it would also touch every user of the field. */
+       int p_in_parital_iovmax;  /* size of currently allocated m_iov array */
+       /* .. or something like that? .. */
+
+};
+
+
+
diff --git a/branches/sage/mds/kernel/mds_client.h b/branches/sage/mds/kernel/mds_client.h
new file mode 100644 (file)
index 0000000..764d7cc
--- /dev/null
@@ -0,0 +1,42 @@
+#ifndef _FS_CEPH_MDS_CLIENT_H
+#define _FS_CEPH_MDS_CLIENT_H
+
+#include <linux/radix-tree.h>
+#include "kmsg.h"
+
+/*
+ * state associated with an individual MDS<->client session
+ *
+ * Only a sequence number so far; the "wait queue?" note marks state that
+ * has been considered but not added.
+ */
+struct ceph_mds_session {
+       __u64 s_push_seq;  
+       /* wait queue? */
+};
+
+/*
+ * One request to the MDS cluster.  struct ceph_mds_client keeps its
+ * in-flight requests in the s_mds_requests radix tree.
+ */
+struct ceph_mds_request {
+       __u64 r_tid;                /* request id; presumably drawn from s_last_mds_tid -- TODO confirm */
+       struct ceph_message *r_msg; /* the message for this request */
+       __u8  r_idempotent;
+       
+       /* r_mds has room for 4 entries; r_num_mds says how many are in use */
+       __u32 r_mds[4];        /* set of mds's with whom request may be outstanding */
+       __u32 r_num_mds;       /* items in r_mds */
+       
+       __u32 r_num_fwd;       /* number of forward attempts */
+        __s32 r_resend_mds;    /* mds to resend to next, if any*/
+       
+       /* waiter/callback? */  
+};
+
+
+/*
+ * MDS-facing client state: the current mds map, the per-MDS sessions and
+ * the requests that are in flight.
+ */
+struct ceph_mds_client {
+       struct ceph_mdsmap *s_mdsmap;  /* mds map */
+
+       /* mds sessions */
+       struct ceph_mds_session **s_mds_sessions;     /* sparse array; elements NULL if no session */
+       int s_max_mds_sessions;            /* size of s_mds_sessions array */
+
+       __u64 s_last_mds_tid;              /* id of last mds request */
+       /* presumably keyed by ceph_mds_request.r_tid -- TODO confirm */
+       struct radix_tree_root s_mds_requests;  /* in-flight mds requests */
+
+};
+
+#endif
index 4b3cb8460a9a1e6a4dca1ebec2a179063eead4bb..c5a970992c36ca8e6d8a8ee8a6588b1f61240411 100644 (file)
@@ -1,7 +1,3 @@
-/* -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*- 
- * vim: ts=8 sw=8 smarttab
- */
-
 #ifndef _FS_CEPH_MDSMAP_H
 #define _FS_CEPH_MDSMAP_H
 
index 9f7e535264a8e598b56c6c2e0019029ab0defbe2..2f60c8a0c343666d853891936e792c6955e1d962 100644 (file)
@@ -1,10 +1,8 @@
-/* -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*- 
- * vim: ts=8 sw=8 smarttab
- */
-
 #ifndef _FS_CEPH_MONMAP_H
 #define _FS_CEPH_MONMAP_H
 
+#include <linux/uio.h>
+
 /*
  * monitor map
  */
@@ -15,7 +13,7 @@ struct ceph_monmap {
   struct ceph_entity_inst m_mon_inst;
 };
 
-extern int ceph_monmap_pick_mon(ceph_monmap *m);
-extern int ceph_monmap_decode(ceph_monmap *m, iovec *v);
+extern int ceph_monmap_pick_mon(struct ceph_monmap *m);
+extern int ceph_monmap_decode(struct ceph_monmap *m, struct kvec *v);
 
 #endif
diff --git a/branches/sage/mds/kernel/osd_client.h b/branches/sage/mds/kernel/osd_client.h
new file mode 100644 (file)
index 0000000..6efa3b8
--- /dev/null
@@ -0,0 +1,18 @@
+#ifndef _FS_CEPH_OSD_CLIENT_H
+#define _FS_CEPH_OSD_CLIENT_H
+
+/* this will be equivalent to osdc/Objecter.h */
+
+
+/* do these later
+#include "osdmap.h"
+*/
+struct ceph_osdmap;
+
+
+/*
+ * OSD-facing client state.  Only holds the osd map for now; struct
+ * ceph_osdmap is forward-declared above because osdmap.h is not included yet.
+ */
+struct ceph_osd_client {
+       struct ceph_osdmap *s_osdmap;  /* osd map */
+
+};
+
+#endif
diff --git a/branches/sage/mds/kernel/super.h b/branches/sage/mds/kernel/super.h
new file mode 100644 (file)
index 0000000..9441851
--- /dev/null
@@ -0,0 +1,75 @@
+#ifndef _FS_CEPH_CEPH_H
+#define _FS_CEPH_CEPH_H
+
+/* #include <linux/ceph_fs.h> */
+
+#include "kmsg.h"
+#include "monmap.h"
+#include "mds_client.h"
+#include "osd_client.h"
+
+
+
+/* 
+ * CEPH per-filesystem client state
+ * 
+ * possibly shared by multiple mount points, if they are 
+ * mounting the same ceph filesystem/cluster.
+ *
+ * s_ref counts the ceph_sb_info structures that point here.
+ *
+ * NOTE(review): s_kmsg is a 'struct ceph_kmsg *', but kmsg.h (included
+ * above) declares 'struct ceph_kmsgr'; 'struct ceph_kmsg' is declared in
+ * kmsgbits.h.  As a pointer to an incomplete type this still compiles --
+ * confirm which messenger struct is intended.
+ */
+struct ceph_fs_client {
+       __u64 s_fsid;  /* hmm this should be part of the monmap? */
+
+       __u32 s_whoami;                /* my client number */
+       struct ceph_kmsg   *s_kmsg;    /* messenger instance */
+
+       struct ceph_monmap *s_monmap;  /* monitor map */
+
+       struct ceph_mds_client *s_mds_client;  /* MDS-side state, see mds_client.h */
+       struct ceph_osd_client *s_osd_client;  /* OSD-side state, see osd_client.h */
+
+       int s_ref;    /* reference count (for each sb_info that points to me) */
+};
+
+/*
+ * directory of filesystems mounted by this host
+ *
+ *   key: fsid?  ipquad of monitor?  hmm!
+ * value: struct ceph_fs_client*
+ *
+ * Declared as a struct radix_tree_root, the tree root type provided by
+ * <linux/radix-tree.h>; there is no 'struct radix_tree' to define it with.
+ */
+extern struct radix_tree_root ceph_fs_clients; 
+
+
+/*
+ * CEPH per-mount superblock info
+ *
+ * Several mounts may point at the same ceph_fs_client; that struct's s_ref
+ * field is described as counting the sb_infos that point to it.
+ */
+struct ceph_sb_info {
+       struct ceph_fs_client *sb_client;  /* the filesystem client state this mount uses */
+       
+       /* FIXME: add my relative offset into the filesystem,
+          so we can appropriately mangle/adjust path names in requests, etc. */
+};
+
+/*
+ * CEPH file system in-core inode info
+ *
+ * The VFS inode is embedded (not pointed to), so CEPH_I() below can get
+ * from a struct inode back to the enclosing ceph_inode_info.
+ */
+struct ceph_inode_info {
+       struct ceph_file_layout i_layout;  /* file layout for this inode */
+       struct inode vfs_inode;            /* embedded VFS inode */
+};
+
+/*
+ * Map a VFS inode back to the ceph_inode_info that embeds it.
+ *
+ * inode must be the vfs_inode member of a struct ceph_inode_info; the
+ * result is computed by pointer arithmetic only and is not checked.
+ *
+ * Uses container_of() directly: vfs_inode is not a list node, and
+ * list_entry() is merely a list-flavoured name for the same macro.
+ */
+static inline struct ceph_inode_info *CEPH_I(struct inode *inode)
+{
+       return container_of(inode, struct ceph_inode_info, vfs_inode);
+}
+
+
+/* file.c: operation tables for regular files (declared here, defined there) */
+extern const struct inode_operations ceph_file_inops;
+extern const struct file_operations ceph_file_operations;
+extern const struct address_space_operations ceph_aops;
+
+/* dir.c: operation tables for directories (declared here, defined there) */
+extern const struct inode_operations ceph_dir_inops;
+extern const struct file_operations ceph_dir_operations;
+
+#endif /* _FS_CEPH_CEPH_H */
index 5170f3fe9b3eb602404ac62c23f7574cdadb9f56..1d781b9ba48c3314c788c88101da153a0f8623ae 100644 (file)
@@ -61,7 +61,7 @@ void ClientMap::load(Context *onload)
   
   C_CM_Load *c = new C_CM_Load(this);
   mds->filer->read(inode,
-                   0, inode.layout.stripe_unit,
+                   0, inode.layout.fl_stripe_unit,
                    &c->bl,
                    c);
 
index 3a490c48c263d2b7db7b00c0bc720ebd50f8d835..36a36ea9eb037a2ee72740321b0a7c8a71e8ea12 100644 (file)
@@ -174,7 +174,7 @@ void IdAllocator::load(Context *onfinish)
 
   C_ID_Load *c = new C_ID_Load(this, onfinish);
   mds->filer->read(inode,
-                   0, inode.layout.stripe_unit,
+                   0, inode.layout.fl_stripe_unit,
                    &c->bl,
                    c);
 }
index fc7cdffbe6e10e5c9df48ec976259028cefcd1ee..eeea99c7217512fd002785a793b812fe833180ac 100644 (file)
@@ -82,7 +82,7 @@ void MDLog::init_journaler()
   log_inode.layout = g_OSD_MDLogLayout;
   
   if (g_conf.mds_local_osd) 
-    log_inode.layout.preferred = mds->get_nodeid() + g_conf.mds_local_osd_offset;  // hack
+    log_inode.layout.fl_pg_preferred = mds->get_nodeid() + g_conf.mds_local_osd_offset;  // hack
   
   // log streamer
   if (journaler) delete journaler;
@@ -191,8 +191,8 @@ void MDLog::submit_entry( LogEvent *le, Context *c )
   off_t last_seg = get_last_segment_offset();
   if (!segments.empty() && 
       !writing_subtree_map &&
-      (journaler->get_write_pos() / log_inode.layout.period()) != (last_seg / log_inode.layout.period()) &&
-      (journaler->get_write_pos() - last_seg > log_inode.layout.period()/2)) {
+      (journaler->get_write_pos() / ceph_file_layout_period(log_inode.layout) != (last_seg / ceph_file_layout_period(log_inode.layout)) &&
+       (journaler->get_write_pos() - last_seg > ceph_file_layout_period(log_inode.layout)/2))) {
     dout(10) << "submit_entry also starting new segment: last = " << last_seg
             << ", cur pos = " << journaler->get_write_pos() << dendl;
     start_new_segment();
index a0de9a24ddab7be7381ca39c41c6e1f673ba6c9b..9f0175e7a7d1e2f7a32547f5212fbda43f19edb8 100644 (file)
@@ -174,11 +174,11 @@ public:
 
  public:
   Message() { 
-    env.source_port = env.dest_port = -1;
+    env.source_port = env.dest_port = 0;
     env.nchunks = 0;
   };
   Message(int t) {
-    env.source_port = env.dest_port = -1;
+    env.source_port = env.dest_port = 0;
     env.nchunks = 0;
     env.type = t;
   }
diff --git a/branches/sage/mds/msg/ceph_msg_types.h b/branches/sage/mds/msg/ceph_msg_types.h
deleted file mode 100644 (file)
index 559c972..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-/* -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*- 
- * vim: ts=8 sw=8 smarttab
- */
-#ifndef __CEPH_MSG_TYPES_H
-#define __CEPH_MSG_TYPES_H
-
-/*
- * entity_name
- */
-struct ceph_entity_name {
-       __u32 type;
-       __u32 num;
-};
-
-#define CEPH_ENTITY_TYPE_MON    1
-#define CEPH_ENTITY_TYPE_MDS    2
-#define CEPH_ENTITY_TYPE_OSD    3
-#define CEPH_ENTITY_TYPE_CLIENT 4
-#define CEPH_ENTITY_TYPE_ADMIN  5
-
-
-/*
- * entity_addr
- * ipv4 only for now
- */
-struct ceph_entity_addr {
-       __u8  ipq[4];
-       __u32 port;
-       __u32 nonce;
-};
-
-
-struct ceph_entity_inst {
-       struct ceph_entity_name name;
-       struct ceph_entity_addr addr;
-};
-
-
-/*
- * message header
- */
-struct ceph_message_header {
-       __u32 type;
-       struct ceph_entity_inst src, dst;
-       __u32 source_port, dest_port;
-       __u32 nchunks;
-};
-
-#endif
index 652525729cdfcedfd9901d789ab3571a907c9526..52b1e69c8886c865f3cddc1be5a952f68fad7f92 100644 (file)
 #ifndef __MSG_TYPES_H
 #define __MSG_TYPES_H
 
-// raw C structs
-#include "include/ceph_inttypes.h"
-#include "ceph_msg_types.h"
-
 #include "include/types.h"
 #include "include/blobhash.h"
 #include "tcp.h"
@@ -97,11 +93,7 @@ namespace __gnu_cxx {
   };
 }
 
-// get rid of these
-//#define MSG_ADDR_MDS(x)     entity_name_t(entity_name_t::TYPE_MDS,x)
-//#define MSG_ADDR_OSD(x)     entity_name_t(entity_name_t::TYPE_OSD,x)
-//#define MSG_ADDR_MON(x)     entity_name_t(entity_name_t::TYPE_MON,x)
-//#define MSG_ADDR_CLIENT(x)  entity_name_t(entity_name_t::TYPE_CLIENT,x)
+
 
 /*
  * an entity's network address.
@@ -111,11 +103,9 @@ namespace __gnu_cxx {
  */
 struct entity_addr_t {
   struct ceph_entity_addr v;
-  uint32_t _pad;
 
-  entity_addr_t() : _pad(0) { 
-    v.port = v.nonce = 0; 
-    v.ipq[0] = v.ipq[1] = v.ipq[2] = v.ipq[3] = 0;
+  entity_addr_t() { 
+    memset(&v, 0, sizeof(v));
   }
 
   void set_addr(tcpaddr_t a) {
index b50f725687d2385c5777520c77fe69fdc3c65d0b..fda57d73ef99e9a16935c9f5b08b992465582573 100644 (file)
@@ -316,7 +316,7 @@ private:
 
   // oid -> pg
   ObjectLayout file_to_object_layout(object_t oid, FileLayout& layout) {
-    return make_object_layout(oid, layout.pg_type, layout.pg_size, layout.preferred, layout.object_stripe_unit);
+    return make_object_layout(oid, layout.fl_pg_type, layout.fl_pg_size, layout.fl_pg_preferred, layout.fl_object_stripe_unit);
   }
 
   ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) {
@@ -328,16 +328,16 @@ private:
     // calculate ps (placement seed)
     ps_t ps;
     switch (g_conf.osd_object_layout) {
-    case OBJECT_LAYOUT_LINEAR:
+    case CEPH_OBJECT_LAYOUT_LINEAR:
       ps = stable_mod(oid.bno + oid.ino, num, num_mask);
       break;
       
-    case OBJECT_LAYOUT_HASHINO:
+    case CEPH_OBJECT_LAYOUT_HASHINO:
       //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.ino>>32), num, num_mask);
       ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask);
       break;
 
-    case OBJECT_LAYOUT_HASH:
+    case CEPH_OBJECT_LAYOUT_HASH:
       //ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask);
       //ps = stable_mod(H(oid.bno) + H(oid.ino)^H(oid.ino>>32), num, num_mask);
       //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.bno+oid.ino>>32), num, num_mask);
@@ -361,7 +361,7 @@ private:
                  vector<int>& osds) {       // list of osd addr's
     // map to osds[]
     switch (g_conf.osd_pg_layout) {
-    case PG_LAYOUT_CRUSH:
+    case CEPH_PG_LAYOUT_CRUSH:
       {
        // what crush rule?
        int rule;
@@ -382,12 +382,12 @@ private:
       }
       break;
       
-    case PG_LAYOUT_LINEAR:
+    case CEPH_PG_LAYOUT_LINEAR:
       for (int i=0; i<pg.size(); i++) 
        osds.push_back( (i + pg.ps()*pg.size()) % g_conf.num_osd );
       break;
       
-    case PG_LAYOUT_HYBRID:
+    case CEPH_PG_LAYOUT_HYBRID:
       {
        static crush::Hash H(777);
        int h = H(pg.ps());
@@ -396,7 +396,7 @@ private:
       }
       break;
       
-    case PG_LAYOUT_HASH:
+    case CEPH_PG_LAYOUT_HASH:
       {
        static crush::Hash H(777);
        for (int i=0; i<pg.size(); i++) {
@@ -420,7 +420,7 @@ private:
   
     // no crush, but forcefeeding?
     if (pg.preferred() >= 0 &&
-       g_conf.osd_pg_layout != PG_LAYOUT_CRUSH) {
+       g_conf.osd_pg_layout != CEPH_PG_LAYOUT_CRUSH) {
       int osd = pg.preferred();
       
       // already in there?
index 21de9e8f00b0d743217a577cbcdd58eace6d68bc..7c68ecec2b6e45f25b887e2c9c1c461647c5f314 100644 (file)
@@ -214,7 +214,6 @@ public:
       int        op;   // write, zero, trunc, remove
       object_t   oid;
       eversion_t version;
-      objectrev_t rev;
       
       osdreqid_t reqid;  // caller+tid to uniquely identify request
       
index 08292252934ec55793ba04940f48847a226f5fe2..24dd9eca74234d3d8ec6a2de128251c0022b3a72 100644 (file)
@@ -82,41 +82,33 @@ typedef uint8_t pruleset_t;
 // placement group id
 struct pg_t {
 public:
-  static const int TYPE_REP   = 1;
-  static const int TYPE_RAID4 = 2;
+  static const int TYPE_REP   = CEPH_PG_TYPE_REP;
+  static const int TYPE_RAID4 = CEPH_PG_TYPE_RAID4;
 
 private:
-  union {
-    struct {
-      int32_t preferred;
-      uint8_t type;
-      uint8_t size;
-      uint16_t ps;
-    } fields;
-    uint64_t val;          // 64
-  } u;
+  union ceph_pg u;
 
 public:
-  pg_t() { u.val = 0; }
-  pg_t(const pg_t& o) { u.val = o.u.val; }
+  pg_t() { u.pg64 = 0; }
+  pg_t(const pg_t& o) { u.pg64 = o.u.pg64; }
   pg_t(int type, int size, ps_t seed, int pref) {//, pruleset_t r=0) {
-    u.fields.type = type;
-    u.fields.size = size;
-    u.fields.ps = seed;
-    u.fields.preferred = pref;   // hack: avoid negative.
-    //u.fields.ruleset = r;
-    assert(sizeof(u.fields) == sizeof(u.val));
+    u.pg.type = type;
+    u.pg.size = size;
+    u.pg.ps = seed;
+    u.pg.preferred = pref;   // hack: avoid negative.
+    //u.pg.ruleset = r;
+    assert(sizeof(u.pg) == sizeof(u.pg64));
   }
-  pg_t(uint64_t v) { u.val = v; }
+  pg_t(uint64_t v) { u.pg64 = v; }
 
-  int type()      { return u.fields.type; }
+  int type()      { return u.pg.type; }
   bool is_rep()   { return type() == TYPE_REP; }
   bool is_raid4() { return type() == TYPE_RAID4; }
 
-  int size() { return u.fields.size; }
-  ps_t ps() { return u.fields.ps; }
-  //pruleset_t ruleset() { return u.fields.ruleset; }
-  int preferred() { return u.fields.preferred; }   // hack: avoid negative.
+  int size() { return u.pg.size; }
+  ps_t ps() { return u.pg.ps; }
+  //pruleset_t ruleset() { return u.pg.ruleset; }
+  int preferred() { return u.pg.preferred; }   // hack: avoid negative.
   
   /*
   pg_t operator=(uint64_t v) { u.val = v; return *this; }
@@ -125,9 +117,9 @@ public:
   pg_t operator-=(pg_t o) { u.val -= o.val; return *this; }
   pg_t operator++() { ++u.val; return *this; }
   */
-  operator uint64_t() const { return u.val; }
+  operator uint64_t() const { return u.pg64; }
 
-  object_t to_object() const { return object_t(PG_INO, u.val >> 32, u.val & 0xffffffff); }
+  object_t to_object() const { return object_t(PG_INO, u.pg64 >> 32, u.pg64 & 0xffffffff); }
 };
 
 inline ostream& operator<<(ostream& out, pg_t pg) 
@@ -282,14 +274,12 @@ class ObjectExtent {
   off_t       start;     // in object
   size_t      length;    // in object
 
-  objectrev_t rev;       // which revision?
-
   ObjectLayout layout;   // object layout (pgid, etc.)
 
   map<size_t, size_t>  buffer_extents;  // off -> len.  extents in buffer being mapped (may be fragmented bc of striping!)
   
-  ObjectExtent() : start(0), length(0), rev(0) {}
-  ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l), rev(0) { }
+  ObjectExtent() : start(0), length(0) {}
+  ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l) { }
 };
 
 inline ostream& operator<<(ostream& out, ObjectExtent &ex)
index 5d13174d559525766a8bc2a2e9241faf31cc90fa..193089d3915b1b28af7834eef23281b9cb17e5e8 100644 (file)
@@ -57,7 +57,7 @@ int Filer::probe_fwd(inode_t& inode,
   Probe *probe = new Probe(inode, start_from, end, onfinish);
 
   // period (bytes before we jump unto a new set of object(s))
-  off_t period = inode.layout.period();
+  off_t period = ceph_file_layout_period(inode.layout);
 
   // start with 1+ periods.
   probe->probing_len = period;
@@ -132,7 +132,7 @@ void Filer::_probed(Probe *probe, object_t oid, off_t size)
   if (end == 0) {
     // keep probing!
     dout(10) << "_probed didn't find end, probing further" << dendl;
-    off_t period = probe->inode.layout.object_size * probe->inode.layout.stripe_count;
+    off_t period = probe->inode.layout.fl_object_size * probe->inode.layout.fl_stripe_count;
     probe->from += probe->probing_len;
     probe->probing_len = period;
     _probe(probe);
@@ -170,36 +170,35 @@ void Filer::file_to_extents(inode_t inode,
    */
   map< object_t, ObjectExtent > object_extents;
   
-  assert(inode.layout.object_size >= inode.layout.stripe_unit);
-  off_t stripes_per_object = inode.layout.object_size / inode.layout.stripe_unit;
+  assert(inode.layout.fl_object_size >= inode.layout.fl_stripe_unit);
+  off_t stripes_per_object = inode.layout.fl_object_size / inode.layout.fl_stripe_unit;
   dout(20) << " stripes_per_object " << stripes_per_object << dendl;
 
   off_t cur = offset;
   off_t left = len;
   while (left > 0) {
     // layout into objects
-    off_t blockno = cur / inode.layout.stripe_unit;          // which block
-    off_t stripeno = blockno / inode.layout.stripe_count;    // which horizontal stripe        (Y)
-    off_t stripepos = blockno % inode.layout.stripe_count;   // which object in the object set (X)
+    off_t blockno = cur / inode.layout.fl_stripe_unit;          // which block
+    off_t stripeno = blockno / inode.layout.fl_stripe_count;    // which horizontal stripe        (Y)
+    off_t stripepos = blockno % inode.layout.fl_stripe_count;   // which object in the object set (X)
     off_t objectsetno = stripeno / stripes_per_object;       // which object set
-    off_t objectno = objectsetno * inode.layout.stripe_count + stripepos;  // object id
+    off_t objectno = objectsetno * inode.layout.fl_stripe_count + stripepos;  // object id
     
     // find oid, extent
     ObjectExtent *ex = 0;
-    object_t oid( inode.ino, objectno );
+    object_t oid( inode.ino, objectno, rev );
     if (object_extents.count(oid)) 
       ex = &object_extents[oid];
     else {
       ex = &object_extents[oid];
       ex->oid = oid;
-      ex->rev = rev;
       ex->layout = objecter->osdmap->file_to_object_layout( oid, inode.layout );
     }
     
     // map range into object
-    off_t block_start = (stripeno % stripes_per_object)*inode.layout.stripe_unit;
-    off_t block_off = cur % inode.layout.stripe_unit;
-    off_t max = inode.layout.stripe_unit - block_off;
+    off_t block_start = (stripeno % stripes_per_object)*inode.layout.fl_stripe_unit;
+    off_t block_off = cur % inode.layout.fl_stripe_unit;
+    off_t max = inode.layout.fl_stripe_unit - block_off;
     
     off_t x_offset = block_start + block_off;
     off_t x_len;
index c2719549e22473d742680d0a7d51c8bd26bccead..363b7c60de9aa79874707c10e0f29f4d6385c996 100644 (file)
@@ -31,7 +31,7 @@ void Journaler::reset()
   state = STATE_ACTIVE;
   write_pos = flush_pos = ack_pos =
     read_pos = requested_pos = received_pos =
-    expire_pos = trimming_pos = trimmed_pos = inode.layout.period();
+    expire_pos = trimming_pos = trimmed_pos = ceph_file_layout_period(inode.layout);
 }
 
 
@@ -239,7 +239,7 @@ off_t Journaler::append_entry(bufferlist& bl, Context *onsync)
 
   if (!g_conf.journaler_allow_split_entries) {
     // will we span a stripe boundary?
-    int p = inode.layout.stripe_unit;
+    int p = inode.layout.fl_stripe_unit;
     if (write_pos / p != (write_pos + (off_t)(bl.length() + sizeof(s))) / p) {
       // yes.
       // move write_pos forward.
@@ -613,7 +613,7 @@ public:
 void Journaler::trim()
 {
   off_t trim_to = last_committed.expire_pos;
-  trim_to -= trim_to % inode.layout.period();
+  trim_to -= trim_to % ceph_file_layout_period(inode.layout);
   dout(10) << "trim last_commited head was " << last_committed
           << ", can trim to " << trim_to
           << dendl;
index 6463d9caf0e6fa0d74da1ded7f036c4552c102ee..a90ec5f9e348ff519692fc8902a5df3c7efe025e 100644 (file)
@@ -183,7 +183,7 @@ public:
     // prefetch intelligently.
     // (watch out, this is big if you use big objects or weird striping)
     if (!fetch_len)
-      fetch_len = inode.layout.object_size*inode.layout.stripe_count *
+      fetch_len = inode.layout.fl_object_size*inode.layout.fl_stripe_count *
        g_conf.journaler_prefetch_periods;
     if (!prefetch_from)
       prefetch_from = fetch_len / 2;
index e6efee1aa4a33d537713405e84ef169be9e37667..84563b0af97205f441295ae537ae76b5928b06dc 100644 (file)
@@ -320,13 +320,11 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
 
 // stat -----------------------------------
 
-tid_t Objecter::stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish,
-                                        objectrev_t rev)
+tid_t Objecter::stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish)
 {
   OSDStat *st = new OSDStat(size);
   st->extents.push_back(ObjectExtent(oid, 0, 0));
   st->extents.front().layout = ol;
-  st->extents.front().rev = rev;
   st->onfinish = onfinish;
 
   return stat_submit(st);
@@ -424,14 +422,12 @@ void Objecter::handle_osd_stat_reply(MOSDOpReply *m)
 // read -----------------------------------
 
 
-tid_t Objecter::read(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist *bl, 
-                     Context *onfinish, 
-                                        objectrev_t rev)
+tid_t Objecter::read(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist *bl,
+                     Context *onfinish)
 {
   OSDRead *rd = new OSDRead(bl);
   rd->extents.push_back(ObjectExtent(oid, off, len));
   rd->extents.front().layout = ol;
-  rd->extents.front().rev = rev;
   readx(rd, onfinish);
   return last_tid;
 }
@@ -665,14 +661,12 @@ void Objecter::handle_osd_read_reply(MOSDOpReply *m)
 // write ------------------------------------
 
 tid_t Objecter::write(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist &bl, 
-                      Context *onack, Context *oncommit,
-                                         objectrev_t rev)
+                      Context *onack, Context *oncommit)
 {
   OSDWrite *wr = new OSDWrite(bl);
   wr->extents.push_back(ObjectExtent(oid, off, len));
   wr->extents.front().layout = ol;
   wr->extents.front().buffer_extents[0] = len;
-  wr->extents.front().rev = rev;
   modifyx(wr, onack, oncommit);
   return last_tid;
 }
@@ -681,13 +675,11 @@ tid_t Objecter::write(object_t oid, off_t off, size_t len, ObjectLayout ol, buff
 // zero
 
 tid_t Objecter::zero(object_t oid, off_t off, size_t len, ObjectLayout ol,
-                     Context *onack, Context *oncommit,
-                                        objectrev_t rev)
+                     Context *onack, Context *oncommit)
 {
   OSDModify *z = new OSDModify(OSD_OP_ZERO);
   z->extents.push_back(ObjectExtent(oid, off, len));
   z->extents.front().layout = ol;
-  z->extents.front().rev = rev;
   modifyx(z, onack, oncommit);
   return last_tid;
 }
@@ -760,7 +752,6 @@ tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid)
                           wr->op);
     m->set_length(ex.length);
     m->set_offset(ex.start);
-    m->set_rev(ex.rev);
     if (usetid > 0)
       m->set_retry_attempt(true);
     
index ed5c44745604e24f80f41a9e05808105a2eccdab..82a437aa04f8de3f876044b3d97455cc555d3405 100644 (file)
@@ -213,16 +213,12 @@ class Objecter {
 
   // even lazier
   tid_t read(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist *bl, 
-             Context *onfinish, 
-            objectrev_t rev=0);
+             Context *onfinish);
   tid_t write(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist &bl, 
-              Context *onack, Context *oncommit, 
-             objectrev_t rev=0);
+              Context *onack, Context *oncommit);
   tid_t zero(object_t oid, off_t off, size_t len, ObjectLayout ol,  
-             Context *onack, Context *oncommit, 
-            objectrev_t rev=0);
-  tid_t stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish, 
-            objectrev_t rev=0);  
+             Context *onack, Context *oncommit);
+  tid_t stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish);
   
   tid_t lock(int op, object_t oid, ObjectLayout ol, Context *onack, Context *oncommit);