]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
Implementation of the radosstriper interface.
authorSebastien Ponce <sebastien.ponce@cern.ch>
Thu, 5 Jun 2014 15:17:40 +0000 (17:17 +0200)
committerJosh Durgin <josh.durgin@inktank.com>
Sat, 7 Jun 2014 01:12:34 +0000 (18:12 -0700)
The user facing API is implemented in libradosstriper.cc and the backend in RadosStriperImpl.cc.
Details on how the code works are given in a comment at the top of RadosStriperImpl.cc

Signed-off-by: Sebastien Ponce <sebastien.ponce@cern.ch>
src/Makefile-env.am
src/Makefile.am
src/libradosstriper/Makefile.am [new file with mode: 0644]
src/libradosstriper/MultiAioCompletionImpl.cc [new file with mode: 0644]
src/libradosstriper/MultiAioCompletionImpl.h [new file with mode: 0644]
src/libradosstriper/RadosStriperImpl.cc [new file with mode: 0644]
src/libradosstriper/RadosStriperImpl.h [new file with mode: 0644]
src/libradosstriper/libradosstriper.cc [new file with mode: 0644]

index b6af080faf5b80e2cc18908bd21bf42324efabf6..e33f8cc3233da334e75482890895a1874a894221 100644 (file)
@@ -147,6 +147,7 @@ LIBMDS = libmds.la
 LIBCLIENT = libclient.la
 LIBCLIENT_FUSE = libclient_fuse.la
 LIBRADOS = librados.la
+LIBRADOSSTRIPER = libradosstriper.la
 LIBRGW = librgw.la
 LIBRBD = librbd.la
 LIBKRBD = libkrbd.la
index 5725291abf86dc8ed029337824933964c0859ed2..3501a7c6b0bc6651465ed68cdea921a0170002da 100644 (file)
@@ -26,6 +26,7 @@ include msg/Makefile.am
 include messages/Makefile.am
 include include/Makefile.am
 include librados/Makefile.am
+include libradosstriper/Makefile.am
 include librbd/Makefile.am
 include rgw/Makefile.am
 include cls/Makefile.am
diff --git a/src/libradosstriper/Makefile.am b/src/libradosstriper/Makefile.am
new file mode 100644 (file)
index 0000000..e88f594
--- /dev/null
@@ -0,0 +1,19 @@
+# Sources compiled into the libradosstriper libtool library
+libradosstriper_la_SOURCES = \
+       libradosstriper/libradosstriper.cc \
+       libradosstriper/RadosStriperImpl.cc \
+       libradosstriper/MultiAioCompletionImpl.cc
+
+# We need this to avoid basename conflicts with the libradosstriper build tests in test/Makefile.am
+libradosstriper_la_CXXFLAGS = ${AM_CXXFLAGS}
+
+# libradosstriper is linked against librados
+LIBRADOSSTRIPER_DEPS = $(LIBRADOS)
+libradosstriper_la_LIBADD = $(LIBRADOSSTRIPER_DEPS)
+libradosstriper_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
+if LINUX
+# on Linux, only export the symbols whose name starts with radosstriper_
+libradosstriper_la_LDFLAGS += -export-symbols-regex '^radosstriper_.*'
+endif
+lib_LTLIBRARIES += libradosstriper.la
+
+# internal headers of the implementation (noinst: not installed)
+noinst_HEADERS += \
+       libradosstriper/RadosStriperImpl.h \
+       libradosstriper/MultiAioCompletionImpl.h
diff --git a/src/libradosstriper/MultiAioCompletionImpl.cc b/src/libradosstriper/MultiAioCompletionImpl.cc
new file mode 100644 (file)
index 0000000..2701829
--- /dev/null
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+
+#include "libradosstriper/MultiAioCompletionImpl.h"
+
+/**
+ * Records the "complete" notification of one sub request.
+ * @param r result of the sub request : a positive value is added to the
+ *          global return value, a negative one (except -EEXIST) replaces it.
+ *          Once the global return value is negative, it is not modified anymore.
+ * Fires the completion when this was the last pending sub request and the
+ * set of requests is no longer being built. Drops one reference.
+ */
+void libradosstriper::MultiAioCompletionImpl::complete_request(ssize_t r)
+{
+  lock.Lock();
+  if (rval >= 0) {
+    if (r > 0) {
+      rval += r;
+    } else if (r < 0 && r != -EEXIST) {
+      rval = r;
+    }
+  }
+  assert(pending_complete);
+  --pending_complete;
+  if (0 == pending_complete && !building) {
+    complete();
+  }
+  // releases the lock and the reference taken for this notification
+  put_unlock();
+}
+
+/**
+ * Records the "safe" notification of one sub request.
+ * @param r result of the sub request : a negative value (except -EEXIST)
+ *          replaces the global return value, unless that one is already negative.
+ * Fires the safe callback when this was the last pending sub request and the
+ * set of requests is no longer being built. Drops one reference.
+ */
+void libradosstriper::MultiAioCompletionImpl::safe_request(ssize_t r)
+{
+  lock.Lock();
+  const bool isRealError = (r < 0 && r != -EEXIST);
+  if (rval >= 0 && isRealError) {
+    rval = r;
+  }
+  assert(pending_safe);
+  --pending_safe;
+  if (0 == pending_safe && !building) {
+    safe();
+  }
+  // releases the lock and the reference taken for this notification
+  put_unlock();
+}
+
+/**
+ * Declares that no more sub request will be added to this completion.
+ * If all "complete" (resp. "safe") notifications were already received,
+ * the corresponding completion is fired from here.
+ */
+void libradosstriper::MultiAioCompletionImpl::finish_adding_requests()
+{
+  lock.Lock();
+  assert(building);
+  building = false;
+  if (0 == pending_complete) {
+    complete();
+  }
+  if (0 == pending_safe) {
+    safe();
+  }
+  lock.Unlock();
+}
diff --git a/src/libradosstriper/MultiAioCompletionImpl.h b/src/libradosstriper/MultiAioCompletionImpl.h
new file mode 100644 (file)
index 0000000..6ad8ed0
--- /dev/null
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIBRADOSSTRIPERSTRIPER_MULTIAIOCOMPLETIONIMPL_H
+#define CEPH_LIBRADOSSTRIPERSTRIPER_MULTIAIOCOMPLETIONIMPL_H
+
+#include "common/Cond.h"
+#include "common/Mutex.h"
+
+#include "include/radosstriper/libradosstriper.hpp"
+
+struct libradosstriper::MultiAioCompletionImpl {
+
+  Mutex lock;
+  Cond cond;
+  int ref, rval;
+  bool released;
+  int pending_complete, pending_safe;
+  rados_callback_t callback_complete, callback_safe;
+  void *callback_complete_arg, *callback_safe_arg;
+  bool building;       ///< true if we are still building this completion
+  bufferlist bl;       /// only used for read case in C api of rados striper
+  std::list<bufferlist*> bllist; /// keep temporary buffer lists used for destriping
+
+  MultiAioCompletionImpl() : lock("MultiAioCompletionImpl lock", false, false),
+    ref(1), rval(0), released(false),
+    pending_complete(0), pending_safe(0),
+    callback_complete(0), callback_safe(0),
+    callback_complete_arg(0), callback_safe_arg(0),
+    building(true) {};
+
+  ~MultiAioCompletionImpl() {
+    // deallocate temporary buffer lists
+    for (std::list<bufferlist*>::iterator it = bllist.begin();
+        it != bllist.end();
+        it++) {
+      delete *it;
+    }
+    bllist.clear();
+  }
+
+  int set_complete_callback(void *cb_arg, rados_callback_t cb) {
+    lock.Lock();
+    callback_complete = cb;
+    callback_complete_arg = cb_arg;
+    lock.Unlock();
+    return 0;
+  }
+  int set_safe_callback(void *cb_arg, rados_callback_t cb) {
+    lock.Lock();
+    callback_safe = cb;
+    callback_safe_arg = cb_arg;
+    lock.Unlock();
+    return 0;
+  }
+  int wait_for_complete() {
+    lock.Lock();
+    while (pending_complete)
+      cond.Wait(lock);
+    lock.Unlock();
+    return 0;
+  }
+  int wait_for_safe() {
+    lock.Lock();
+    while (pending_safe)
+      cond.Wait(lock);
+    lock.Unlock();
+    return 0;
+  }
+  bool is_complete() {
+    lock.Lock();
+    int r = pending_complete;
+    lock.Unlock();
+    return 0 == r;
+  }
+  bool is_safe() {
+    lock.Lock();
+    int r = pending_safe;
+    lock.Unlock();
+    return r == 0;
+  }
+  void wait_for_complete_and_cb() {
+    lock.Lock();
+    while (pending_complete || callback_complete)
+      cond.Wait(lock);
+    lock.Unlock();
+  }
+  void wait_for_safe_and_cb() {
+    lock.Lock();
+    while (pending_safe || callback_safe)
+      cond.Wait(lock);
+    lock.Unlock();
+  }
+  bool is_complete_and_cb() {
+    lock.Lock();
+    bool r = ((0 == pending_complete) && !callback_complete);
+    lock.Unlock();
+    return r;
+  }
+  bool is_safe_and_cb() {
+    lock.Lock();
+    int r = ((0 == pending_safe) && !callback_safe);
+    lock.Unlock();
+    return r;
+  }
+  int get_return_value() {
+    lock.Lock();
+    int r = rval;
+    lock.Unlock();
+    return r;
+  }
+  void release() {
+    lock.Lock();
+    assert(!released);
+    released = true;
+    put_unlock();
+  }
+  void get() {
+    lock.Lock();
+    _get();
+    lock.Unlock();
+  }
+  void _get() {
+    assert(lock.is_locked());
+    assert(ref > 0);
+    ++ref;
+  }
+  void put() {
+    lock.Lock();
+    put_unlock();
+  }
+  void put_unlock() {
+    assert(ref > 0);
+    int n = --ref;
+    lock.Unlock();
+    if (!n)
+      delete this;
+  }
+  void add_request() {
+    lock.Lock();
+    pending_complete++;
+    _get();
+    pending_safe++;
+    _get();
+    lock.Unlock();
+  }
+  void complete() {
+    assert(lock.is_locked());
+    if (callback_complete) {
+      callback_complete(this, callback_complete_arg);
+      callback_complete = 0;
+    }
+    cond.Signal();
+  }
+  void safe() {
+    assert(lock.is_locked());
+    if (callback_safe) {
+      callback_safe(this, callback_safe_arg);
+      callback_safe = 0;
+    }
+    cond.Signal();
+  };
+
+  void complete_request(ssize_t r);
+  void safe_request(ssize_t r);
+  void finish_adding_requests();
+
+};
+
+#endif // CEPH_LIBRADOSSTRIPERSTRIPER_MULTIAIOCOMPLETIONIMPL_H
diff --git a/src/libradosstriper/RadosStriperImpl.cc b/src/libradosstriper/RadosStriperImpl.cc
new file mode 100644 (file)
index 0000000..5d8e2f0
--- /dev/null
@@ -0,0 +1,1058 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "libradosstriper/RadosStriperImpl.h"
+
+#include <errno.h>
+
+#include <sstream>
+#include <iomanip>
+#include <algorithm>
+
+#include "include/types.h"
+#include "include/uuid.h"
+#include "include/ceph_fs.h"
+#include "common/dout.h"
+#include "common/strtol.h"
+#include "osdc/Striper.h"
+#include "libradosstriper/MultiAioCompletionImpl.h"
+#include "librados/AioCompletionImpl.h"
+#include <cls/lock/cls_lock_client.h>
+
+/*
+ * This file contains the actual implementation of the rados striped objects interface.
+ *
+ * Striped objects are stored in rados in a set of regular rados objects, after their
+ * content has been striped using the osdc/Striper interface.
+ *
+ * The external attributes of the striped object are mapped to the attributes of the
+ * first underlying object. This first object has a set of extra external attributes
+ * storing the layout of the striped object for future read back. These attributes are :
+ *  - striper.layout.object_size : the size of rados objects used.
+ *                                 Must be a multiple of striper.layout.stripe_unit
+ *  - striper.layout.stripe_unit : the size of a stripe unit
+ *  - striper.layout.stripe_count : the number of stripes used
+ *  - striper.size : total striped object size
+ *
+ * In general operations on striped objects are not atomic.
+ * However, a certain number of safety guards have been put to make the interface closer
+ * to atomicity :
+ *  - each data operation takes a shared lock on the first rados object for the
+ *    whole time of the operation
+ *  - the remove and trunc operations take an exclusive lock on the first rados object
+ *    for the whole time of the operation
+ * This makes sure that no removal/truncation of a striped object occurs while
+ * data operations are happening and vice versa. It thus makes sure that the layout
+ * of a striped object does not change during data operation, which is essential for
+ * data consistency.
+ *
+ * Still the writing to a striped object is not atomic. This means in particular that
+ * the size of an object may not be in sync with its content at all times.
+ * As the size is always guaranteed to be updated first and in an atomic way, and as
+ * sparse striped objects are supported (see below), what will typically happen is
+ * that a reader that comes too soon after a write will read 0s instead of the actual
+ * data.
+ *
+ * Note that remove handles the pieces of the striped object in reverse order,
+ * so that the head object is removed last, making the completion of the deletion atomic.
+ *
+ * Striped objects can be sparse, typically in case data was written at the end of the
+ * striped object only. In such a case, some rados objects constituting the striped object
+ * may be missing. Others can be partial (only the beginning will have data).
+ * When dealing with such sparse striped files, missing objects are detected and
+ * considered as full of 0s. They are however not created until real data is written
+ * to them.
+ *
+ * There are a number of missing features/improvements that could be implemented.
+ * Here are some ideas :
+ *    - asynchronous stat and deletion
+ *    - improvement of the synchronous deletion to launch asynchronously
+ *      the deletion of the rados objects
+ *    - make the truncation asynchronous in aio_write_full
+ *    - implementation of missing entry points (compared to rados)
+ *      In particular : clone_range, sparse_read, exec, aio_flush_async, tmaps, omaps, ...
+ *
+ */
+
+#define dout_subsys ceph_subsys_rados
+#undef dout_prefix
+#define dout_prefix *_dout << "libradosstriper: "
+
+/// size of xattr buffer
+#define XATTR_BUFFER_SIZE 32
+
+/// names of the different xattr entries
+#define XATTR_LAYOUT_STRIPE_UNIT "striper.layout.stripe_unit"
+#define XATTR_LAYOUT_STRIPE_COUNT "striper.layout.stripe_count"
+#define XATTR_LAYOUT_OBJECT_SIZE "striper.layout.object_size"
+#define XATTR_SIZE "striper.size"
+#define LOCK_PREFIX "lock."
+
+/// name of the lock used on objects to ensure layout stability during IO
+#define RADOS_LOCK_NAME "striper.lock"
+
+/// format of the extension of rados objects created for a given striped object
+#define RADOS_OBJECT_EXTENSION_FORMAT ".%016llx"
+
+/// default object layout (external declaration)
+extern ceph_file_layout g_default_file_layout;
+
+///////////////////////// CompletionData /////////////////////////////
+
+/// Takes a reference on the striper for the lifetime of this object and,
+/// when the user gave a completion, wraps it into a C_aio_Ack context
+libradosstriper::RadosStriperImpl::CompletionData::CompletionData
+(libradosstriper::RadosStriperImpl* striper,
+ const std::string& soid,
+ const std::string& lockCookie,
+ librados::AioCompletionImpl *userCompletion) :
+  m_striper(striper), m_soid(soid), m_lockCookie(lockCookie), m_ack(0) {
+  m_striper->get();
+  if (0 != userCompletion) {
+    m_ack = new librados::IoCtxImpl::C_aio_Ack(userCompletion);
+  }
+}
+
+/// Frees the ack context (if any) and drops the reference on the striper
+libradosstriper::RadosStriperImpl::CompletionData::~CompletionData() {
+  delete m_ack;
+  m_striper->put();
+}
+
+/// Forwards the result r to the user completion, if there is one
+void libradosstriper::RadosStriperImpl::CompletionData::complete(int r) {
+  if (0 == m_ack) {
+    return;
+  }
+  m_ack->finish(r);
+}
+
+/// Completion data for a striped read.
+/// On top of what CompletionData stores, keeps the destination buffer bl,
+/// the list of extents being read and the per extent result buffers.
+/// extents and resultbl are deleted by the destructor; bl is not.
+libradosstriper::RadosStriperImpl::ReadCompletionData::ReadCompletionData
+(libradosstriper::RadosStriperImpl* striper,
+ const std::string& soid,
+ const std::string& lockCookie,
+ librados::AioCompletionImpl *userCompletion,
+ bufferlist* bl,
+ std::vector<ObjectExtent>* extents,
+ std::vector<bufferlist>* resultbl) :
+  CompletionData(striper, soid, lockCookie, userCompletion),
+  m_bl(bl), m_extents(extents), m_resultbl(resultbl) {}
+
+/// Releases the extents and result buffers handed over at construction
+libradosstriper::RadosStriperImpl::ReadCompletionData::~ReadCompletionData() {
+  delete m_extents;
+  delete m_resultbl;
+}
+
+/// Destripes the data read from the different extents into the user buffer
+/// and completes the user completion.
+/// @param r global result of the read. When it is 0, the length of the
+///          assembled buffer is reported instead
+void libradosstriper::RadosStriperImpl::ReadCompletionData::complete(int r) {
+  // feed the destriper with the result of each extent
+  Striper::StripedReadResult assembler;
+  const size_t nbExtents = m_extents->size();
+  for (size_t idx = 0; idx < nbExtents; ++idx) {
+    assembler.add_partial_result(m_striper->cct(),
+                                 (*m_resultbl)[idx],
+                                 (*m_extents)[idx].buffer_extents);
+  }
+  // rebuild the final buffer
+  m_bl->clear();
+  assembler.assemble_result(m_striper->cct(), *m_bl, true);
+  // hand the result over to the parent's completion method
+  int result = r;
+  if (0 == result) {
+    result = m_bl->length();
+  }
+  CompletionData::complete(result);
+}
+
+/// Completion data for a striped write. On top of what CompletionData does,
+/// wraps the user completion (if any) into a C_aio_Safe context
+libradosstriper::RadosStriperImpl::WriteCompletionData::WriteCompletionData
+(libradosstriper::RadosStriperImpl* striper,
+ const std::string& soid,
+ const std::string& lockCookie,
+ librados::AioCompletionImpl *userCompletion) :
+  CompletionData(striper, soid, lockCookie, userCompletion), m_safe(0) {
+  if (0 != userCompletion) {
+    m_safe = new librados::IoCtxImpl::C_aio_Safe(userCompletion);
+  }
+}
+
+/// Frees the safe context, if any
+libradosstriper::RadosStriperImpl::WriteCompletionData::~WriteCompletionData() {
+  delete m_safe;
+}
+
+/// Forwards the result r to the "safe" side of the user completion, if there is one
+void libradosstriper::RadosStriperImpl::WriteCompletionData::safe(int r) {
+  if (0 == m_safe) {
+    return;
+  }
+  m_safe->finish(r);
+}
+
+///////////////////////// RadosExclusiveLock /////////////////////////////
+
+/// Scoped exclusive lock on rados object oid.
+/// The constructor synchronously takes lock RADOS_LOCK_NAME in exclusive mode,
+/// using a fresh UUID as cookie, in an operation that also asserts that the
+/// object exists. Throws ErrorCode when the operation returns a non zero code.
+libradosstriper::RadosStriperImpl::RadosExclusiveLock::RadosExclusiveLock(librados::IoCtx* ioCtx,
+                                                                         const std::string& oid) :
+  m_ioCtx(ioCtx), m_oid(oid)
+{
+  librados::ObjectWriteOperation op;
+  op.assert_exists();
+  m_lockCookie = RadosStriperImpl::getUUID();
+  // default constructed duration; presumably means that the lock does not expire - TODO confirm
+  utime_t dur = utime_t();
+  rados::cls::lock::lock(&op, RADOS_LOCK_NAME, LOCK_EXCLUSIVE, m_lockCookie, "", "", dur, 0);
+  int rc = m_ioCtx->operate(oid, &op);
+  if (rc) throw ErrorCode(rc);
+}
+
+/// Releases the lock taken by the constructor. The return code of unlock is ignored
+libradosstriper::RadosStriperImpl::RadosExclusiveLock::~RadosExclusiveLock() {
+  m_ioCtx->unlock(m_oid, RADOS_LOCK_NAME, m_lockCookie);
+}
+
+///////////////////////// constructor /////////////////////////////
+
+/// Builds a striper on top of the given io context.
+/// The reference count starts at 0 and the layout used for new striped
+/// objects is initialized from g_default_file_layout
+libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx& ioctx, librados::IoCtxImpl *ioctx_impl) :
+  m_refCnt(0), m_radosCluster(ioctx), m_ioCtx(ioctx), m_ioCtxImpl(ioctx_impl),
+  m_layout(g_default_file_layout) {}
+
+///////////////////////// layout /////////////////////////////
+
+/// Sets the stripe unit of the layout used by this striper.
+/// @return 0 on success, -EINVAL if stripe_unit is 0 or has any of the
+///         bits of CEPH_MIN_STRIPE_UNIT-1 set
+int libradosstriper::RadosStriperImpl::setObjectLayoutStripeUnit
+(unsigned int stripe_unit)
+{
+  const bool misaligned = (0 != (stripe_unit & (CEPH_MIN_STRIPE_UNIT-1)));
+  if (0 == stripe_unit || misaligned) {
+    return -EINVAL;
+  }
+  m_layout.fl_stripe_unit = stripe_unit;
+  return 0;
+}
+
+/// Sets the stripe count of the layout used by this striper.
+/// @return 0 on success, -EINVAL if stripe_count is 0
+int libradosstriper::RadosStriperImpl::setObjectLayoutStripeCount
+(unsigned int stripe_count)
+{
+  if (0 == stripe_count) {
+    return -EINVAL;
+  }
+  m_layout.fl_stripe_count = stripe_count;
+  return 0;
+}
+
+/// Sets the object size of the layout used by this striper.
+/// @return 0 on success, -EINVAL if object_size is 0, has any of the bits of
+///         CEPH_MIN_STRIPE_UNIT-1 set, or is not a multiple of the current stripe unit
+int libradosstriper::RadosStriperImpl::setObjectLayoutObjectSize
+(unsigned int object_size)
+{
+  // non-zero and aligned
+  if (0 == object_size || 0 != (object_size & (CEPH_MIN_STRIPE_UNIT-1))) {
+    return -EINVAL;
+  }
+  // at least one stripe unit ...
+  if (object_size < m_layout.fl_stripe_unit) {
+    return -EINVAL;
+  }
+  // ... and a whole number of them
+  if (0 != object_size % m_layout.fl_stripe_unit) {
+    return -EINVAL;
+  }
+  m_layout.fl_object_size = object_size;
+  return 0;
+}
+
+///////////////////////// xattrs /////////////////////////////
+
+int libradosstriper::RadosStriperImpl::getxattr(const object_t& soid,
+                                                const char *name,
+                                                bufferlist& bl)
+{
+  std::string firstObjOid = getObjectId(soid, 0);
+  return m_ioCtx.getxattr(firstObjOid, name, bl);
+}
+
+int libradosstriper::RadosStriperImpl::setxattr(const object_t& soid,
+                                                const char *name,
+                                                bufferlist& bl)
+{
+  std::string firstObjOid = getObjectId(soid, 0);
+  return m_ioCtx.setxattr(firstObjOid, name, bl);
+}
+
+int libradosstriper::RadosStriperImpl::getxattrs(const object_t& soid,
+                                                 map<string, bufferlist>& attrset)
+{
+  std::string firstObjOid = getObjectId(soid, 0);
+  int rc = m_ioCtx.getxattrs(firstObjOid, attrset);
+  if (rc) return rc;
+  // cleanup internal attributes dedicated to striping and locking
+  attrset.erase(XATTR_LAYOUT_STRIPE_UNIT);
+  attrset.erase(XATTR_LAYOUT_STRIPE_COUNT);
+  attrset.erase(XATTR_LAYOUT_OBJECT_SIZE);
+  attrset.erase(XATTR_SIZE);
+  attrset.erase(std::string(LOCK_PREFIX) + RADOS_LOCK_NAME);
+  return rc;
+}
+
+/// Removes extended attribute name of striped object soid.
+/// The attribute is removed from the first rados object of the striped object
+int libradosstriper::RadosStriperImpl::rmxattr(const object_t& soid,
+                                               const char *name)
+{
+  return m_ioCtx.rmxattr(getObjectId(soid, 0), name);
+}
+
+///////////////////////// io /////////////////////////////
+
+int libradosstriper::RadosStriperImpl::write(const std::string& soid,
+                                            const bufferlist& bl,
+                                            size_t len,
+                                            uint64_t off) 
+{
+  // open the object. This will create it if needed, retrieve its layout
+  // and size and take a shared lock on it
+  ceph_file_layout layout;
+  std::string lockCookie;
+  int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true);
+  if (rc) return rc;
+  return write_in_open_object(soid, layout, lockCookie, bl, len, off);
+}
+
+int libradosstriper::RadosStriperImpl::append(const std::string& soid,
+                                             const bufferlist& bl,
+                                             size_t len) 
+{
+  // open the object. This will create it if needed, retrieve its layout
+  // and size and take a shared lock on it
+  ceph_file_layout layout;
+  size_t size = len;
+  std::string lockCookie;
+  int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false);
+  if (rc) return rc;
+  return write_in_open_object(soid, layout, lockCookie, bl, len, size);
+}
+
+/// Synchronously replaces the content of striped object soid with bl.
+/// The object is first truncated to 0, then bl is written at offset 0.
+/// The object is created by the write when it does not exist yet.
+/// @return 0 on success, a negative error code otherwise
+int libradosstriper::RadosStriperImpl::write_full(const std::string& soid,
+                                                 const bufferlist& bl) 
+{
+  int rc = trunc(soid, 0);
+  // trunc locks the first rados object with an operation asserting its
+  // existence, so it fails with ENOENT for a new striped object.
+  // This is fine here : there is nothing to truncate and write will create it
+  if (rc && rc != -ENOENT) return rc;
+  return write(soid, bl, bl.length(), 0);
+}
+
+/// Synchronously reads up to len bytes at offset off of striped object soid into bl.
+/// Implemented on top of aio_read, using a completion living on the stack.
+/// @return the return value of aio_read if it is not 0, else the return value
+///         of the completion once it is complete
+int libradosstriper::RadosStriperImpl::read(const std::string& soid,
+                                           bufferlist* bl,
+                                           size_t len,
+                                           uint64_t off)
+{
+  // create a completion object
+  librados::AioCompletionImpl c;
+  // call asynchronous method
+  int rc = aio_read(soid, &c, bl, len, off);
+  if (!rc) {
+    // wait for completion; c lives on the stack so it must
+    // not be left before the callbacks have run
+    c.wait_for_complete_and_cb();
+    // return result
+    rc = c.get_return_value();
+  }
+  return rc;
+}
+
+///////////////////////// asynchronous io /////////////////////////////
+
+int libradosstriper::RadosStriperImpl::aio_write(const std::string& soid,
+                                                librados::AioCompletionImpl *c,
+                                                const bufferlist& bl,
+                                                size_t len,
+                                                uint64_t off)
+{
+  ceph_file_layout layout;
+  std::string lockCookie;
+  int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true);
+  if (rc) return rc;
+  return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, off);
+}
+
+int libradosstriper::RadosStriperImpl::aio_append(const std::string& soid,
+                                                 librados::AioCompletionImpl *c,
+                                                 const bufferlist& bl,
+                                                 size_t len)
+{
+  ceph_file_layout layout;
+  size_t size = len;
+  std::string lockCookie;
+  int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false);
+  if (rc) return rc;
+  // create a completion object
+  return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, size);
+}
+
+int libradosstriper::RadosStriperImpl::aio_write_full(const std::string& soid,
+                                                     librados::AioCompletionImpl *c,
+                                                     const bufferlist& bl)
+{
+  int rc = trunc(soid, 0);
+  if (rc) return rc;
+  return aio_write(soid, c, bl, bl.length(), 0);
+}
+
+/// "complete" callback of the MultiAioCompletionImpl used by aio_read.
+/// @param c the MultiAioCompletionImpl that completed
+/// @param arg the ReadCompletionData of the read; it is deleted here
+static void striper_read_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
+{
+  libradosstriper::RadosStriperImpl::ReadCompletionData *cdata =
+    reinterpret_cast<libradosstriper::RadosStriperImpl::ReadCompletionData*>(arg);
+  // release the lock on the striped object, identified by its cookie
+  cdata->m_striper->unlockObject(cdata->m_soid, cdata->m_lockCookie);
+  libradosstriper::MultiAioCompletionImpl *comp =
+    reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
+  // assemble the data and complete the user completion with the global result
+  cdata->complete(comp->rval);
+  delete cdata;
+  // NOTE(review): the reference count is decremented directly, without put().
+  // Presumably this drops the initial reference of the completion created
+  // in aio_read while the caller still holds another one - TODO confirm
+  comp->ref--;
+}
+
+/// "safe" callback of the rados read of one extent.
+/// @param c the rados completion of the read
+/// @param arg the RadosReadCompletionData of the extent
+static void rados_req_read_safe(rados_completion_t c, void *arg)
+{
+  libradosstriper::RadosStriperImpl::RadosReadCompletionData *data =
+    reinterpret_cast<libradosstriper::RadosStriperImpl::RadosReadCompletionData*>(arg);
+  int rc = rados_aio_get_return_value(c);
+  // ENOENT means that we are dealing with a sparse file. This is fine,
+  // data (0s) will be created on the fly by the rados_req_read_complete method
+  if (rc == -ENOENT) rc = 0;
+  librados::AioCompletion *comp = reinterpret_cast<librados::AioCompletion*>(c);
+  // keep a pointer to the multi completion, as data may be deleted just below
+  libradosstriper::MultiAioCompletionImpl *multiAioComp = data->m_multiAioCompl;
+  // presumably ack == 0 means that the "complete" callback is not pending
+  // anymore, so that data is deleted by the last of the two callbacks - TODO confirm
+  if (0 == comp->pc->ack) delete data;
+  multiAioComp->safe_request(rc);
+}
+
+/// "complete" callback of the rados read of one extent.
+/// Pads the buffer of the extent with 0s when less data than expected was
+/// read (sparse striped object), then notifies the multi completion.
+/// @param c the rados completion of the read
+/// @param arg the RadosReadCompletionData of the extent
+static void rados_req_read_complete(rados_completion_t c, void *arg)
+{
+  libradosstriper::RadosStriperImpl::RadosReadCompletionData *data =
+    reinterpret_cast<libradosstriper::RadosStriperImpl::RadosReadCompletionData*>(arg);
+  int rc = rados_aio_get_return_value(c);
+  // We need to handle the case of sparse files here
+  if (rc == -ENOENT) {
+    // the object did not exist at all. This can happen for sparse files.
+    // we consider we've read 0 bytes and it will fall into next case
+    rc = 0;
+  }
+  if (rc >= 0 && (((uint64_t)rc) < data->m_expectedBytes)) {
+    // only partial data were present in the object (or the object did not
+    // even exist if we've gone through previous case).
+    // This is typical of sparse file and we need to complete with 0s.
+    unsigned int lenOfZeros = data->m_expectedBytes-rc;
+    // part of the missing bytes that is already present in the buffer
+    // and only has to be overwritten with 0s
+    unsigned int existingDataToZero = min(data->m_bl->length()-rc, lenOfZeros);
+    if (existingDataToZero > 0) {
+      data->m_bl->zero(rc, existingDataToZero);
+    }
+    // the rest has to be appended to the buffer
+    if (lenOfZeros > existingDataToZero) {
+      ceph::bufferptr zeros(ceph::buffer::create(lenOfZeros-existingDataToZero));
+      zeros.zero();
+      data->m_bl->push_back(zeros);
+    }
+    rc = data->m_expectedBytes;
+  }
+  librados::AioCompletion *comp = reinterpret_cast<librados::AioCompletion*>(c);
+  // keep a pointer to the multi completion, as data may be deleted just below
+  libradosstriper::MultiAioCompletionImpl * multiAioComp = data->m_multiAioCompl;
+  // presumably safe == 0 means that the "safe" callback is not pending
+  // anymore, so that data is deleted by the last of the two callbacks - TODO confirm
+  if (0 == comp->pc->safe) delete data;
+  multiAioComp->complete_request(rc);
+}
+
+/// Asynchronously reads up to len bytes at offset off of striped object soid into bl.
+/// The read is split into one rados read per extent, all gathered under a
+/// MultiAioCompletionImpl whose "complete" callback (striper_read_aio_req_complete)
+/// unlocks the object, assembles the data into bl and completes c.
+/// @return the error code of the opening of the object if it failed,
+///         else the return code of the last rados read launched (0 if none)
+int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid,
+                                               librados::AioCompletionImpl *c,
+                                               bufferlist* bl,
+                                               size_t len,
+                                               uint64_t off)
+{
+  // open the object. This will retrieve its layout and size
+  // and take a shared lock on it
+  ceph_file_layout layout;
+  size_t size;
+  std::string lockCookie;
+  int rc = openStripedObjectForRead(soid, &layout, &size, &lockCookie);
+  if (rc) return rc;
+  // find out the actual number of bytes we can read
+  uint64_t read_len;
+  if (off >= size) {
+    // nothing to read : no extent will be created and the
+    // completion will be fired by finish_adding_requests
+    read_len = 0;
+  } else {
+    read_len = min(len, (size_t)(size-off));
+  }
+  // get list of extents to be read from
+  vector<ObjectExtent> *extents = new vector<ObjectExtent>();
+  if (read_len > 0) {
+    std::string format = soid + RADOS_OBJECT_EXTENSION_FORMAT;
+    Striper::file_to_extents(cct(), format.c_str(), &layout, off, read_len, 0, *extents);
+  }
+  
+  // create a completion object and transfer ownership of extents and resultbl
+  vector<bufferlist> *resultbl = new vector<bufferlist>(extents->size());
+  c->is_read = true;
+  c->io = m_ioCtxImpl;
+  ReadCompletionData *cdata = new ReadCompletionData(this, soid, lockCookie, c,
+                                                    bl, extents, resultbl);
+  libradosstriper::MultiAioCompletionImpl *nc = new libradosstriper::MultiAioCompletionImpl;
+  nc->set_complete_callback(cdata, striper_read_aio_req_complete);
+  // go through the extents
+  int r = 0, i = 0;
+  for (vector<ObjectExtent>::iterator p = extents->begin(); p != extents->end(); ++p) {
+    // create a buffer list describing where to place data read from current extent
+    bufferlist *oid_bl = &((*resultbl)[i++]);
+    for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
+        q != p->buffer_extents.end();
+        ++q) {
+      bufferlist buffer_bl;
+      buffer_bl.substr_of(*bl, q->first, q->second);
+      oid_bl->append(buffer_bl);
+    }
+    // read all extents of a given object in one go
+    nc->add_request();
+    RadosReadCompletionData *data = new RadosReadCompletionData(nc, p->length, oid_bl);
+    librados::AioCompletion *rados_completion =
+      m_radosCluster.aio_create_completion(data, rados_req_read_complete, rados_req_read_safe);
+    r = m_ioCtx.aio_read(p->oid.name, rados_completion, oid_bl, p->length, p->offset);
+    rados_completion->release();
+    // NOTE(review): when the rados read could not be launched, the request was
+    // already counted by add_request. If the rados callbacks are not called in
+    // that case, the multi completion never fires - TODO confirm
+    if (r < 0)
+      break;
+  }
+  // all requests were declared, the completion may fire from now on
+  nc->finish_adding_requests();
+  return r;
+}
+
+int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid,
+                                               librados::AioCompletionImpl *c,
+                                               char* buf,
+                                               size_t len,
+                                               uint64_t off)
+{
+  // create a buffer list and store it inside the completion object
+  c->bl.clear();
+  c->bl.push_back(buffer::create_static(len, buf));
+  // call the bufferlist version of this method
+  return aio_read(soid, c, &c->bl, len, off);
+}
+
+/// Flushes the asynchronous operations by delegating to the rados io context
+int libradosstriper::RadosStriperImpl::aio_flush()
+{
+  const int flushRc = m_ioCtx.aio_flush();
+  return flushRc;
+}
+
+///////////////////////// stat and deletion /////////////////////////////
+
+int libradosstriper::RadosStriperImpl::stat(const std::string& soid, uint64_t *psize, time_t *pmtime)
+{
+  // get pmtime as the pmtime of the first object
+  std::string firstObjOid = getObjectId(soid, 0);
+  uint64_t obj_size;
+  int rc = m_ioCtx.stat(firstObjOid, &obj_size, pmtime);
+  if (rc < 0) return rc;
+  // get the pmsize from the first object attributes
+  bufferlist bl;
+  rc = getxattr(soid, XATTR_SIZE, bl);
+  if (rc < 0) return rc;
+  std::string err;
+  // this intermediate string allows to add a null terminator before calling strtol
+  std::string strsize(bl.c_str(), bl.length());
+  *psize = strict_strtol(strsize.c_str(), 10, &err);
+  if (!err.empty()) {
+    lderr(cct()) << XATTR_SIZE << " : " << err << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+/**
+ * Removes a striped object, that is all the rados objects it is made of.
+ *
+ * An exclusive lock is held on the first rados object for the whole call.
+ * The number of rados objects to drop is derived from the size attribute of
+ * the striped object and from the default layout of this striper. When the
+ * size attribute cannot be read, rados objects are probed one by one with
+ * stat until one of them is missing.
+ * Objects are removed in reverse order and missing ones (-ENOENT) are
+ * tolerated.
+ *
+ * @param soid name of the striped object
+ * @return -EINVAL if the size attribute cannot be parsed, the code carried
+ * by the ErrorCode exception if the exclusive lock could not be taken,
+ * otherwise the return code of the last rados removal that was attempted
+ * (0 if there was nothing to remove)
+ */
+int libradosstriper::RadosStriperImpl::remove(const std::string& soid)
+{
+  std::string firstObjOid = getObjectId(soid, 0);
+  try {
+    // lock the object in exclusive mode. Will be released when leaving the scope
+    RadosExclusiveLock lock(&m_ioCtx, firstObjOid);
+    // check size and get number of rados objects to delete
+    uint64_t nb_objects = 0;
+    bufferlist bl2;
+    int rc = getxattr(soid, XATTR_SIZE, bl2);
+    if (rc < 0) {
+      // no object size (or not able to get it)
+      // try to find the number of object "by hand"
+      uint64_t psize;
+      time_t pmtime;
+      while (!m_ioCtx.stat(getObjectId(soid, nb_objects), &psize, &pmtime)) {
+        nb_objects++;
+      }
+    } else {
+      // count total number of rados objects in the striped object
+      std::string err;
+      // this intermediate string allows to add a null terminator before calling strtol
+      std::string strsize(bl2.c_str(), bl2.length());
+      // sizes are 64 bits wide : strict_strtol would reject anything that
+      // does not fit in an int, hence the long long flavour
+      uint64_t size = strict_strtoll(strsize.c_str(), 10, &err);
+      if (!err.empty()) {
+        lderr(cct()) << XATTR_SIZE << " : " << err << dendl;
+        return -EINVAL;
+      }
+      // NOTE(review): the default layout of the striper is used here, not the
+      // layout stored in the attributes of the object - confirm this is intended
+      uint64_t object_size = m_layout.fl_object_size;
+      uint64_t su = m_layout.fl_stripe_unit;
+      uint64_t stripe_count = m_layout.fl_stripe_count;
+      uint64_t nb_complete_sets = size / (object_size*stripe_count);
+      uint64_t remaining_data = size % (object_size*stripe_count);
+      uint64_t remaining_stripe_units = (remaining_data + su -1) / su;
+      uint64_t remaining_objects = std::min(remaining_stripe_units, stripe_count);
+      nb_objects = nb_complete_sets * stripe_count + remaining_objects;
+      // the first rados object carries the attributes that were just read, so
+      // it has to be dropped even when the striped object holds no data
+      if (0 == nb_objects) {
+        nb_objects = 1;
+      }
+    }
+    // delete rados objects in reverse order
+    int rcr = 0;
+    for (uint64_t i = nb_objects; i-- > 0; ) {
+      rcr = m_ioCtx.remove(getObjectId(soid, i));
+      // a missing object is not an error, the striped object may be sparse
+      if (rcr < 0 and -ENOENT != rcr) {
+        lderr(cct()) << "RadosStriperImpl::remove : deletion incomplete for " << soid
+                  << ", as " << getObjectId(soid, i) << " could not be deleted (rc=" << rcr << ")"
+                  << dendl;
+        break;
+      }
+    }
+    // return
+    return rcr;
+  } catch (const ErrorCode& e) {
+    // error caught when trying to take the exclusive lock
+    return e.m_code;
+  }
+}
+
+/**
+ * Changes the size of a striped object, shrinking or growing it as needed.
+ *
+ * The exclusive lock on the first rados object is held until the size change
+ * is over : the lock object lives in the same scope as the calls to truncate
+ * and grow, and is released when that scope is left.
+ *
+ * @param soid name of the striped object
+ * @param size new size of the striped object
+ * @return 0 on success (including when the size is unchanged), the code
+ * carried by the ErrorCode exception if the exclusive lock could not be
+ * taken, or the return code of the failing step
+ */
+int libradosstriper::RadosStriperImpl::trunc(const std::string& soid, uint64_t size)
+{
+  std::string firstObjOid = getObjectId(soid, 0);
+  try {
+    // lock the object in exclusive mode. Will be released when leaving the scope
+    RadosExclusiveLock lock(&m_ioCtx, firstObjOid);
+    // load layout and size
+    ceph_file_layout layout;
+    uint64_t original_size;
+    int rc = internal_get_layout_and_size(firstObjOid, &layout, &original_size);
+    if (rc) return rc;
+    if (size < original_size) {
+      rc = truncate(soid, original_size, size, layout);
+    } else if (size > original_size) {
+      rc = grow(soid, original_size, size, layout);
+    }
+    return rc;
+  } catch (const ErrorCode& e) {
+    // error caught when trying to take the exclusive lock
+    return e.m_code;
+  }
+}
+
+///////////////////////// private helpers /////////////////////////////
+
+/**
+ * Builds the name of one of the rados objects backing a striped object :
+ * the striped object name, a dot, then the object number in hexadecimal,
+ * left padded with zeros up to 16 characters.
+ */
+std::string libradosstriper::RadosStriperImpl::getObjectId(const object_t& soid,
+                                                           long long unsigned objectno)
+{
+  std::ostringstream name;
+  name << soid << '.';
+  name << std::setfill ('0') << std::setw(16) << std::hex << objectno;
+  return name.str();
+}
+
+/**
+ * Closes a striped object that was opened for write : drops the lock
+ * identified by lockCookie on the first rados object.
+ * The outcome of the unlock is not looked at.
+ * @return always 0
+ */
+int libradosstriper::RadosStriperImpl::closeForWrite(const std::string& soid,
+                                                    const std::string& lockCookie)
+{
+  // the lock lives on the first rados object of the striped object
+  m_ioCtx.unlock(getObjectId(soid, 0), RADOS_LOCK_NAME, lockCookie);
+  return 0;
+}
+
+/**
+ * Drops the lock identified by lockCookie on the first rados object of the
+ * given striped object. The return code of the unlock is discarded.
+ */
+void libradosstriper::RadosStriperImpl::unlockObject(const std::string& soid,
+                                                    const std::string& lockCookie)
+{
+  m_ioCtx.unlock(getObjectId(soid, 0), RADOS_LOCK_NAME, lockCookie);
+}
+
+/// completion callback of the synchronous write path : closes the striped
+/// object described by the WriteCompletionData received in arg
+static void striper_write_req_complete(rados_striper_multi_completion_t c, void *arg)
+{
+  typedef libradosstriper::RadosStriperImpl::WriteCompletionData WriteData;
+  WriteData *writeData = static_cast<WriteData*>(arg);
+  writeData->m_striper->closeForWrite(writeData->m_soid, writeData->m_lockCookie);
+}
+
+/**
+ * Synchronous write into a striped object that was already opened for write.
+ * The write is issued through internal_aio_write and, when that call
+ * returned 0, this method blocks until the multi completion is both complete
+ * and safe. The completion callback (striper_write_req_complete) calls
+ * closeForWrite with the given lockCookie.
+ * @return the return code of internal_aio_write when it is not 0, otherwise
+ * the return value of the multi completion
+ */
+int libradosstriper::RadosStriperImpl::write_in_open_object(const std::string& soid,
+                                                           const ceph_file_layout& layout,
+                                                           const std::string& lockCookie,
+                                                           const bufferlist& bl,
+                                                           size_t len,
+                                                           uint64_t off) {
+  // create a completion object
+  WriteCompletionData *cdata = new WriteCompletionData(this, soid, lockCookie);
+  libradosstriper::MultiAioCompletionImpl *c = new libradosstriper::MultiAioCompletionImpl;
+  c->set_complete_callback(cdata, striper_write_req_complete);
+  // call the asynchronous API
+  int rc = internal_aio_write(soid, c, bl, len, off, layout);
+  if (!rc) {
+    // wait for completion and safety of data
+    c->wait_for_complete_and_cb();
+    c->wait_for_safe_and_cb();
+    // return result
+    rc = c->get_return_value();
+  }
+  // NOTE(review): cdata is deleted without waiting when internal_aio_write
+  // failed; if some requests were already issued, the completion callback
+  // may presumably still dereference it - confirm
+  delete cdata;
+  c->release();
+  return rc;
+}
+
+/// "complete" callback of the asynchronous write path.
+/// Closes the striped object (closeForWrite), forwards the return value of
+/// the multi completion to the WriteCompletionData and, when no "safe"
+/// notification is pending anymore, deletes the WriteCompletionData and
+/// decrements the reference counter of the multi completion.
+static void striper_write_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
+{
+  libradosstriper::RadosStriperImpl::WriteCompletionData *cdata =
+    reinterpret_cast<libradosstriper::RadosStriperImpl::WriteCompletionData*>(arg);
+  cdata->m_striper->closeForWrite(cdata->m_soid, cdata->m_lockCookie);
+  libradosstriper::MultiAioCompletionImpl *comp =
+    reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
+  cdata->complete(comp->rval);
+  // cleanup is done by whichever of the complete/safe callbacks sees the
+  // other side already finished
+  if (0 == comp->pending_safe) {
+    delete cdata;
+    comp->ref--;
+  }
+}
+
+/// "safe" callback of the asynchronous write path.
+/// Forwards the return value of the multi completion to the
+/// WriteCompletionData and, when no "complete" notification is pending
+/// anymore, deletes the WriteCompletionData and decrements the reference
+/// counter of the multi completion.
+static void striper_write_aio_req_safe(rados_striper_multi_completion_t c, void *arg)
+{
+  libradosstriper::RadosStriperImpl::WriteCompletionData *cdata =
+    reinterpret_cast<libradosstriper::RadosStriperImpl::WriteCompletionData*>(arg);
+  libradosstriper::MultiAioCompletionImpl *comp =
+    reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
+  cdata->safe(comp->rval);
+  // cleanup is done by whichever of the complete/safe callbacks sees the
+  // other side already finished
+  if (0 == comp->pending_complete) {
+    delete cdata;
+    comp->ref--;
+  }
+}
+
+/**
+ * Asynchronous write into a striped object that was already opened for write.
+ * Wraps the user completion c into a WriteCompletionData, registers the
+ * complete and safe callbacks on a new multi completion and hands the write
+ * over to internal_aio_write.
+ * @return the return code of internal_aio_write
+ */
+int libradosstriper::RadosStriperImpl::aio_write_in_open_object(const std::string& soid,
+                                                               librados::AioCompletionImpl *c,
+                                                               const ceph_file_layout& layout,
+                                                               const std::string& lockCookie,
+                                                               const bufferlist& bl,
+                                                               size_t len,
+                                                               uint64_t off) {
+  // create a completion object
+  m_ioCtxImpl->get();
+  WriteCompletionData *cdata = new WriteCompletionData(this, soid, lockCookie, c);
+  c->io = m_ioCtxImpl;
+  // cdata and nc are not freed here : the complete/safe callbacks delete
+  // cdata and decrement the reference counter of nc
+  libradosstriper::MultiAioCompletionImpl *nc = new libradosstriper::MultiAioCompletionImpl;
+  nc->set_complete_callback(cdata, striper_write_aio_req_complete);
+  nc->set_safe_callback(cdata, striper_write_aio_req_safe);
+  // internal asynchronous API
+  return internal_aio_write(soid, nc, bl, len, off, layout);
+}
+
+/// "safe" callback of a single rados write : reports the return value of
+/// the rados completion to the multi completion received in arg
+static void rados_req_write_safe(rados_completion_t c, void *arg)
+{
+  libradosstriper::MultiAioCompletionImpl *multi =
+    static_cast<libradosstriper::MultiAioCompletionImpl*>(arg);
+  multi->safe_request(rados_aio_get_return_value(c));
+}
+
+/// "complete" callback of a single rados write : reports the return value
+/// of the rados completion to the multi completion received in arg
+static void rados_req_write_complete(rados_completion_t c, void *arg)
+{
+  libradosstriper::MultiAioCompletionImpl *multi =
+    static_cast<libradosstriper::MultiAioCompletionImpl*>(arg);
+  multi->complete_request(rados_aio_get_return_value(c));
+}
+
+/**
+ * Splits a write on a striped object into one asynchronous rados write per
+ * object extent and registers each of them on the multi completion c.
+ * @param soid name of the striped object
+ * @param c multi completion collecting the individual rados writes
+ * @param bl data to write, addressed through the buffer extents computed by
+ * Striper::file_to_extents
+ * @param len number of bytes to write
+ * @param off offset of the write in the striped object
+ * @param layout layout used to map the write onto rados objects
+ * @return the return code of the last m_ioCtx.aio_write issued (the loop
+ * stops at the first negative one), 0 if there was no extent
+ */
+int
+libradosstriper::RadosStriperImpl::internal_aio_write(const std::string& soid,
+                                                     libradosstriper::MultiAioCompletionImpl *c,
+                                                     const bufferlist& bl,
+                                                     size_t len,
+                                                     uint64_t off,
+                                                     const ceph_file_layout& layout)
+{
+  // get list of extents to be written to
+  vector<ObjectExtent> extents;
+  std::string format = soid + RADOS_OBJECT_EXTENSION_FORMAT;
+  Striper::file_to_extents(cct(), format.c_str(), &layout, off, len, 0, extents);
+  // go through the extents
+  int r = 0;
+  for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
+    // assemble pieces of a given object into a single buffer list
+    bufferlist oid_bl;
+    for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
+        q != p->buffer_extents.end();
+        ++q) {
+      bufferlist buffer_bl;
+      buffer_bl.substr_of(bl, q->first, q->second);
+      oid_bl.append(buffer_bl);
+    }
+    // and write the object
+    // NOTE(review): the request is registered on c before aio_write is
+    // called; when aio_write fails, presumably no callback ever balances
+    // this add_request - confirm against MultiAioCompletionImpl
+    c->add_request();
+    librados::AioCompletion *rados_completion =
+      m_radosCluster.aio_create_completion(c, rados_req_write_complete, rados_req_write_safe);
+    r = m_ioCtx.aio_write(p->oid.name, rados_completion, oid_bl, p->length, p->offset);
+    rados_completion->release();
+    if (r < 0)
+      break;
+  }
+  c->finish_adding_requests();
+  return r;
+}
+
+/**
+ * Reads the attribute named key from attrs and parses it as a base 10
+ * number with strict_strtol.
+ * @param attrs the attributes to search
+ * @param key name of the attribute
+ * @param value filled with the result of the parsing
+ * @return 0 on success, -ENOENT if the attribute is absent, -EINVAL if
+ * strict_strtol reported an error
+ */
+int libradosstriper::RadosStriperImpl::extract_uint32_attr
+(std::map<std::string, bufferlist> &attrs,
+ const std::string& key,
+ ceph_le32 *value)
+{
+  std::map<std::string, bufferlist>::iterator found = attrs.find(key);
+  if (found == attrs.end())
+    return -ENOENT;
+  // copy into a std::string so that the parser gets a null terminated buffer
+  std::string text(found->second.c_str(), found->second.length());
+  std::string parseError;
+  *value = strict_strtol(text.c_str(), 10, &parseError);
+  if (parseError.empty())
+    return 0;
+  lderr(cct()) << key << " : " << parseError << dendl;
+  return -EINVAL;
+}
+
+/**
+ * Reads the attribute named key from attrs and parses it as a base 10
+ * number with strict_strtoll.
+ * @param attrs the attributes to search
+ * @param key name of the attribute
+ * @param value filled with the result of the parsing
+ * @return 0 on success, -ENOENT if the attribute is absent, -EINVAL if
+ * strict_strtoll reported an error
+ */
+int libradosstriper::RadosStriperImpl::extract_sizet_attr
+(std::map<std::string, bufferlist> &attrs,
+ const std::string& key,
+ size_t *value)
+{
+  std::map<std::string, bufferlist>::iterator found = attrs.find(key);
+  if (found == attrs.end())
+    return -ENOENT;
+  // copy into a std::string so that the parser gets a null terminated buffer
+  std::string text(found->second.c_str(), found->second.length());
+  std::string parseError;
+  *value = strict_strtoll(text.c_str(), 10, &parseError);
+  if (parseError.empty())
+    return 0;
+  lderr(cct()) << key << " : " << parseError << dendl;
+  return -EINVAL;
+}
+
+/**
+ * Loads stripe unit, stripe count, object size and striped object size from
+ * the external attributes of the given rados object.
+ * Only these three fields of layout are written.
+ * @param oid name of the rados object holding the attributes
+ * @param layout receives the three layout values
+ * @param size receives the size
+ * @return 0 on success, else the return code of the first step that failed
+ */
+int libradosstriper::RadosStriperImpl::internal_get_layout_and_size(const std::string& oid,
+                                                                   ceph_file_layout *layout,
+                                                                   uint64_t *size)
+{
+  // fetch all external attributes in one go
+  std::map<std::string, bufferlist> xattrs;
+  int rc = m_ioCtx.getxattrs(oid, xattrs);
+  // then decode them one after the other, stopping at the first failure
+  if (!rc)
+    rc = extract_uint32_attr(xattrs, XATTR_LAYOUT_STRIPE_UNIT, &layout->fl_stripe_unit);
+  if (!rc)
+    rc = extract_uint32_attr(xattrs, XATTR_LAYOUT_STRIPE_COUNT, &layout->fl_stripe_count);
+  if (!rc)
+    rc = extract_uint32_attr(xattrs, XATTR_LAYOUT_OBJECT_SIZE, &layout->fl_object_size);
+  if (!rc)
+    rc = extract_sizet_attr(xattrs, XATTR_SIZE, size);
+  return rc;
+}
+
+/**
+ * Opens an existing striped object for reading.
+ * The existence check and the shared lock on the first rados object are
+ * bundled into a single rados operation; layout and size are loaded
+ * afterwards, and the lock is dropped again if that loading fails.
+ * @param lockCookie filled with a newly generated cookie, whatever the outcome
+ * @return 0 on success, else the return code of the failing step
+ */
+int libradosstriper::RadosStriperImpl::openStripedObjectForRead(const std::string& soid,
+                                                               ceph_file_layout *layout,
+                                                               uint64_t *size,
+                                                               std::string *lockCookie)
+{
+  const std::string firstObjOid = getObjectId(soid, 0);
+  *lockCookie = getUUID();
+  // existence check + shared lock, as one operation
+  librados::ObjectWriteOperation lockOp;
+  lockOp.assert_exists();
+  utime_t duration = utime_t();
+  rados::cls::lock::lock(&lockOp, RADOS_LOCK_NAME, LOCK_SHARED, *lockCookie, "Tag", "", duration, 0);
+  int rc = m_ioCtx.operate(firstObjOid, &lockOp);
+  // error case (including -ENOENT) : nothing was locked
+  if (rc) return rc;
+  rc = internal_get_layout_and_size(firstObjOid, layout, size);
+  if (0 == rc) return 0;
+  // attributes could not be loaded : give the lock back before reporting
+  m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, *lockCookie);
+  lderr(cct()) << "RadosStriperImpl::openStripedObjectForRead : "
+              << "could not load layout and size for "
+              << soid << " : rc = " << rc << dendl;
+  return rc;
+}
+
+/**
+ * Opens a striped object for writing, creating it when it does not exist.
+ * @param layout filled with the layout read from the first rados object
+ * @param size on input, the size the write leads to (absolute, or relative
+ * to the current size when isFileSizeAbsolute is false); on output, the size
+ * read from the attributes before they were updated, or 0 when the
+ * creation path was taken
+ * @param lockCookie filled with a newly generated cookie
+ * @param isFileSizeAbsolute false for append mode
+ * @return 0 on success, else the return code of the failing step
+ */
+int libradosstriper::RadosStriperImpl::openStripedObjectForWrite(const std::string& soid,
+                                                                ceph_file_layout *layout,
+                                                                uint64_t *size,
+                                                                std::string *lockCookie,
+                                                                bool isFileSizeAbsolute)
+{
+  // take a lock the first rados object, if it exists
+  // check and lock must be atomic and are thus done within a single operation
+  librados::ObjectWriteOperation op;
+  op.assert_exists();
+  *lockCookie = getUUID();
+  utime_t dur = utime_t();
+  rados::cls::lock::lock(&op, RADOS_LOCK_NAME, LOCK_SHARED, *lockCookie, "Tag", "", dur, 0);
+  std::string firstObjOid = getObjectId(soid, 0);
+  int rc = m_ioCtx.operate(firstObjOid, &op);
+  if (rc) {
+    if (rc == -ENOENT) {
+      // object does not exist, delegate to createAndOpenStripedObject
+      // (which calls back into this method once the object is created)
+      int rc = createAndOpenStripedObject(soid, layout, *size, lockCookie, isFileSizeAbsolute);
+      // return original size
+      *size = 0;
+      return rc; 
+    } else {
+      return rc;
+    }
+  }
+  // all fine
+  uint64_t curSize;
+  rc = internal_get_layout_and_size(firstObjOid, layout, &curSize);
+  if (rc) {
+    m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, *lockCookie);
+    lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
+                  << "could not load layout and size for "
+                  << soid << " : rc = " << rc << dendl;
+    return rc;
+  }
+  // atomically update object size, only if smaller than current one
+  // (the comparison and the setxattr are part of the same write operation)
+  if (!isFileSizeAbsolute)
+    *size += curSize;
+  librados::ObjectWriteOperation writeOp;
+  writeOp.cmpxattr(XATTR_SIZE, LIBRADOS_CMPXATTR_OP_GT, *size);
+  std::ostringstream oss;
+  oss << *size;
+  bufferlist bl;
+  bl.append(oss.str());
+  writeOp.setxattr(XATTR_SIZE, bl);
+  rc = m_ioCtx.operate(firstObjOid, &writeOp);
+  // return current size
+  *size = curSize;
+  // handle case where objectsize is already bigger than size
+  if (-ECANCELED == rc) 
+    rc = 0;
+  if (rc) {
+    m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, *lockCookie);
+    lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
+                  << "could not set new size for "
+                  << soid << " : rc = " << rc << dendl;
+  }
+  return rc;
+}
+
+/// file local helper : streams value into text and wraps that text into a
+/// newly built bufferlist
+template <typename T>
+static bufferlist striper_value_to_attr(const T& value)
+{
+  std::ostringstream text;
+  text << value;
+  bufferlist bl;
+  bl.append(text.str());
+  return bl;
+}
+
+/**
+ * Creates the first rados object of a striped object, carrying the default
+ * layout of this striper and the initial size as external attributes, then
+ * opens the striped object through openStripedObjectForWrite.
+ * An -EEXIST answer to the creation is not an error : the object is
+ * opened all the same.
+ * @param size size given by the caller; stored as initial size only when
+ * isFileSizeAbsolute is true, otherwise 0 is stored
+ * @return the return code of the creation when it failed with anything but
+ * -EEXIST, else the return code of openStripedObjectForWrite
+ */
+int libradosstriper::RadosStriperImpl::createAndOpenStripedObject(const std::string& soid,
+                                                                 ceph_file_layout *layout,
+                                                                 uint64_t size,
+                                                                 std::string *lockCookie,
+                                                                 bool isFileSizeAbsolute)
+{
+  // one atomic write operation : exclusive create, then the four attributes
+  librados::ObjectWriteOperation createOp;
+  createOp.create(true);
+  bufferlist objectSizeBl = striper_value_to_attr(m_layout.fl_object_size);
+  createOp.setxattr(XATTR_LAYOUT_OBJECT_SIZE, objectSizeBl);
+  bufferlist stripeUnitBl = striper_value_to_attr(m_layout.fl_stripe_unit);
+  createOp.setxattr(XATTR_LAYOUT_STRIPE_UNIT, stripeUnitBl);
+  bufferlist stripeCountBl = striper_value_to_attr(m_layout.fl_stripe_count);
+  createOp.setxattr(XATTR_LAYOUT_STRIPE_COUNT, stripeCountBl);
+  bufferlist sizeBl = striper_value_to_attr(isFileSizeAbsolute?size:0);
+  createOp.setxattr(XATTR_SIZE, sizeBl);
+  // run it against the first rados object
+  int rc = m_ioCtx.operate(getObjectId(soid, 0), &createOp);
+  // any failure other than "already there" is final
+  if (rc && -EEXIST != rc) return rc;
+  // the object now exists (created here or by someone else) : open it
+  uint64_t fileSize = size;
+  return openStripedObjectForWrite(soid, layout, &fileSize, lockCookie, isFileSizeAbsolute);
+}
+
+/**
+ * Shrinks a striped object from original_size down to size.
+ * @param soid name of the striped object
+ * @param original_size size of the striped object before the call
+ * @param size new size, expected to be smaller than original_size
+ * @param layout layout of the striped object
+ * @return 0 on success, else the return code of the first rados call that
+ * failed with something else than -ENOENT
+ */
+int libradosstriper::RadosStriperImpl::truncate(const std::string& soid,
+                                               uint64_t original_size,
+                                               uint64_t size,
+                                               ceph_file_layout &layout)
+{
+  // handle the underlying rados objects. 3 cases here :
+  //  -- the objects belonging to object sets entirely located
+  //     before the truncation are unchanged
+  //  -- the objects belonging to the object set where the
+  //     truncation took place are truncated or removed
+  //  -- the objects belonging to object sets entirely located
+  //     after the truncation are removed
+  // Note that we do it backward and that we change the size in
+  // the external attributes only at the end. This make sure that
+  // no rados object stays behind if we remove the striped object
+  // after a truncation has failed
+  uint64_t trunc_objectsetno = size / layout.fl_object_size / layout.fl_stripe_count;
+  uint64_t last_objectsetno = original_size / layout.fl_object_size / layout.fl_stripe_count;
+  bool exists = false;
+  // object sets entirely located after the truncation point
+  for (int64_t objectno = (last_objectsetno+1) * layout.fl_stripe_count-1;
+       objectno >= (int64_t)((trunc_objectsetno + 1) * layout.fl_stripe_count);
+       objectno--) {
+    // if no object existed so far, check object existence
+    if (!exists) {
+      uint64_t nb_full_object_set = objectno / layout.fl_stripe_count;
+      uint64_t object_index_in_set = objectno % layout.fl_stripe_count;
+      uint64_t set_start_off = nb_full_object_set * layout.fl_object_size * layout.fl_stripe_count;
+      uint64_t object_start_off = set_start_off + object_index_in_set * layout.fl_stripe_unit;
+      exists = (original_size > object_start_off);
+    }
+    if (exists) {
+      // remove
+      int rc = m_ioCtx.remove(getObjectId(soid, objectno));
+      // in case the object did not exist, it means we had a sparse file, all is fine
+      if (rc && rc != -ENOENT) return rc;
+    }
+  }
+  // object set where the truncation takes place
+  for (int64_t objectno = ((trunc_objectsetno + 1) * layout.fl_stripe_count) -1;
+       objectno >= (int64_t)(trunc_objectsetno * layout.fl_stripe_count);
+       objectno--) {
+    // if no object existed so far, check object existence
+    if (!exists) {
+      // same computation as in the loop above : an object set starts at a
+      // multiple of object_size * stripe_count, not of object_size alone
+      uint64_t nb_full_object_set = objectno / layout.fl_stripe_count;
+      uint64_t object_index_in_set = objectno % layout.fl_stripe_count;
+      uint64_t set_start_off = nb_full_object_set * layout.fl_object_size * layout.fl_stripe_count;
+      uint64_t object_start_off = set_start_off + object_index_in_set * layout.fl_stripe_unit;
+      exists = (original_size > object_start_off);
+    }
+    if (exists) {
+      // truncate
+      uint64_t new_object_size = Striper::object_truncate_size(cct(), &layout, objectno, size);
+      int rc;
+      // the first rados object is never removed, only truncated :
+      // it carries the external attributes of the striped object
+      if (new_object_size > 0 or 0 == objectno) {
+       rc = m_ioCtx.trunc(getObjectId(soid, objectno), new_object_size);
+      } else {
+       rc = m_ioCtx.remove(getObjectId(soid, objectno));
+      }
+      // in case the object did not exist, it means we had a sparse file, all is fine
+      if (rc && rc != -ENOENT) return rc;
+    }
+  }
+  // all went fine, change size in the external attributes
+  std::ostringstream oss;
+  oss << size;
+  bufferlist bl;
+  bl.append(oss.str());
+  int rc = m_ioCtx.setxattr(getObjectId(soid, 0), XATTR_SIZE, bl);
+  return rc;
+}
+
+/**
+ * Grows a striped object. Sparse objects being supported, no rados object
+ * is touched besides the first one, whose size attribute is rewritten.
+ * original_size and layout are not used.
+ * @return the return code of the setxattr call
+ */
+int libradosstriper::RadosStriperImpl::grow(const std::string& soid,
+                                           uint64_t original_size,
+                                           uint64_t size,
+                                           ceph_file_layout &layout)
+{
+  // textual form of the new size
+  std::ostringstream sizeText;
+  sizeText << size;
+  bufferlist sizeBl;
+  sizeBl.append(sizeText.str());
+  // stored on the first rados object
+  return m_ioCtx.setxattr(getObjectId(soid, 0), XATTR_SIZE, sizeBl);
+}
+
+/// generates a random uuid and returns the text printed for it by uuid_d
+std::string libradosstriper::RadosStriperImpl::getUUID()
+{
+  // buffer receiving the textual form of the uuid
+  char text[37];
+  uuid_d id;
+  id.generate_random();
+  id.print(text);
+  return std::string(text);
+}
diff --git a/src/libradosstriper/RadosStriperImpl.h b/src/libradosstriper/RadosStriperImpl.h
new file mode 100644 (file)
index 0000000..7f51895
--- /dev/null
@@ -0,0 +1,319 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIBRADOSSTRIPER_RADOSSTRIPERIMPL_H
+#define CEPH_LIBRADOSSTRIPER_RADOSSTRIPERIMPL_H
+
+#include <string>
+
+#include "include/atomic.h"
+
+#include "include/rados/librados.h"
+#include "include/rados/librados.hpp"
+#include "include/radosstriper/libradosstriper.h"
+#include "include/radosstriper/libradosstriper.hpp"
+
+#include "librados/IoCtxImpl.h"
+
+struct libradosstriper::RadosStriperImpl {
+
+  /**
+   * struct handling the data needed to pass to the call back
+   * function in asynchronous operations
+   */
+  struct CompletionData {
+    /// striper to be used to handle the write completion
+    libradosstriper::RadosStriperImpl *m_striper;
+    /// striped object concerned by the write operation
+    std::string m_soid;
+    /// shared lock to be released at completion
+    std::string m_lockCookie;
+    /// completion handler
+    librados::IoCtxImpl::C_aio_Ack *m_ack;
+    /// constructor
+    CompletionData(libradosstriper::RadosStriperImpl * striper,
+                  const std::string& soid,
+                  const std::string& lockCookie,
+                  librados::AioCompletionImpl *userCompletion = 0);
+    /// destructor
+    virtual ~CompletionData();
+    /// complete method
+    void complete(int r);
+  };
+
+  /**
+   * struct handling the data needed to pass to the call back
+   * function in asynchronous read operations
+   */
+  struct ReadCompletionData : CompletionData {
+    /// bufferlist containing final result
+    bufferlist* m_bl;
+    /// extents that will be read
+    std::vector<ObjectExtent>* m_extents;
+    /// intermediate results
+    std::vector<bufferlist>* m_resultbl;
+    /// constructor
+    ReadCompletionData(libradosstriper::RadosStriperImpl * striper,
+                      const std::string& soid,
+                      const std::string& lockCookie,
+                      librados::AioCompletionImpl *userCompletion,
+                      bufferlist* bl,
+                      std::vector<ObjectExtent>* extents,
+                      std::vector<bufferlist>* resultbl);
+    /// destructor
+    virtual ~ReadCompletionData();
+    /// complete method (not virtual : hides CompletionData::complete)
+    void complete(int r);
+  };
+
+  /**
+   * struct handling the data needed to pass to the call back
+   * function in asynchronous write operations
+   */
+  struct WriteCompletionData : CompletionData {
+    /// safe completion handler
+    librados::IoCtxImpl::C_aio_Safe *m_safe;
+    /// constructor
+    WriteCompletionData(libradosstriper::RadosStriperImpl * striper,
+                       const std::string& soid,
+                       const std::string& lockCookie,
+                       librados::AioCompletionImpl *userCompletion = 0);
+    /// destructor
+    virtual ~WriteCompletionData();
+    /// safe method
+    void safe(int r);
+  };
+
+  /**
+   * struct handling the data needed to pass to the call back
+   * function in asynchronous read operations of a Rados File
+   */
+  struct RadosReadCompletionData {
+    /// constructor
+    RadosReadCompletionData(MultiAioCompletionImpl *multiAioCompl,
+                           uint64_t expectedBytes,
+                           bufferlist *bl) :
+      m_multiAioCompl(multiAioCompl), m_expectedBytes(expectedBytes), m_bl(bl) {};
+    /// the multi asynch io completion object to be used
+    MultiAioCompletionImpl *m_multiAioCompl;
+    /// the expected number of bytes
+    uint64_t m_expectedBytes;
+    /// the bufferlist object where data have been written
+    bufferlist *m_bl;
+  };
+
+  /**
+   * exception wrapper around an error code
+   */
+  struct ErrorCode {
+    ErrorCode(int error) : m_code(error) {};
+    int m_code;
+  };
+    
+  /**
+   * Helper struct to handle simple locks on objects
+   */
+  struct RadosExclusiveLock {
+    /// io context to be used to handle the locking
+    librados::IoCtx* m_ioCtx;
+    /// object to be locked (held by reference : the string must outlive the lock)
+    const std::string& m_oid;
+    /// cookie of the lock
+    std::string m_lockCookie;
+    /// constructor : takes the lock
+    RadosExclusiveLock(librados::IoCtx* ioCtx, const std::string &oid);
+    /// destructor : releases the lock
+    ~RadosExclusiveLock();
+  };
+
+  /*
+   * Constructor
+   * @param ioctx rados io context
+   * @param ioctx_impl implementation object of a rados io context
+   * NOTE(review): presumably these end up in m_ioCtx and m_ioCtxImpl;
+   *          the constructor body is not part of this header - confirm
+   */
+  RadosStriperImpl(librados::IoCtx& ioctx, librados::IoCtxImpl *ioctx_impl);
+  /// Destructor
+  ~RadosStriperImpl() {};
+
+  // configuration
+  int setObjectLayoutStripeUnit(unsigned int stripe_unit);
+  int setObjectLayoutStripeCount(unsigned int stripe_count);
+  int setObjectLayoutObjectSize(unsigned int object_size);
+
+  // xattrs
+  int getxattr(const object_t& soid, const char *name, bufferlist& bl);
+  int setxattr(const object_t& soid, const char *name, bufferlist& bl);
+  int getxattrs(const object_t& soid, map<string, bufferlist>& attrset);
+  int rmxattr(const object_t& soid, const char *name);
+  
+  // io
+  int write(const std::string& soid, const bufferlist& bl, size_t len, uint64_t off);
+  int append(const std::string& soid, const bufferlist& bl, size_t len);
+  int write_full(const std::string& soid, const bufferlist& bl);
+  int read(const std::string& soid, bufferlist* pbl, size_t len, uint64_t off);
+
+  // asynchronous io
+  int aio_write(const std::string& soid, librados::AioCompletionImpl *c,
+               const bufferlist& bl, size_t len, uint64_t off);
+  int aio_append(const std::string& soid, librados::AioCompletionImpl *c,
+                const bufferlist& bl, size_t len);
+  int aio_write_full(const std::string& soid, librados::AioCompletionImpl *c,
+                    const bufferlist& bl);
+  int aio_read(const std::string& soid, librados::AioCompletionImpl *c,
+              bufferlist* pbl, size_t len, uint64_t off);
+  int aio_read(const std::string& soid, librados::AioCompletionImpl *c,
+              char* buf, size_t len, uint64_t off);
+  int aio_flush();
+
+  // stat, deletion and truncation
+  int stat(const std::string& soid, uint64_t *psize, time_t *pmtime);
+  int remove(const std::string& soid);
+  int trunc(const std::string& soid, uint64_t size);
+
+  // reference counting (put deletes the striper when the counter reaches 0)
+  void get() {
+    m_refCnt.inc();
+  }
+  void put() {
+    if (m_refCnt.dec() == 0)
+      delete this;
+  }
+
+  // objectid manipulation
+  std::string getObjectId(const object_t& soid, long long unsigned objectno);
+
+  // opening and closing of striped objects
+  int closeForWrite(const std::string& soid,
+                   const std::string& lockCookie);
+  void unlockObject(const std::string& soid,
+                   const std::string& lockCookie);
+
+  // internal versions of IO method
+  int write_in_open_object(const std::string& soid,
+                          const ceph_file_layout& layout,
+                          const std::string& lockCookie,
+                          const bufferlist& bl,
+                          size_t len,
+                          uint64_t off);
+  int aio_write_in_open_object(const std::string& soid,
+                              librados::AioCompletionImpl *c,
+                              const ceph_file_layout& layout,
+                              const std::string& lockCookie,
+                              const bufferlist& bl,
+                              size_t len,
+                              uint64_t off);
+  int internal_aio_write(const std::string& soid,
+                        libradosstriper::MultiAioCompletionImpl *c,
+                        const bufferlist& bl,
+                        size_t len,
+                        uint64_t off,
+                        const ceph_file_layout& layout);
+
+  int extract_uint32_attr(std::map<std::string, bufferlist> &attrs,
+                         const std::string& key,
+                         ceph_le32 *value);
+
+  int extract_sizet_attr(std::map<std::string, bufferlist> &attrs,
+                        const std::string& key,
+                        size_t *value);
+
+  int internal_get_layout_and_size(const std::string& oid,
+                                  ceph_file_layout *layout,
+                                  uint64_t *size);
+
+  /**
+   * opens an existing striped object and takes a shared lock on it
+   * @return 0 if everything is ok and the lock was taken. -errcode otherwise
+   * In particular, if the striped object does not exist, -ENOENT is returned
+   * In case the return code is not 0, no lock is taken
+   */
+  int openStripedObjectForRead(const std::string& soid,
+                              ceph_file_layout *layout,
+                              uint64_t *size,
+                              std::string *lockCookie);
+
+  /**
+   * opens an existing striped object, takes a shared lock on it
+   * and sets its size to the size it will have after the write.
+   * In case the striped object does not exist, it will create it by
+   * calling createAndOpenStripedObject.
+   * @param layout this is filled with the layout of the file
+   * @param size new size of the file (together with isFileSizeAbsolute)
+   * In case of success, this is filled with the size of the file before the opening
+   * @param isFileSizeAbsolute if false, this means that the given size should
+   * be added to the current file size (append mode)
+   * @return 0 if everything is ok and the lock was taken. -errcode otherwise
+   * In case the return code is not 0, no lock is taken
+   */
+  int openStripedObjectForWrite(const std::string& soid,
+                               ceph_file_layout *layout,
+                               uint64_t *size,
+                               std::string *lockCookie,
+                               bool isFileSizeAbsolute);
+  /**
+   * creates an empty striped object with the given size and opens it calling
+   * openStripedObjectForWrite, which implies taking a shared lock on it
+   * Also deals with the cases where the object was created in the mean time
+   * @param isFileSizeAbsolute if false, this means that the given size should
+   * be added to the current file size (append mode). This of course only makes
+   * sense in case the striped object already exists
+   * @return 0 if everything is ok and the lock was taken. -errcode otherwise
+   * In case the return code is not 0, no lock is taken
+   */
+  int createAndOpenStripedObject(const std::string& soid,
+                                ceph_file_layout *layout,
+                                uint64_t size,
+                                std::string *lockCookie,
+                                bool isFileSizeAbsolute);
+
+  /**
+   * truncates an object. Should only be called with size < original_size
+   */
+  int truncate(const std::string& soid,
+              uint64_t original_size,
+              uint64_t size,
+              ceph_file_layout &layout);
+
+  /**
+   * grows an object (adding 0s). Should only be called with size > original_size
+   */
+  int grow(const std::string& soid,
+          uint64_t original_size,
+          uint64_t size,
+          ceph_file_layout &layout);
+  
+  /**
+   * creates a unique identifier
+   */
+  static std::string getUUID();
+  
+  CephContext *cct() {
+    return (CephContext*)m_radosCluster.cct();
+  }
+
+  // reference counting
+  atomic_t m_refCnt;
+
+  // Context
+  librados::Rados m_radosCluster;
+  librados::IoCtx m_ioCtx;
+  librados::IoCtxImpl *m_ioCtxImpl;
+
+  // Default layout
+  ceph_file_layout m_layout;
+};
+
+#endif
diff --git a/src/libradosstriper/libradosstriper.cc b/src/libradosstriper/libradosstriper.cc
new file mode 100644 (file)
index 0000000..e02c8ca
--- /dev/null
@@ -0,0 +1,613 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+
+#include "libradosstriper/RadosStriperImpl.h"
+#include "libradosstriper/MultiAioCompletionImpl.h"
+
+#include "include/types.h"
+
+#include "include/radosstriper/libradosstriper.h"
+#include "include/radosstriper/libradosstriper.hpp"
+#include "librados/RadosXattrIter.h"
+
+/*
+ * This file implements the rados striper API.
+ * There are 2 flavours of it :
+ *   - the C API, found in include/radosstriper/libradosstriper.h
+ *   - the C++ API, found in include/radosstriper/libradosstriper.hpp
+ */
+
+///////////////////////////// C++ API //////////////////////////////
+
+/**
+ * Destructor: deletes the implementation object referenced by pc.
+ */
+libradosstriper::MultiAioCompletion::~MultiAioCompletion()
+{
+  delete pc;
+}
+
+/**
+ * Forwards cb_arg and cb to set_complete_callback of the underlying
+ * MultiAioCompletionImpl (pc) and returns its result.
+ */
+int libradosstriper::MultiAioCompletion::set_complete_callback
+(void *cb_arg, rados_callback_t cb)
+{
+  return ((MultiAioCompletionImpl *)pc)->set_complete_callback(cb_arg, cb);
+}
+
+/**
+ * Forwards cb_arg and cb to set_safe_callback of the underlying
+ * MultiAioCompletionImpl (pc) and returns its result.
+ */
+int libradosstriper::MultiAioCompletion::set_safe_callback
+(void *cb_arg, rados_callback_t cb)
+{
+  return ((MultiAioCompletionImpl *)pc)->set_safe_callback(cb_arg, cb);
+}
+
+/**
+ * Calls wait_for_complete on the underlying MultiAioCompletionImpl (pc).
+ */
+void libradosstriper::MultiAioCompletion::wait_for_complete()
+{
+  ((MultiAioCompletionImpl *)pc)->wait_for_complete();
+}
+
+/**
+ * Calls wait_for_safe on the underlying MultiAioCompletionImpl (pc).
+ */
+void libradosstriper::MultiAioCompletion::wait_for_safe()
+{
+  ((MultiAioCompletionImpl *)pc)->wait_for_safe();
+}
+
+/**
+ * @return the result of is_complete on the underlying
+ * MultiAioCompletionImpl (pc)
+ */
+bool libradosstriper::MultiAioCompletion::is_complete()
+{
+  return ((MultiAioCompletionImpl *)pc)->is_complete();
+}
+
+/**
+ * @return the result of is_safe on the underlying
+ * MultiAioCompletionImpl (pc)
+ */
+bool libradosstriper::MultiAioCompletion::is_safe()
+{
+  return ((MultiAioCompletionImpl *)pc)->is_safe();
+}
+
+/**
+ * Calls wait_for_complete_and_cb on the underlying
+ * MultiAioCompletionImpl (pc).
+ */
+void libradosstriper::MultiAioCompletion::wait_for_complete_and_cb()
+{
+  ((MultiAioCompletionImpl *)pc)->wait_for_complete_and_cb();
+}
+
+/**
+ * Calls wait_for_safe_and_cb on the underlying MultiAioCompletionImpl (pc).
+ */
+void libradosstriper::MultiAioCompletion::wait_for_safe_and_cb()
+{
+  // the class name is given once in the qualifier, like every sibling method
+  MultiAioCompletionImpl *c = (MultiAioCompletionImpl *)pc;
+  c->wait_for_safe_and_cb();
+}
+
+/**
+ * @return the result of is_complete_and_cb on the underlying
+ * MultiAioCompletionImpl (pc)
+ */
+bool libradosstriper::MultiAioCompletion::is_complete_and_cb()
+{
+  return ((MultiAioCompletionImpl *)pc)->is_complete_and_cb();
+}
+
+/**
+ * @return the result of is_safe_and_cb on the underlying
+ * MultiAioCompletionImpl (pc)
+ */
+bool libradosstriper::MultiAioCompletion::is_safe_and_cb()
+{
+  return ((MultiAioCompletionImpl *)pc)->is_safe_and_cb();
+}
+
+/**
+ * @return the result of get_return_value on the underlying
+ * MultiAioCompletionImpl (pc)
+ */
+int libradosstriper::MultiAioCompletion::get_return_value()
+{
+  return ((MultiAioCompletionImpl *)pc)->get_return_value();
+}
+
+/**
+ * Calls release on the underlying MultiAioCompletionImpl (pc) and then
+ * deletes this wrapper object. The wrapper must not be used afterwards.
+ */
+void libradosstriper::MultiAioCompletion::release()
+{
+  ((MultiAioCompletionImpl *)pc)->release();
+  // the wrapper destroys itself once the implementation was released
+  delete this;
+}
+
+/**
+ * Default constructor: the striper starts without any implementation
+ * attached (rados_striper_impl is 0).
+ */
+libradosstriper::RadosStriper::RadosStriper()
+  : rados_striper_impl(0)
+{
+}
+
+/**
+ * Exports the implementation of the given striper as a C handle.
+ * The implementation pointer is stored in *s and get() is called on it,
+ * so the handle carries its own reference.
+ */
+void libradosstriper::RadosStriper::to_rados_striper_t(RadosStriper &striper, rados_striper_t *s)
+{
+  // hand out the raw implementation pointer as the opaque C handle
+  *s = (rados_striper_t)striper.rados_striper_impl;
+  // one more reference, for the handle that was just handed out
+  striper.rados_striper_impl->get();
+}
+
+/**
+ * Copy constructor: shares the implementation of rs and, when there is
+ * one, takes an additional reference on it.
+ */
+libradosstriper::RadosStriper::RadosStriper(const RadosStriper& rs)
+  : rados_striper_impl(rs.rados_striper_impl)
+{
+  if (rados_striper_impl)
+    rados_striper_impl->get();
+}
+
+/**
+ * Copy assignment: shares the implementation of rs and gives up the
+ * reference on the implementation held so far.
+ * The reference on the incoming implementation is taken before the old one
+ * is released, so that self-assignment never goes through a state where the
+ * only reference has been dropped. Either side may have no implementation
+ * (a default constructed RadosStriper holds 0).
+ * @return *this
+ */
+libradosstriper::RadosStriper& libradosstriper::RadosStriper::operator=(const RadosStriper& rs)
+{
+  // take the new reference first (rs may be *this)
+  if (rs.rados_striper_impl)
+    rs.rados_striper_impl->get();
+  if (rados_striper_impl)
+    rados_striper_impl->put();
+  rados_striper_impl = rs.rados_striper_impl;
+  return *this;
+}
+
+/**
+ * Destructor: releases the reference on the implementation, if any, and
+ * resets the pointer.
+ */
+libradosstriper::RadosStriper::~RadosStriper()
+{
+  if (rados_striper_impl) {
+    rados_striper_impl->put();
+    rados_striper_impl = 0;
+  }
+}
+
+/**
+ * Creates a RadosStriperImpl on top of the given IoCtx, attaches it to
+ * striper and takes a reference on it.
+ * @return 0 on success, or the int thrown while building the implementation
+ */
+int libradosstriper::RadosStriper::striper_create(librados::IoCtx& ioctx,
+                                                 RadosStriper *striper)
+{
+  int rc = 0;
+  try {
+    striper->rados_striper_impl = new libradosstriper::RadosStriperImpl(ioctx, ioctx.io_ctx_impl);
+    striper->rados_striper_impl->get();
+  } catch (int err) {
+    // an int exception is turned into the return code
+    rc = err;
+  }
+  return rc;
+}
+
+/**
+ * Forwards stripe_unit to setObjectLayoutStripeUnit of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::set_object_layout_stripe_unit
+(unsigned int stripe_unit)
+{
+  const int rc = rados_striper_impl->setObjectLayoutStripeUnit(stripe_unit);
+  return rc;
+}
+
+/**
+ * Forwards stripe_count to setObjectLayoutStripeCount of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::set_object_layout_stripe_count
+(unsigned int stripe_count)
+{
+  const int rc = rados_striper_impl->setObjectLayoutStripeCount(stripe_count);
+  return rc;
+}
+
+/**
+ * Forwards object_size to setObjectLayoutObjectSize of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::set_object_layout_object_size
+(unsigned int object_size)
+{
+  const int rc = rados_striper_impl->setObjectLayoutObjectSize(object_size);
+  return rc;
+}
+
+/**
+ * Forwards to getxattr of the implementation, which fills bl.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::getxattr(const std::string& oid, const char *name, bufferlist& bl)
+{
+  const int rc = rados_striper_impl->getxattr(oid, name, bl);
+  return rc;
+}
+
+/**
+ * Forwards oid, name and bl to setxattr of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::setxattr(const std::string& oid, const char *name, bufferlist& bl)
+{
+  const int rc = rados_striper_impl->setxattr(oid, name, bl);
+  return rc;
+}
+
+/**
+ * Forwards oid and name to rmxattr of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::rmxattr(const std::string& oid, const char *name)
+{
+  const int rc = rados_striper_impl->rmxattr(oid, name);
+  return rc;
+}
+
+/**
+ * Forwards to getxattrs of the implementation, which fills attrset.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::getxattrs(const std::string& oid,
+                                            std::map<std::string, bufferlist>& attrset)
+{
+  const int rc = rados_striper_impl->getxattrs(oid, attrset);
+  return rc;
+}
+
+/**
+ * Forwards soid, bl, len and off to write of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::write(const std::string& soid,
+                                        const bufferlist& bl,
+                                        size_t len,
+                                        uint64_t off)
+{
+  const int rc = rados_striper_impl->write(soid, bl, len, off);
+  return rc;
+}
+
+/**
+ * Forwards soid and bl to write_full of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::write_full(const std::string& soid,
+                                             const bufferlist& bl)
+{
+  const int rc = rados_striper_impl->write_full(soid, bl);
+  return rc;
+}
+
+/**
+ * Forwards soid, bl and len to append of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::append(const std::string& soid,
+                                         const bufferlist& bl,
+                                         size_t len)
+{
+  const int rc = rados_striper_impl->append(soid, bl, len);
+  return rc;
+}
+
+/**
+ * Forwards to aio_write of the implementation, handing it the
+ * implementation part (c->pc) of the given completion.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::aio_write(const std::string& soid,
+                                            librados::AioCompletion *c,
+                                            const bufferlist& bl,
+                                            size_t len,
+                                            uint64_t off)
+{
+  const int rc = rados_striper_impl->aio_write(soid, c->pc, bl, len, off);
+  return rc;
+}
+
+/**
+ * Forwards to aio_write_full of the implementation, handing it the
+ * implementation part (c->pc) of the given completion.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::aio_write_full(const std::string& soid,
+                                                 librados::AioCompletion *c,
+                                                 const bufferlist& bl)
+{
+  const int rc = rados_striper_impl->aio_write_full(soid, c->pc, bl);
+  return rc;
+}
+
+/**
+ * Forwards to aio_append of the implementation, handing it the
+ * implementation part (c->pc) of the given completion.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::aio_append(const std::string& soid,
+                                             librados::AioCompletion *c,
+                                             const bufferlist& bl,
+                                             size_t len)
+{
+  const int rc = rados_striper_impl->aio_append(soid, c->pc, bl, len);
+  return rc;
+}
+
+/**
+ * Empties bl, gives it a freshly created buffer of len bytes and forwards
+ * to read of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::read(const std::string& soid,
+                                       bufferlist* bl,
+                                       size_t len,
+                                       uint64_t off)
+{
+  // any previous content of the caller's bufferlist is dropped
+  bl->clear();
+  bl->push_back(buffer::create(len));
+  const int rc = rados_striper_impl->read(soid, bl, len, off);
+  return rc;
+}
+
+/**
+ * Empties bl, gives it a freshly created buffer of len bytes and forwards
+ * to aio_read of the implementation, handing it the implementation part
+ * (c->pc) of the given completion.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::aio_read(const std::string& soid,
+                                           librados::AioCompletion *c,
+                                           bufferlist* bl,
+                                           size_t len,
+                                           uint64_t off)
+{
+  // any previous content of the caller's bufferlist is dropped
+  bl->clear();
+  bl->push_back(buffer::create(len));
+  const int rc = rados_striper_impl->aio_read(soid, c->pc, bl, len, off);
+  return rc;
+}
+
+/**
+ * Forwards soid, psize and pmtime to stat of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::stat(const std::string& soid, uint64_t *psize, time_t *pmtime)
+{
+  const int rc = rados_striper_impl->stat(soid, psize, pmtime);
+  return rc;
+}
+
+/**
+ * Forwards soid to remove of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::remove(const std::string& soid)
+{
+  const int rc = rados_striper_impl->remove(soid);
+  return rc;
+}
+
+/**
+ * Forwards soid and size to trunc of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::trunc(const std::string& soid, uint64_t size)
+{
+  const int rc = rados_striper_impl->trunc(soid, size);
+  return rc;
+}
+
+/**
+ * Forwards to aio_flush of the implementation.
+ * @return the implementation's return code
+ */
+int libradosstriper::RadosStriper::aio_flush()
+{
+  const int rc = rados_striper_impl->aio_flush();
+  return rc;
+}
+
+/**
+ * Creates a MultiAioCompletion wrapping a new MultiAioCompletionImpl,
+ * without any callback set.
+ */
+libradosstriper::MultiAioCompletion* libradosstriper::RadosStriper::multi_aio_create_completion()
+{
+  MultiAioCompletionImpl *impl = new MultiAioCompletionImpl;
+  MultiAioCompletion *completion = new MultiAioCompletion(impl);
+  return completion;
+}
+
+/**
+ * Creates a MultiAioCompletion around an implementation obtained from
+ * rados_striper_multi_aio_create_completion, to which cb_arg, cb_complete
+ * and cb_safe are passed. That call is asserted to return 0.
+ */
+libradosstriper::MultiAioCompletion*
+libradosstriper::RadosStriper::multi_aio_create_completion(void *cb_arg,
+                                                          librados::callback_t cb_complete,
+                                                          librados::callback_t cb_safe)
+{
+  MultiAioCompletionImpl *impl;
+  int r = rados_striper_multi_aio_create_completion(cb_arg, cb_complete, cb_safe, (void**)&impl);
+  assert(r == 0);
+  MultiAioCompletion *completion = new MultiAioCompletion(impl);
+  return completion;
+}
+
+///////////////////////////// C API //////////////////////////////
+
+/**
+ * C API: creates a striper on top of the given rados_ioctx_t.
+ * On success the handle is stored in *striper via to_rados_striper_t.
+ * @return the return code of RadosStriper::striper_create
+ */
+extern "C" int rados_striper_create(rados_ioctx_t ioctx,
+                                   rados_striper_t *striper)
+{
+  librados::IoCtx ctx;
+  librados::IoCtx::from_rados_ioctx_t(ioctx, ctx);
+  libradosstriper::RadosStriper striperp;
+  const int rc = libradosstriper::RadosStriper::striper_create(ctx, &striperp);
+  if (rc != 0)
+    return rc;
+  // export the implementation as an opaque handle for the C caller
+  libradosstriper::RadosStriper::to_rados_striper_t(striperp, striper);
+  return rc;
+}
+
+/**
+ * C API: calls put() on the RadosStriperImpl behind the handle.
+ */
+extern "C" void rados_striper_destroy(rados_striper_t striper)
+{
+  ((libradosstriper::RadosStriperImpl *)striper)->put();
+}
+
+/**
+ * C API: forwards stripe_unit to setObjectLayoutStripeUnit of the
+ * RadosStriperImpl behind the handle and returns its result.
+ */
+extern "C" int rados_striper_set_object_layout_stripe_unit(rados_striper_t striper,
+                                                          unsigned int stripe_unit)
+{
+  return ((libradosstriper::RadosStriperImpl *)striper)->setObjectLayoutStripeUnit(stripe_unit);
+}
+
+/**
+ * C API: forwards stripe_count to setObjectLayoutStripeCount of the
+ * RadosStriperImpl behind the handle and returns its result.
+ */
+extern "C" int rados_striper_set_object_layout_stripe_count(rados_striper_t striper,
+                                                           unsigned int stripe_count)
+{
+  return ((libradosstriper::RadosStriperImpl *)striper)->setObjectLayoutStripeCount(stripe_count);
+}
+
+/**
+ * C API: forwards object_size to setObjectLayoutObjectSize of the
+ * RadosStriperImpl behind the handle and returns its result.
+ */
+extern "C" int rados_striper_set_object_layout_object_size(rados_striper_t striper,
+                                                          unsigned int object_size)
+{
+  return ((libradosstriper::RadosStriperImpl *)striper)->setObjectLayoutObjectSize(object_size);
+}
+
+/**
+ * C API: appends len bytes of buf to a bufferlist and forwards it to
+ * write of the RadosStriperImpl behind the handle.
+ * @return the implementation's return code
+ */
+extern "C" int rados_striper_write(rados_striper_t striper,
+                                  const char *soid,
+                                  const char *buf,
+                                  size_t len,
+                                  uint64_t off)
+{
+  bufferlist data;
+  data.append(buf, len);
+  return ((libradosstriper::RadosStriperImpl *)striper)->write(soid, data, len, off);
+}
+
+/**
+ * C API: appends len bytes of buf to a bufferlist and forwards it to
+ * write_full of the RadosStriperImpl behind the handle.
+ * @return the implementation's return code
+ */
+extern "C" int rados_striper_write_full(rados_striper_t striper,
+                                       const char *soid,
+                                       const char *buf,
+                                       size_t len)
+{
+  bufferlist data;
+  data.append(buf, len);
+  return ((libradosstriper::RadosStriperImpl *)striper)->write_full(soid, data);
+}
+
+
+/**
+ * C API: appends len bytes of buf to a bufferlist and forwards it to
+ * append of the RadosStriperImpl behind the handle.
+ * @return the implementation's return code
+ */
+extern "C" int rados_striper_append(rados_striper_t striper,
+                                   const char *soid,
+                                   const char *buf,
+                                   size_t len)
+{
+  bufferlist data;
+  data.append(buf, len);
+  return ((libradosstriper::RadosStriperImpl *)striper)->append(soid, data, len);
+}
+
+/**
+ * C API: reads up to len bytes at offset off into buf.
+ * The bufferlist handed to the implementation is backed by a static buffer
+ * created over buf; if the data ends up somewhere else it is copied into buf.
+ * @return the length of the resulting bufferlist on success, -ERANGE if that
+ * length exceeds len, or the negative code returned by the implementation
+ */
+extern "C" int rados_striper_read(rados_striper_t striper,
+                                 const char *soid,
+                                 char *buf,
+                                 size_t len,
+                                 uint64_t off)
+{
+  libradosstriper::RadosStriperImpl *impl = (libradosstriper::RadosStriperImpl *)striper;
+  bufferlist bl;
+  bufferptr userBuf = buffer::create_static(len, buf);
+  bl.push_back(userBuf);
+  int ret = impl->read(soid, &bl, len, off);
+  if (ret < 0)
+    return ret;
+  if (bl.length() > len)
+    return -ERANGE;
+  // only copy when the data did not land in the caller's buffer
+  if (bl.c_str() != buf)
+    bl.copy(0, bl.length(), buf);
+  ret = bl.length();
+  return ret;
+}
+
+/**
+ * C API: forwards soid to remove of the RadosStriperImpl behind the handle
+ * and returns its result.
+ */
+extern "C" int rados_striper_remove(rados_striper_t striper, const char* soid)
+{
+  return ((libradosstriper::RadosStriperImpl *)striper)->remove(soid);
+}
+
+/**
+ * C API: forwards soid and size to trunc of the RadosStriperImpl behind the
+ * handle and returns its result.
+ */
+extern "C" int rados_striper_trunc(rados_striper_t striper, const char* soid, uint64_t size)
+{
+  return ((libradosstriper::RadosStriperImpl *)striper)->trunc(soid, size);
+}
+
+/**
+ * C API: reads the extended attribute name of oid into buf.
+ * @param buf destination buffer, of capacity len
+ * @return the length of the attribute value on success, -ERANGE if the
+ * value does not fit in len bytes, or the negative code returned by the
+ * implementation
+ */
+extern "C" int rados_striper_getxattr(rados_striper_t striper,
+                                     const char *oid,
+                                     const char *name,
+                                     char *buf,
+                                     size_t len)
+{
+  libradosstriper::RadosStriperImpl *impl = (libradosstriper::RadosStriperImpl *)striper;
+  bufferlist bl;
+  int ret = impl->getxattr(oid, name, bl);
+  if (ret >= 0) {
+    // never write more than the caller's buffer can hold
+    if (bl.length() > len)
+      return -ERANGE;
+    bl.copy(0, bl.length(), buf);
+    ret = bl.length();
+  }
+  return ret;
+}
+
+/**
+ * C API: appends len bytes of buf to a bufferlist and forwards it, with the
+ * object name and attribute name, to setxattr of the RadosStriperImpl
+ * behind the handle.
+ * @return the implementation's return code
+ */
+extern "C" int rados_striper_setxattr(rados_striper_t striper,
+                                     const char *oid,
+                                     const char *name,
+                                     const char *buf,
+                                     size_t len)
+{
+  bufferlist value;
+  value.append(buf, len);
+  object_t obj(oid);
+  libradosstriper::RadosStriperImpl *impl = (libradosstriper::RadosStriperImpl *)striper;
+  return impl->setxattr(obj, name, value);
+}
+
+/**
+ * C API: forwards the object name and attribute name to rmxattr of the
+ * RadosStriperImpl behind the handle and returns its result.
+ */
+extern "C" int rados_striper_rmxattr(rados_striper_t striper,
+                                    const char *oid,
+                                    const char *name)
+{
+  object_t obj(oid);
+  return ((libradosstriper::RadosStriperImpl *)striper)->rmxattr(obj, name);
+}
+
+/**
+ * C API: fetches all extended attributes of oid into a new RadosXattrsIter
+ * and hands it out through *iter, positioned on the first attribute.
+ * @return 0 on success; otherwise the non-zero code returned by the
+ * implementation, in which case the iterator is deleted and *iter is left
+ * untouched
+ */
+extern "C" int rados_striper_getxattrs(rados_striper_t striper,
+                                      const char *oid,
+                                      rados_xattrs_iter_t *iter)
+{
+  libradosstriper::RadosStriperImpl *impl = (libradosstriper::RadosStriperImpl *)striper;
+  object_t obj(oid);
+  // plain new reports failure by throwing, so there is no null result to test
+  librados::RadosXattrsIter *it = new librados::RadosXattrsIter();
+  int ret = impl->getxattrs(obj, it->attrset);
+  if (ret) {
+    delete it;
+    return ret;
+  }
+  it->i = it->attrset.begin();
+  *iter = it;
+  return 0;
+}
+
+/**
+ * C API: forwards all arguments to rados_getxattrs_next and returns its
+ * result.
+ */
+extern "C" int rados_striper_getxattrs_next(rados_xattrs_iter_t iter,
+                                           const char **name,
+                                           const char **val,
+                                           size_t *len)
+{
+  const int rc = rados_getxattrs_next(iter, name, val, len);
+  return rc;
+}
+
+/**
+ * C API: forwards the iterator to rados_getxattrs_end.
+ */
+extern "C" void rados_striper_getxattrs_end(rados_xattrs_iter_t iter)
+{
+  rados_getxattrs_end(iter);
+}
+
+/**
+ * C API: forwards soid, psize and pmtime to stat of the RadosStriperImpl
+ * behind the handle and returns its result.
+ */
+extern "C" int rados_striper_stat(rados_striper_t striper,
+                                 const char* soid,
+                                 uint64_t *psize,
+                                 time_t *pmtime)
+{
+  return ((libradosstriper::RadosStriperImpl *)striper)->stat(soid, psize, pmtime);
+}
+
+/**
+ * C API: creates a MultiAioCompletionImpl and stores it in *pc.
+ * cb_complete and cb_safe are only registered when they are non-null;
+ * both are registered with cb_arg.
+ * @return always 0
+ */
+extern "C" int rados_striper_multi_aio_create_completion(void *cb_arg,
+                                                        rados_callback_t cb_complete,
+                                                        rados_callback_t cb_safe,
+                                                        rados_striper_multi_completion_t *pc)
+{
+  libradosstriper::MultiAioCompletionImpl *comp = new libradosstriper::MultiAioCompletionImpl;
+  if (cb_complete) {
+    comp->set_complete_callback(cb_arg, cb_complete);
+  }
+  if (cb_safe) {
+    comp->set_safe_callback(cb_arg, cb_safe);
+  }
+  *pc = comp;
+  return 0;
+}
+
+/**
+ * C API: calls wait_for_complete on the MultiAioCompletionImpl behind c.
+ */
+extern "C" void rados_striper_multi_aio_wait_for_complete(rados_striper_multi_completion_t c)
+{
+  libradosstriper::MultiAioCompletionImpl *comp = (libradosstriper::MultiAioCompletionImpl*)c;
+  comp->wait_for_complete();
+}
+
+/**
+ * C API: calls wait_for_safe on the MultiAioCompletionImpl behind c.
+ */
+extern "C" void rados_striper_multi_aio_wait_for_safe(rados_striper_multi_completion_t c)
+{
+  libradosstriper::MultiAioCompletionImpl *comp = (libradosstriper::MultiAioCompletionImpl*)c;
+  comp->wait_for_safe();
+}
+
+/**
+ * C API: returns the result of is_complete on the MultiAioCompletionImpl
+ * behind c.
+ */
+extern "C" int rados_striper_multi_aio_is_complete(rados_striper_multi_completion_t c)
+{
+  libradosstriper::MultiAioCompletionImpl *comp = (libradosstriper::MultiAioCompletionImpl*)c;
+  return comp->is_complete();
+}
+
+/**
+ * C API: returns the result of is_safe on the MultiAioCompletionImpl
+ * behind c.
+ */
+extern "C" int rados_striper_multi_aio_is_safe(rados_striper_multi_completion_t c)
+{
+  libradosstriper::MultiAioCompletionImpl *comp = (libradosstriper::MultiAioCompletionImpl*)c;
+  return comp->is_safe();
+}
+
+/**
+ * C API: calls wait_for_complete_and_cb on the MultiAioCompletionImpl
+ * behind c.
+ */
+extern "C" void rados_striper_multi_aio_wait_for_complete_and_cb(rados_striper_multi_completion_t c)
+{
+  libradosstriper::MultiAioCompletionImpl *comp = (libradosstriper::MultiAioCompletionImpl*)c;
+  comp->wait_for_complete_and_cb();
+}
+
+/**
+ * C API: calls wait_for_safe_and_cb on the MultiAioCompletionImpl behind c.
+ */
+extern "C" void rados_striper_multi_aio_wait_for_safe_and_cb(rados_striper_multi_completion_t c)
+{
+  libradosstriper::MultiAioCompletionImpl *comp = (libradosstriper::MultiAioCompletionImpl*)c;
+  comp->wait_for_safe_and_cb();
+}
+
+/**
+ * C API: returns the result of is_complete_and_cb on the
+ * MultiAioCompletionImpl behind c.
+ */
+extern "C" int rados_striper_multi_aio_is_complete_and_cb(rados_striper_multi_completion_t c)
+{
+  libradosstriper::MultiAioCompletionImpl *comp = (libradosstriper::MultiAioCompletionImpl*)c;
+  return comp->is_complete_and_cb();
+}
+
+/**
+ * C API: returns the result of is_safe_and_cb on the
+ * MultiAioCompletionImpl behind c.
+ */
+extern "C" int rados_striper_multi_aio_is_safe_and_cb(rados_striper_multi_completion_t c)
+{
+  libradosstriper::MultiAioCompletionImpl *comp = (libradosstriper::MultiAioCompletionImpl*)c;
+  return comp->is_safe_and_cb();
+}
+
+/**
+ * C API: returns the result of get_return_value on the
+ * MultiAioCompletionImpl behind c.
+ */
+extern "C" int rados_striper_multi_aio_get_return_value(rados_striper_multi_completion_t c)
+{
+  libradosstriper::MultiAioCompletionImpl *comp = (libradosstriper::MultiAioCompletionImpl*)c;
+  return comp->get_return_value();
+}
+
+/**
+ * C API: calls put() on the MultiAioCompletionImpl behind c.
+ */
+extern "C" void rados_striper_multi_aio_release(rados_striper_multi_completion_t c)
+{
+  libradosstriper::MultiAioCompletionImpl *comp = (libradosstriper::MultiAioCompletionImpl*)c;
+  comp->put();
+}
+
+/**
+ * C API: appends len bytes of buf to a bufferlist and forwards it to
+ * aio_write of the RadosStriperImpl behind the handle, with the completion
+ * cast to librados::AioCompletionImpl*.
+ * @return the implementation's return code
+ */
+extern "C" int rados_striper_aio_write(rados_striper_t striper,
+                                      const char* soid,
+                                      rados_completion_t completion,
+                                      const char *buf,
+                                      size_t len,
+                                      uint64_t off)
+{
+  bufferlist data;
+  data.append(buf, len);
+  librados::AioCompletionImpl *comp = (librados::AioCompletionImpl*)completion;
+  return ((libradosstriper::RadosStriperImpl *)striper)->aio_write(soid, comp, data, len, off);
+}
+
+/**
+ * C API: appends len bytes of buf to a bufferlist and forwards it to
+ * aio_append of the RadosStriperImpl behind the handle, with the completion
+ * cast to librados::AioCompletionImpl*.
+ * @return the implementation's return code
+ */
+extern "C" int rados_striper_aio_append(rados_striper_t striper,
+                                       const char* soid,
+                                       rados_completion_t completion,
+                                       const char *buf,
+                                       size_t len)
+{
+  bufferlist data;
+  data.append(buf, len);
+  librados::AioCompletionImpl *comp = (librados::AioCompletionImpl*)completion;
+  return ((libradosstriper::RadosStriperImpl *)striper)->aio_append(soid, comp, data, len);
+}
+
+/**
+ * C API: appends len bytes of buf to a bufferlist and forwards it to
+ * aio_write_full of the RadosStriperImpl behind the handle, with the
+ * completion cast to librados::AioCompletionImpl*.
+ * @return the implementation's return code
+ */
+extern "C" int rados_striper_aio_write_full(rados_striper_t striper,
+                                           const char* soid,
+                                           rados_completion_t completion,
+                                           const char *buf,
+                                           size_t len)
+{
+  bufferlist data;
+  data.append(buf, len);
+  librados::AioCompletionImpl *comp = (librados::AioCompletionImpl*)completion;
+  return ((libradosstriper::RadosStriperImpl *)striper)->aio_write_full(soid, comp, data);
+}
+
+/**
+ * C API: forwards soid, buf, len and off to aio_read of the
+ * RadosStriperImpl behind the handle, with the completion cast to
+ * librados::AioCompletionImpl*.
+ * @return the implementation's return code
+ */
+extern "C" int rados_striper_aio_read(rados_striper_t striper,
+                                     const char *soid,
+                                     rados_completion_t completion,
+                                     char *buf,
+                                     size_t len,
+                                     uint64_t off)
+{
+  librados::AioCompletionImpl *comp = (librados::AioCompletionImpl*)completion;
+  return ((libradosstriper::RadosStriperImpl *)striper)->aio_read(soid, comp, buf, len, off);
+}
+
+/**
+ * C API: calls aio_flush on the RadosStriperImpl behind the handle.
+ * The value returned by the implementation is not propagated.
+ */
+extern "C" void rados_striper_aio_flush(rados_striper_t striper)
+{
+  ((libradosstriper::RadosStriperImpl *)striper)->aio_flush();
+}