MDS_OBJS= \
mds/MDS.o\
mds/MDCache.o\
+ mds/Migrator.o\
mds/MDBalancer.o\
mds/CDentry.o\
mds/CDir.o\
assert(fd >= 0);
struct stat st;
::fstat(fd, &st);
- bufferptr bp = new buffer(st.st_size);
+ bufferptr bp(st.st_size);
bl.append(bp);
::read(fd, (void*)bl.c_str(), bl.length());
::close(fd);
+
+
// ceph stuff
#include "Client.h"
// time it.
utime_t start = g_clock.now();
+ // copy into fresh buffer (since our write may be resub, async)
+ bufferptr bp = buffer::copy(buf, size);
+ bufferlist blist;
+ blist.push_back( bp );
+
if (g_conf.client_oc) { // buffer cache ON?
assert(objectcacher);
- bufferlist blist;
- blist.push_back( new buffer(buf, size) );
-
// write (this may block!)
in->fc.write(offset, size, blist, client_lock);
// legacy, inconsistent synchronous write.
dout(7) << "synchronous write" << endl;
- // copy into fresh buffer (since our write may be resub, async)
- bufferlist blist;
- blist.push_back( new buffer(buf, size) );
-
// prepare write
Cond cond;
bool done = false;
#ifndef __CLIENT_H
#define __CLIENT_H
-extern "C" {
-#include <dirent.h>
-}
#include "mds/MDSMap.h"
#include "osd/OSDMap.h"
*/
-
-#ifndef _Cond_Posix_
-#define _Cond_Posix_
+#ifndef __COND_H
+#define __COND_H
#include <time.h>
#include <pthread.h>
#include <cassert>
-class Cond
-{
- mutable pthread_cond_t C;
+class Cond {
+ // my bits
+ pthread_cond_t _c;
+ // don't allow copying.
void operator=(Cond &C) {}
Cond( const Cond &C ) {}
public:
-
Cond() {
- int r = pthread_cond_init(&C,NULL);
+ int r = pthread_cond_init(&_c,NULL);
assert(r == 0);
}
-
virtual ~Cond() {
- pthread_cond_destroy(&C);
+ pthread_cond_destroy(&_c);
}
int Wait(Mutex &mutex) {
- int r = pthread_cond_wait(&C, &mutex.M);
+ int r = pthread_cond_wait(&_c, &mutex._m);
return r;
}
int Wait(Mutex &mutex, char* s) {
- cout << "Wait: " << s << endl;
- int r = pthread_cond_wait(&C, &mutex.M);
+ //cout << "Wait: " << s << endl;
+ int r = pthread_cond_wait(&_c, &mutex._m);
return r;
}
struct timespec ts;
g_clock.make_timespec(when, &ts);
//cout << "timedwait for " << ts.tv_sec << " sec " << ts.tv_nsec << " nsec" << endl;
- int r = pthread_cond_timedwait(&C, &mutex.M, &ts);
+ int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
return r;
}
int WaitInterval(Mutex &mutex, utime_t interval) {
}
int Signal() { // NOTE: despite the name this calls pthread_cond_broadcast, i.e. it behaves exactly like SignalAll(); SignalOne() is the variant that calls pthread_cond_signal
- //int r = pthread_cond_signal(&C);
- int r = pthread_cond_broadcast(&C);
+ //int r = pthread_cond_signal(&_c);
+ int r = pthread_cond_broadcast(&_c);
return r; // pthread return code, passed through unchecked
}
int SignalOne() {
- int r = pthread_cond_signal(&C);
+ int r = pthread_cond_signal(&_c);
return r;
}
int SignalAll() {
- //int r = pthread_cond_signal(&C);
- int r = pthread_cond_broadcast(&C);
+ //int r = pthread_cond_signal(&_c);
+ int r = pthread_cond_broadcast(&_c);
return r;
}
};
}
};
-#endif // !_Cond_Posix_
+#endif
*
*/
-
-/////////////////////////////////////////////////////////////////////
-// Written by Phillip Sitbon
-// Copyright 2003
-//
-// Posix/Mutex.h
-// - Resource locking mechanism using Posix mutexes
-//
-/////////////////////////////////////////////////////////////////////
-
-#ifndef _Mutex_Posix_
-#define _Mutex_Posix_
+#ifndef __MUTEX_H
+#define __MUTEX_H
#include <pthread.h>
#include <cassert>
-class Mutex
-{
- mutable pthread_mutex_t M;
+class Mutex {
+private:
+ pthread_mutex_t _m;
+ int nlock;
+ bool recursive;
+
+ // don't allow copying.
void operator=(Mutex &M) {}
Mutex( const Mutex &M ) {}
- bool tag;
- int locked;
-
- public:
-
- Mutex() : tag(false), locked(0)
- {
- pthread_mutexattr_t attr;
- pthread_mutexattr_init(&attr);
- pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE);
- pthread_mutex_init(&M,&attr);
- //cout << this << " mutex init = " << r << endl;
- pthread_mutexattr_destroy(&attr);
- }
- Mutex(bool t) : tag(t)
- {
- assert(0);
- pthread_mutexattr_t attr;
- pthread_mutexattr_init(&attr);
- pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE);
- pthread_mutex_init(&M,&attr);
- //cout << this << " mutex init = " << r << endl;
- pthread_mutexattr_destroy(&attr);
+public:
+  /*
+   * Create the mutex with a zero hold count.
+   *
+   *  r - true (the default) builds a PTHREAD_MUTEX_RECURSIVE mutex;
+   *      false builds a mutex with default (NULL) attributes.
+   *
+   * Every pthread return code is asserted, matching Cond() and Lock(),
+   * so a failed initialization is caught here rather than at first use.
+   */
+  Mutex(bool r = true) : nlock(0), recursive(r) {
+    int rc;
+    if (recursive) {
+      pthread_mutexattr_t attr;
+      rc = pthread_mutexattr_init(&attr);
+      assert(rc == 0);
+      rc = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+      assert(rc == 0);
+      rc = pthread_mutex_init(&_m, &attr);
+      pthread_mutexattr_destroy(&attr);  // attr is only needed during init
+    } else {
+      rc = pthread_mutex_init(&_m, NULL);
+    }
+    assert(rc == 0);
+    (void)rc;  // rc is otherwise unused when assert() is compiled out
}
-
- bool is_locked() { return locked > 0; }
- int get_lock_count() { return locked; }
-
- virtual ~Mutex()
- {
- if (locked < 0) cerr << "Mutex(" << this << "," << pthread_self() << ").destructor locked = " << locked << " < 0" << endl;
- //pthread_mutex_unlock(&M);
- pthread_mutex_destroy(&M);
- }
-
- int Lock() {
- int t = tag;
- if (t) cout << this << " " << pthread_self() << endl;
- int r = pthread_mutex_lock(&M);
- if (t) cout << "lock = " << r << endl;
- locked++;
- return r;
- }
-
- int Lock(char *s) {
- cout << "Lock: " << s << endl;
- int r = pthread_mutex_lock(&M);
- cout << this << " " << pthread_self() << " lock = " << r << endl;
- locked++;
- return r;
+ virtual ~Mutex() {
+ assert(nlock == 0);
+ pthread_mutex_destroy(&_m);
}
- int Lock_Try() const
- {
- return pthread_mutex_trylock(&M);
+ bool is_locked() {
+ return (nlock > 0);
}
- int Unlock()
- {
- int t = tag;
- locked--;
- if (locked < 0) cerr << "Mutex(" << this << "," << pthread_self() << ").Unlock locked = " << locked << " < 0" << endl;
- if (t) cout << this << " " << pthread_self() << endl;
- int r = pthread_mutex_unlock(&M);
- if (t) cout << "lock = " << r << endl;
- return r;
+ void Lock() { // acquire _m (asserting success), then bump the hold count
+ int r = pthread_mutex_lock(&_m);
+ assert(r == 0);
+ nlock++; // only modified after _m has been acquired
+ assert(nlock == 1 || recursive); // a non-recursive mutex must never be held twice. NOTE(review): Cond::Wait passes _m to pthread_cond_wait without adjusting nlock, so this could fire for a non-recursive Mutex used with a Cond -- confirm
}
- int Unlock(char *s)
- {
- cout << "Unlock: " << s << endl;
- locked--;
- if (locked < 0) cerr << "Mutex(" << this << "," << pthread_self() << ").Unlock locked = " << locked << " < 0" << endl;
- int r = pthread_mutex_unlock(&M);
- cout << this << " " << pthread_self() << " unlock = " << r << endl;
- return r;
+ void Unlock() { // drop one hold on the mutex; asserts that it is currently held
+ assert(nlock > 0); // catches unlocking a mutex that is not held
+ --nlock; // decremented before pthread_mutex_unlock, i.e. while _m is still held
+ int r = pthread_mutex_unlock(&_m);
+ assert(r == 0);
}
friend class Cond;
};
-#endif // !_Mutex_Posix_
+#endif
if (!pending.empty()) {
sleeping = false;
lock.Unlock();
- { // make sure we're not holding any locks while we do callbacks (or talk to the messenger)
+ { // make sure we're not holding any locks while we do callbacks
// make the callbacks myself.
for (list<Context*>::iterator cit = pending.begin();
cit != pending.end();
} else {
dout(DBL) << "register_timer starting thread" << endl;
timer_thread.create();
- //pthread_create(&thread_id, NULL, timer_thread_entrypoint, (void*)this);
}
}
dout(10) << "waiting for thread to finish" << endl;
void *ptr;
- timer_thread.join(&ptr);//pthread_join(thread_id, &ptr);
+ timer_thread.join(&ptr);
dout(10) << "thread finished, exit code " << ptr << endl;
}
public:
Timer() :
- //thread_id0),
thread_stop(false),
timed_sleep(false),
sleeping(false),
{
}
~Timer() {
+ // stop.
+ cancel_timer();
+
// scheduled
for (map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
it != scheduled.end();
cancel_timer();
}
- /*
- void set_messenger_kicker(Context *c);
- void unset_messenger_kicker();
-
- void set_messenger(Messenger *m);
- void unset_messenger();
- */
-
// schedule events
void add_event_after(float seconds,
Context *callback);
assert(fd >= 0);
struct stat st;
::fstat(fd, &st);
- bufferptr bp = new buffer(st.st_size);
+ bufferptr bp(st.st_size);
bl.append(bp);
::read(fd, (void*)bl.c_str(), bl.length());
::close(fd);
//#include <set>
using namespace std;
-#include "include/bufferlist.h"
+#include "include/buffer.h"
namespace crush {
#include <math.h>
-#include "include/bufferlist.h"
+#include "include/buffer.h"
namespace crush {
#include "Bucket.h"
-#include "include/bufferlist.h"
+#include "include/buffer.h"
namespace crush {
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __EBOFS_BUFFERPOOL_H
-#define __EBOFS_BUFFERPOOL_H
-
-
-#include <iostream>
-#include <list>
-using namespace std;
-
-// for posix_memalign
-#define _XOPEN_SOURCE 600
-#include <stdlib.h>
-#include <malloc.h>
-
-// for mmap
-#include <sys/mman.h>
-
-#include "include/buffer.h"
-#include "include/bufferlist.h"
-
-#include "config.h"
-
-
-
-class AlignedBufferPool {
- int alignment; // err, this isn't actually enforced! we just use mmap.
-
- bool dommap;
-
- off_t talloc;
-
- public:
- AlignedBufferPool(int a) : alignment(a), dommap(true), talloc(0) {}
- ~AlignedBufferPool() {
- }
-
-#ifdef __CYGWIN__
-
- bufferptr alloc(size_t bytes) {
- // overallocate.
- bufferptr bp = new buffer(bytes+4095);
- int off = (unsigned)bp.c_str() % 4096;
- if (off)
- bp.set_offset(4096-off);
- bp.set_length(bytes);
- return bp;
- }
-
- bufferptr alloc_page() {
- return alloc(4096);
- }
-
- void alloc(size_t bytes, bufferlist& bl) {
- bl.clear();
-
- // keep allocations reasonably small to avoid fragmenting memory
- while (bytes > 0) {
- size_t max = MIN(bytes, g_conf.ebofs_abp_max_alloc);
- bufferptr bp = alloc(max);
- bl.push_back( bp );
- bytes -= max;
- }
- }
-
-#else
-
- void free(char *p, unsigned len) {
- dout(10) << "bufferpool(" << (void*)this << ").free " << (void*)p << " len " << len << " ... total " << talloc << endl;
- talloc -= len;
- if (dommap)
- ::munmap(p, len);
- else
- ::free((void*)p);
- }
-
- static void aligned_buffer_free_func(void *arg, char *ptr, unsigned len) {
- AlignedBufferPool *pool = (AlignedBufferPool*)arg;
- pool->free(ptr, len);
- }
-
- buffer *alloc(size_t bytes) {
- assert(bytes % alignment == 0);
- char *p = 0;
-
- // use mmap or posix_memalign.
- if (dommap)
- p = (char*)::mmap(NULL, bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
- else
- ::posix_memalign((void**)&p, alignment, bytes);
- assert(p);
-
- talloc += bytes;
-
- if (g_conf.ebofs_abp_zero)
- ::bzero(p, bytes); // only to shut up valgrind
-
- dout(10) << "bufferpool(" << (void*)this << ").alloc " << (void*)p << " len " << bytes << " ... total " << talloc << endl;
-
- return
- new buffer(p, bytes, BUFFER_MODE_NOCOPY|BUFFER_MODE_NOFREE|BUFFER_MODE_CUSTOMFREE,
- bytes,
- aligned_buffer_free_func, this);
- }
-
- // allocate a single buffer
- buffer* alloc_page() {
- return alloc(alignment);
- }
-
-
- // bufferlists
- void alloc(size_t bytes, bufferlist& bl) {
- bl.clear();
-
- // keep allocations reasonably small to avoid fragmenting memory
- while (bytes > 0) {
- size_t max = MIN(bytes, g_conf.ebofs_abp_max_alloc);
- bl.push_back( alloc(max) );
- bytes -= max;
- }
- }
-
-#endif
-
-};
-
-
-#endif
struct iovec iov[ bl.buffers().size() ];
int n = 0;
size_t left = len;
- for (list<bufferptr>::iterator i = bl.buffers().begin();
+ for (list<bufferptr>::const_iterator i = bl.buffers().begin();
i != bl.buffers().end();
i++) {
assert(i->length() % EBOFS_BLOCK_SIZE == 0);
- iov[n].iov_base = i->c_str();
+ iov[n].iov_base = (void*)i->c_str();
iov[n].iov_len = MIN(left, i->length());
left -= iov[n].iov_len;
int n = 0;
size_t left = len;
- for (list<bufferptr>::iterator i = bl.buffers().begin();
+ for (list<bufferptr>::const_iterator i = bl.buffers().begin();
i != bl.buffers().end();
i++) {
assert(i->length() % EBOFS_BLOCK_SIZE == 0);
- iov[n].iov_base = i->c_str();
+ iov[n].iov_base = (void*)i->c_str();
iov[n].iov_len = MIN(left, i->length());
assert((((unsigned long long)iov[n].iov_base) & 4095ULL) == 0);
#ifndef __EBOFS_BLOCKDEVICE_H
#define __EBOFS_BLOCKDEVICE_H
-#include "include/bufferlist.h"
+#include "include/buffer.h"
#include "include/interval_set.h"
#include "include/Context.h"
#include "common/Mutex.h"
// apply partial to myself
assert(bh->data.length() == 0);
- bufferptr bp = bc->bufferpool.alloc(EBOFS_BLOCK_SIZE);
+ bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
bh->data.push_back( bp );
bh->data.copy_in(0, EBOFS_BLOCK_SIZE, bl);
bh->apply_partial();
bh->start(), bh->length(),
ex.start);
- bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl); // new buffers!
+ //bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl); // new buffers!
+ fin->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
bh->rx_ioh = dev.read(ex.start, ex.length, fin->bl,
fin);
// make the combined block
bufferlist combined;
- bufferptr bp = oc->bc->bufferpool.alloc(EBOFS_BLOCK_SIZE);
+ bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
combined.push_back( bp );
combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl);
BufferHead::apply_partial( combined, p->second.partial );
#include "common/Clock.h"
#include "types.h"
-#include "AlignedBufferPool.h"
#include "BlockDevice.h"
#include "include/interval_set.h"
public:
Mutex &ebofs_lock; // hack: this is a ref to global ebofs_lock
BlockDevice &dev;
- AlignedBufferPool &bufferpool;
set<BufferHead*> dirty_bh;
map<block_t, map<block_t, PartialWrite> > partial_write; // queued writes w/ partial content
public:
- BufferCache(BlockDevice& d, AlignedBufferPool& bp, Mutex& el) :
- ebofs_lock(el), dev(d), bufferpool(bp),
+ BufferCache(BlockDevice& d, Mutex& el) :
+ ebofs_lock(el), dev(d),
stat_waiter(0),
stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0)
{}
dout(2) << "mounting " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl;
// read super
- bufferptr bp1 = bufferpool.alloc(EBOFS_BLOCK_SIZE);
- bufferptr bp2 = bufferpool.alloc(EBOFS_BLOCK_SIZE);
+ bufferptr bp1 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+ bufferptr bp2 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
dev.read(0, 1, bp1);
dev.read(1, 1, bp2);
sb.nodepool.node_usemap_odd = nodepool.usemap_odd;
// put in a buffer
- bp = bufferpool.alloc(EBOFS_BLOCK_SIZE);
+ bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
memcpy(bp.c_str(), (const char*)&sb, sizeof(sb));
}
// read it!
bufferlist bl;
- bufferpool.alloc( EBOFS_BLOCK_SIZE*onode_loc.length, bl );
+ bl.push_back( buffer::create_page_aligned( EBOFS_BLOCK_SIZE*onode_loc.length ) );
ebofs_lock.Unlock();
dev.read( onode_loc.start, onode_loc.length, bl );
p += key.length() + 1;
int len = *(int*)(p);
p += sizeof(len);
- on->attr[key] = new buffer(p, len);
+ on->attr[key] = buffer::copy(p, len);
p += len;
dout(15) << "get_onode " << *on << " attr " << key << " len " << len << endl;
}
unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1;
bufferlist bl;
- bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl );
+ bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) );
// (always) relocate onode
if (1) {
// read it!
bufferlist bl;
- bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl );
+ //bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl );
+ bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*cnode_loc.length) );
ebofs_lock.Unlock();
dev.read( cnode_loc.start, cnode_loc.length, bl );
p += key.length() + 1;
int len = *(int*)(p);
p += sizeof(len);
- cn->attr[key] = new buffer(p, len);
+ cn->attr[key] = buffer::copy(p, len);
p += len;
dout(15) << "get_cnode " << *cn << " attr " << key << " len " << len << endl;
}
unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1;
bufferlist bl;
- bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl );
+ //bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl );
+ bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) );
// (always) relocate cnode!
if (1) {
dout(10) << "apply_write tx pending, copying buffer on " << *bh << endl;
bufferlist temp;
temp.claim(bh->data);
- bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data);
+ //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data);
+ bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp);
}
<< endl;
unsigned z = MIN( zleft, len_in_bh );
if (z) {
+ bufferptr zp(z);
+ zp.zero();
bufferlist zb;
- zb.push_back(new buffer(z));
- zb.zero();
+ zb.push_back(zp);
bh->add_partial(off_in_bh, zb);
zleft -= z;
opos += z;
if (bh->partial_is_complete(on->object_size - bh->start()*EBOFS_BLOCK_SIZE)) {
dout(10) << "apply_write completed partial " << *bh << endl;
- bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); // new buffers!
+ //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); // new buffers!
+ bh->data.clear();
+ bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
bh->data.zero();
bh->apply_partial();
bc.mark_dirty(bh);
// FIXME: only do the modified pages? this might be a big bh!
bufferlist temp;
temp.claim(bh->data);
- bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data);
+ //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data);
+ bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp);
unsigned z = MIN( zleft, len_in_bh );
if (z) {
+ bufferptr zp(z);
+ zp.zero();
bufferlist zb;
- zb.push_back(new buffer(z));
- zb.zero();
+ zb.push_back(zp);
bh->data.copy_in(off_in_bh, z, zb);
zleft -= z;
opos += z;
opos+(off_t)(zleft+left) == on->object_size);
// alloc new buffers.
- bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data);
+ //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data);
+ bh->data.clear();
+ bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
// copy!
unsigned len_in_bh = MIN(bh->length()*EBOFS_BLOCK_SIZE, zleft+left);
unsigned z = MIN(len_in_bh, zleft);
if (z) {
+ bufferptr zp(z);
+ zp.zero();
bufferlist zb;
- zb.push_back(new buffer(z));
- zb.zero();
+ zb.push_back(zp);
bh->data.copy_in(0, z, zb);
zleft -= z;
}
if (!on) return -ENOENT;
string n(name);
- on->attr[n] = new buffer((char*)value, size);
+ on->attr[n] = buffer::copy((char*)value, size);
dirty_onode(on);
put_onode(on);
if (!cn) return -ENOENT;
string n(name);
- cn->attr[n] = new buffer((char*)value, size);
+ cn->attr[n] = buffer::copy((char*)value, size);
dirty_cnode(cn);
put_cnode(cn);
*/
+#include <map>
+using namespace std;
#include <ext/hash_map>
using namespace __gnu_cxx;
#include "include/Context.h"
-#include "include/bufferlist.h"
+#include "include/buffer.h"
template<typename U,typename V>
inline ostream& operator<<(ostream& out, const pair<U,V>& p) {
#include "nodes.h"
#include "Allocator.h"
#include "Table.h"
-#include "AlignedBufferPool.h"
#include "common/Mutex.h"
#include "common/Cond.h"
block_t get_limbo_extents() { return limbo_tab->get_num_keys(); }
- // ** buffers **
- AlignedBufferPool bufferpool;
-
-
// ** tables and sets **
// nodes
NodePool nodepool; // for all tables...
commit_thread(this),
free_blocks(0), limbo_blocks(0),
allocator(this),
- bufferpool(EBOFS_BLOCK_SIZE),
nodepool(ebofs_lock),
object_tab(0), limbo_tab(0), collection_tab(0), co_tab(0),
onode_lru(g_conf.ebofs_oc_size),
cnode_lru(g_conf.ebofs_cc_size),
inodes_flushing(0),
- bc(dev, bufferpool, ebofs_lock),
+ bc(dev, ebofs_lock),
idle_kicker(this),
finisher_stop(false), finisher_thread(this) {
for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
#include "types.h"
#include "BlockDevice.h"
-#include "AlignedBufferPool.h"
/*
void set_state(int s) { state = s; }
- void make_shadow(AlignedBufferPool& bufferpool) {
+ void make_shadow() {
assert(is_tx());
shadow_bptr = bptr;
// new buffer
- bptr = bufferpool.alloc(EBOFS_NODE_BYTES);
+ bptr = buffer::create_page_aligned(EBOFS_NODE_BYTES);
nrecs = (int*)(bptr.c_str());
type = (int*)(bptr.c_str() + sizeof(*nrecs));
class NodePool {
protected:
- AlignedBufferPool bufferpool; // our own memory allocator for node buffers
-
map<nodeid_t, Node*> node_map; // open node map
public:
public:
NodePool(Mutex &el) :
- bufferpool(EBOFS_NODE_BYTES),
num_nodes(0),
ebofs_lock(el),
flushing(0) {}
else
loc = usemap_even;
- bufferptr bp = bufferpool.alloc(EBOFS_BLOCK_SIZE*loc.length);
+ bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length);
dev.read(loc.start, loc.length, bp);
// parse
if (!clean.count(nid)) continue;
debofs(20) << "ebofs.nodepool.read node " << nid << endl;
- bufferptr bp = bufferpool.alloc(EBOFS_NODE_BYTES);
+ bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES);
dev.read(region_loc[r].start + (block_t)boff, EBOFS_NODE_BLOCKS,
bp);
else
loc = usemap_even;
- bufferptr bp = bufferpool.alloc(EBOFS_BLOCK_SIZE*loc.length);
+ bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length);
// fill in
unsigned region = 0; // current region
debofs(15) << "ebofs.nodepool.new_node " << nid << endl;
// alloc node
- bufferptr bp = bufferpool.alloc(EBOFS_NODE_BYTES);
+ bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES);
Node *n = new Node(nid, bp, Node::STATE_DIRTY);
n->set_type(type);
n->set_size(0);
tx.erase(oldid);
// move/copy current -> shadow buffer as necessary
- n->make_shadow(bufferpool);
+ n->make_shadow();
}
limbo.insert(oldid);
node_map.erase(oldid);
*
*/
-
#ifndef __BUFFER_H
#define __BUFFER_H
-#include <cassert>
-#include <string.h>
-
-#include <iostream>
-using namespace std;
-
-// bit masks
-#define BUFFER_MODE_NOCOPY 0
-#define BUFFER_MODE_COPY 1 // copy on create, my buffer
+#include "common/Mutex.h"
-#define BUFFER_MODE_NOFREE 0
-#define BUFFER_MODE_FREE 2
+#include <ostream>
+#include <list>
-#define BUFFER_MODE_CUSTOMFREE 4
+#ifndef __CYGWIN__
+# include <sys/mman.h>
+#endif
-#define BUFFER_MODE_DEFAULT 3//(BUFFER_MODE_COPY|BUFFER_MODE_FREE)
+#define BUFFER_PAGE_SIZE 4096 // fixme.
+// <hack>
+// these are in config.o
+extern Mutex bufferlock;
+extern long buffer_total_alloc;
+// </hack>
-// debug crap
-#include "config.h"
-#define bdbout(x) if (x <= g_conf.debug_buffer) cout
+class buffer {
+private:
+
+ /* hack for memory utilization debugging. */
+ static void inc_total_alloc(unsigned len) {
+ bufferlock.Lock();
+ buffer_total_alloc += len;
+ bufferlock.Unlock();
+ }
+ static void dec_total_alloc(unsigned len) {
+ bufferlock.Lock();
+ buffer_total_alloc -= len;
+ bufferlock.Unlock();
+ }
-#include "common/Mutex.h"
+ /*
+ * an abstract raw buffer. with a reference count.
+ */
+  class raw {
+  public:
+    char *data;     // start of the storage; set by the constructor or by the subclass
+    unsigned len;   // size of the storage in bytes
+    int nref;       // reference count, incremented/decremented by buffer::ptr
+    Mutex lock;     // we'll make it non-recursive.  ptr takes it around nref updates.
+
+    // Size-only constructor: the concrete subclass allocates the storage and
+    // assigns data itself.  data starts out null so that nothing ever reads an
+    // indeterminate pointer before that assignment happens.
+    raw(unsigned l) : data(0), len(l), nref(0), lock(false) {}
+    // Wrap storage that already exists at c.
+    raw(char *c, unsigned l) : data(c), len(l), nref(0), lock(false) {}
+    virtual ~raw() {};
+
+    // Return a newly allocated raw of the same length; contents are not copied.
+    virtual raw* clone_empty() = 0;
+    // Return a newly allocated raw holding a copy of this buffer's len bytes.
+    raw *clone() {
+      raw *c = clone_empty();
+      memcpy(c->data, data, len);
+      return c;
+    }
+  };
+
+ friend std::ostream& operator<<(std::ostream& out, const raw &r);
+
+ /*
+ * primitive buffer types
+ */
+ class raw_char : public raw {
+ public:
+ raw_char(unsigned l) : raw(l) {
+ data = new char[len];
+ inc_total_alloc(len);
+ }
+ ~raw_char() {
+ delete[] data;
+ dec_total_alloc(len);
+ }
+ raw* clone_empty() {
+ return new raw_char(len);
+ }
+ };
+
+ class raw_static : public raw {
+ public:
+ raw_static(const char *d, unsigned l) : raw((char*)d, l) { }
+ ~raw_static() {}
+ raw* clone_empty() {
+ return new raw_char(len);
+ }
+ };
+
+#ifndef __CYGWIN__
+  /*
+   * raw buffer backed by an anonymous private mmap() of len bytes.
+   */
+  class raw_mmap_pages : public raw {
+  public:
+    raw_mmap_pages(unsigned l) : raw(l) {
+      void *p = ::mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+      // mmap reports failure as MAP_FAILED, not NULL; storing that value in
+      // data would hand an invalid pointer to every user of this buffer.
+      assert(p != MAP_FAILED);
+      data = (char*)p;
+      inc_total_alloc(len);
+    }
+    ~raw_mmap_pages() {
+      ::munmap(data, len);
+      dec_total_alloc(len);
+    }
+    // same kind and size of buffer, contents not copied.
+    raw* clone_empty() {
+      return new raw_mmap_pages(len);
+    }
+  };
-// HACK: in config.cc
-/*
- * WARNING: bufferlock placements are tricky for efficiency. note that only bufferptr and
- * buffer ever use buffer._ref, and only bufferptr should call ~buffer().
- *
- * So, I only need to protect:
- * - buffer()'s modification of buffer_total_alloc
- * - ~bufferptr() check of buffer._ref, and ~buffer's mod of buffer_total_alloc
- *
- * I don't protect
- * - buffer._get() .. increment is atomic on any sane architecture
- * - buffer._put() .. only called by ~bufferptr.
- * - ~buffer .. only called by ~bufferptr *** I HOPE!!
- */
-extern Mutex bufferlock;
-extern long buffer_total_alloc;
+  /*
+   * raw buffer of len bytes obtained from posix_memalign(), aligned to
+   * BUFFER_PAGE_SIZE and released with free().
+   */
+  class raw_posix_aligned : public raw {
+  public:
+    raw_posix_aligned(unsigned l) : raw(l) {
+      // posix_memalign reports failure through its return value and leaves
+      // the output pointer unspecified, so check it before using the result.
+      void *p = 0;
+      int r = ::posix_memalign(&p, BUFFER_PAGE_SIZE, len);
+      assert(r == 0);
+      (void)r;  // r is otherwise unused when assert() is compiled out
+      data = (char*)p;
+      inc_total_alloc(len);
+    }
+    ~raw_posix_aligned() {
+      ::free((void*)data);
+      dec_total_alloc(len);
+    }
+    // same kind and size of buffer, contents not copied.
+    raw* clone_empty() {
+      return new raw_posix_aligned(len);
+    }
+  };
+#endif
+  /*
+   * raw buffer aligned to 4096 bytes without mmap/posix_memalign: allocate
+   * 4095 spare bytes with new[] and point data at the first 4096-aligned
+   * address inside that allocation.
+   */
+  class raw_hack_aligned : public raw {
+    char *realdata;   // what new[] actually returned; this is what gets deleted
+  public:
+    raw_hack_aligned(unsigned l) : raw(l) {
+      realdata = new char[len+4095];
+      // the offset must come from the allocation we just made (data is not
+      // set yet), and the pointer is converted through a pointer-sized
+      // integer type rather than a 32-bit unsigned.
+      unsigned long off = (unsigned long)realdata % 4096;
+      if (off)
+        data = realdata + 4096 - off;
+      else
+        data = realdata;
+      inc_total_alloc(len+4095);
+    }
+    ~raw_hack_aligned() {
+      delete[] realdata;
+      dec_total_alloc(len+4095);
+    }
+    // same kind and size of buffer, contents not copied.
+    raw* clone_empty() {
+      return new raw_hack_aligned(len);
+    }
+  };
-typedef void (buffer_free_func_t)(void*,char*,unsigned);
+public:
+ /*
+ * named constructors
+ */
-/*
- * buffer - the underlying buffer container. with a reference count.
- *
- * the buffer never shrinks.
- *
- * some invariants:
- * _len never shrinks
- * _len <= _alloc_len
- */
-class buffer {
- protected:
- //wtf
- //static Mutex bufferlock;
- //static long buffer_total_alloc;// = 0;
-
- private:
- // raw buffer alloc
- char *_dataptr;
- bool _myptr;
- unsigned _len;
- unsigned _alloc_len;
-
- // ref counts
- unsigned _ref;
- int _get() {
- bdbout(1) << "buffer.get " << *this << " get " << _ref+1 << endl;
- return ++_ref;
+ static raw* copy(const char *c, unsigned len) {
+ raw* r = new raw_char(len);
+ memcpy(r->data, c, len);
+ return r;
}
- int _put() {
- bdbout(1) << "buffer.put " << *this << " put " << _ref-1 << endl;
- assert(_ref > 0);
- return --_ref;
+ static raw* create(unsigned len) {
+ return new raw_char(len);
}
- // custom (de!)allocator
- buffer_free_func_t *free_func;
- void *free_func_arg;
+ static raw* create_page_aligned(unsigned len) {
+#ifndef __CYGWIN__
+ return new raw_mmap_pages(len);
+#else
+ return new raw_hack_aligned(len);
+#endif
+ }
- friend class bufferptr;
+
+ /*
+ * a buffer pointer. references (a subsequence of) a raw buffer.
+ */
+ class ptr {
+ raw *_raw;
+ unsigned _off, _len;
+
+ public:
+ ptr() : _raw(0), _off(0), _len(0) {}
+ ptr(raw *r) : _raw(r), _off(0), _len(r->len) { // no lock needed; this is an unref raw.
+ ++r->nref;
+ }
+ ptr(unsigned l) : _off(0), _len(l) {
+ _raw = create(l);
+ ++_raw->nref;
+ }
+ ptr(char *d, unsigned l) : _off(0), _len(l) { // ditto.
+ _raw = copy(d, l);
+ ++_raw->nref;
+ }
+ ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) {
+ if (_raw) {
+ _raw->lock.Lock();
+ ++_raw->nref;
+ _raw->lock.Unlock();
+ }
+ }
+ ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) {
+ assert(o+l <= p._len);
+ assert(_raw);
+ _raw->lock.Lock();
+ ++_raw->nref;
+ _raw->lock.Unlock();
+ }
+ ptr& operator= (const ptr& p) { // make this ptr reference the same raw, offset and length as p
+ // be careful -- we need to properly handle self-assignment.
+ if (p._raw) {
+ p._raw->lock.Lock();
+ ++p._raw->nref; // take the new reference first: on self-assignment this keeps nref above zero, so the release() below cannot delete the raw
+ p._raw->lock.Unlock();
+ }
+ release(); // drop the reference we held before (deletes the old raw if it was the last one)
+ _raw = p._raw; // change my ref
+ _off = p._off;
+ _len = p._len;
+ return *this;
+ }
+ ~ptr() {
+ release();
+ }
- public:
- // constructors
- buffer() : _dataptr(0), _myptr(true), _len(0), _alloc_len(0), _ref(0), free_func(0), free_func_arg(0) {
- bdbout(1) << "buffer.cons " << *this << endl;
- }
- buffer(unsigned a) : _dataptr(0), _myptr(true), _len(a), _alloc_len(a), _ref(0), free_func(0), free_func_arg(0) {
- bdbout(1) << "buffer.cons " << *this << endl;
- _dataptr = new char[a];
- bufferlock.Lock();
- buffer_total_alloc += _alloc_len;
- bufferlock.Unlock();
- bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl;
- }
- ~buffer() {
- bdbout(1) << "buffer.des " << *this << " " << (void*)free_func << endl;
- if (free_func) {
- bdbout(1) << "buffer.custom_free_func " << free_func_arg << " " << (void*)_dataptr << endl;
- free_func( free_func_arg, _dataptr, _alloc_len );
+ void release() { // drop this ptr's reference (if any) and leave it pointing at nothing
+ if (_raw) {
+ _raw->lock.Lock();
+ if (--_raw->nref == 0) { // we held the last reference
+ //std::cout << "hosing raw " << (void*)_raw << std::endl;
+ _raw->lock.Unlock(); // unlock before delete: the Mutex lives inside the raw and its destructor asserts that it is not held
+ delete _raw; // dealloc old (if any)
+ } else
+ _raw->lock.Unlock();
+ _raw = 0; // _off and _len are left unchanged
+ }
}
- else if (_dataptr && _myptr) {
- bdbout(1) << "buffer.free " << (void*)_dataptr << endl;
- delete[] _dataptr;
- buffer_total_alloc -= _alloc_len;
+
+ // misc
+ bool at_buffer_head() const { return _off == 0; }
+ bool at_buffer_tail() const { return _off + _len == _raw->len; }
+
+ // accessors
+ const char *c_str() const { assert(_raw); return _raw->data + _off; }
+ char *c_str() { assert(_raw); return _raw->data + _off; }
+ unsigned length() const { return _len; }
+ unsigned offset() const { return _off; }
+    // Number of bytes of the raw buffer that lie after the end of this ptr's
+    // range (raw length minus offset+length).  The ptr must reference a raw.
+    unsigned unused_tail_length() const {
+      assert(_raw);
+      return _raw->len - (_off + _len);
+    }
+
+ const char *raw_c_str() const { assert(_raw); return _raw->data; }
+ unsigned raw_length() const { assert(_raw); return _raw->len; }
+ int raw_nref() const { assert(_raw); return _raw->nref; }
+
+ void copy_out(unsigned o, unsigned l, char *dest) const {
+ assert(_raw);
+ assert(o >= 0 && o <= _len);
+ assert(l >= 0 && o+l <= _len);
+ memcpy(dest, c_str()+o, l);
}
- }
-
- buffer(const char *p, int l, int mode=BUFFER_MODE_DEFAULT, int alloc_len=0,
- buffer_free_func_t free_func=0, void* free_func_arg=0) :
- _dataptr(0),
- _myptr(false),
- _len(l),
- _ref(0),
- free_func(0), free_func_arg(0) {
+
+ unsigned wasted() {
+ assert(_raw);
+ return _raw->len - _len;
+ }
+
+ // modifiers
+ void set_offset(unsigned o) { _off = o; }
+ void set_length(unsigned l) { _len = l; }
+
+    // Copy l bytes from p to the position just past this ptr's current
+    // contents and grow the ptr's length by l.  The bytes must fit in the
+    // part of the raw buffer that follows this ptr's range.
+    void append(char *p, unsigned l) {
+      assert(_raw);
+      // room left after our range; computed here so the check does not rely
+      // on any other accessor.
+      assert(_off + _len <= _raw->len);
+      assert(l <= _raw->len - (_off + _len));
+      memcpy(c_str() + _len, p, l);
+      _len += l;
+    }
+
+ void copy_in(unsigned o, unsigned l, const char *src) {
+ assert(_raw);
+ assert(o >= 0 && o <= _len);
+ assert(l >= 0 && o+l <= _len);
+ memcpy(c_str()+o, src, l);
+ }
+
+ void zero() {
+ memset(c_str(), 0, _len);
+ }
+
+ void clean() {
+ //raw *newraw = _raw->makesib(_len);
+ }
+ };
+
+
+ /*
+ * list - the useful bit!
+ */
+
+ class list {
+ // my private bits
+ std::list<ptr> _buffers;
+ unsigned _len;
+
+ public:
+ // cons/des
+ list() : _len(0) {}
+ list(const list& other) : _buffers(other._buffers), _len(other._len) { }
+ list(unsigned l) : _len(0) {
+ ptr bp(l);
+ push_back(bp);
+ }
+ ~list() {}
- if (alloc_len)
- _alloc_len = alloc_len;
- else
- _alloc_len = l;
-
- _myptr = mode & BUFFER_MODE_FREE ? true:false;
- bdbout(1) << "buffer.cons " << *this << " mode = " << mode << ", myptr=" << _myptr << endl;
- if (mode & BUFFER_MODE_COPY) {
- _dataptr = new char[_alloc_len];
- bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl;
- bufferlock.Lock();
- buffer_total_alloc += _alloc_len;
- bufferlock.Unlock();
- memcpy(_dataptr, p, l);
- bdbout(1) << "buffer.copy " << *this << endl;
- } else {
- _dataptr = (char*)p; // ugly
- bdbout(1) << "buffer.claim " << *this << " myptr=" << _myptr << endl;
- }
-
- if (mode & BUFFER_MODE_CUSTOMFREE && free_func) {
- this->free_func = free_func;
- this->free_func_arg = free_func_arg;
+ list& operator= (const list& other) {
+ _buffers = other._buffers;
+ _len = other._len;
+ return *this;
}
- }
- // operators
- buffer& operator=(buffer& other) {
- assert(0); // not implemented, no reasonable assignment semantics.
- return *this;
- }
+ const std::list<ptr>& buffers() const { return _buffers; }
+
+ unsigned length() const {
+#if 0
+ // DEBUG: verify _len
+ unsigned len = 0;
+ for (std::list<ptr>::iterator it = _buffers.begin();
+ it != _buffers.end();
+ it++) {
+ len += (*it).length();
+ }
+ assert(len == _len);
+#endif
+ return _len;
+ }
- char *c_str() {
- return _dataptr;
- }
- bool has_free_func() { return free_func != 0; }
-
- // accessor
- unsigned alloc_length() {
- return _alloc_len;
- }
- void set_length(unsigned l) {
- assert(l <= _alloc_len);
- _len = l;
- }
- unsigned length() { return _len; }
- unsigned unused_tail_length() { return _alloc_len - _len; }
+ // modifiers
+ // Forget every ptr and reset the cached length to zero.
+ void clear() {
+   _len = 0;
+   _buffers.clear();
+ }
+ // Prepend bp and account for its length.
+ void push_front(ptr& bp) {
+   const unsigned added = bp.length();
+   _buffers.push_front(bp);
+   _len += added;
+ }
+ // Wrap r in a ptr, then prepend it exactly like push_front(ptr&) does.
+ void push_front(raw *r) {
+   ptr wrapped(r);
+   push_front(wrapped);
+ }
+ // Append bp and account for its length.
+ void push_back(ptr& bp) {
+   const unsigned added = bp.length();
+   _buffers.push_back(bp);
+   _len += added;
+ }
+ // Wrap r in a ptr, then append it exactly like push_back(ptr&) does.
+ void push_back(raw *r) {
+   ptr wrapped(r);
+   push_back(wrapped);
+ }
+ // Call zero() on every ptr in the list, front to back.
+ void zero() {
+   std::list<ptr>::iterator cur = _buffers.begin();
+   while (cur != _buffers.end()) {
+     cur->zero();
+     ++cur;
+   }
+ }
+
+ // sort-of-like-assignment-op
+ // Replace our contents with bl's; bl is left with a zero length.
+ void claim(list& bl) {
+   // free my buffers
+   _buffers.clear();
+   _len = 0;
+   // then take over everything bl holds
+   claim_append(bl);
+ }
+ // Move all of bl's ptrs onto our tail (no data is copied) and zero bl's length.
+ void claim_append(list& bl) {
+   const unsigned taken = bl._len;
+   _len += taken;
+   _buffers.splice(_buffers.end(), bl._buffers);  // steal the other guy's buffers
+   bl._len = 0;
+ }
+
+ // crope lookalikes
+ // Copy len bytes starting at logical offset off into dest.
+ // Asserts that the requested range lies within length().
+ void copy(unsigned off, unsigned len, char *dest) {
+   assert(off >= 0);
+   assert(off + len <= length());
+
+   // walk forward to the ptr that contains offset off
+   std::list<ptr>::iterator p = _buffers.begin();
+   for (; off > 0; ++p) {
+     assert(p != _buffers.end());
+     if (off < p->length())
+       break;                        // off now indexes into *p
+     off -= p->length();             // this whole ptr precedes the range
+   }
+
+   // hand out one piece per ptr until the request is satisfied
+   while (len > 0) {
+     if (off + len <= p->length()) {
+       p->copy_out(off, len, dest);  // the remainder fits in this ptr
+       return;
+     }
+
+     // take everything from off to the end of this ptr, then move on
+     const unsigned piece = p->length() - off;
+     p->copy_out(off, piece, dest);
+
+     dest += piece;
+     len -= piece;
+     off = 0;
+     ++p;
+     assert(p != _buffers.end());
+   }
+ }
+
+ // Overwrite len bytes of the list, starting at logical offset off, with
+ // the bytes at src.  Asserts that the range lies within length().
+ void copy_in(unsigned off, unsigned len, const char *src) {
+   assert(off >= 0);
+   assert(off + len <= length());
+
+   // walk forward to the ptr that contains offset off
+   std::list<ptr>::iterator p = _buffers.begin();
+   for (; off > 0; ++p) {
+     assert(p != _buffers.end());
+     if (off < p->length())
+       break;                       // off now indexes into *p
+     off -= p->length();            // this whole ptr precedes the range
+   }
+
+   // fill one ptr at a time until all of src has been stored
+   while (len > 0) {
+     if (off + len <= p->length()) {
+       p->copy_in(off, len, src);   // the remainder fits in this ptr
+       return;
+     }
+
+     // fill from off to the end of this ptr, then move on
+     const unsigned piece = p->length() - off;
+     p->copy_in(off, piece, src);
+
+     src += piece;
+     len -= piece;
+     off = 0;
+     ++p;
+     assert(p != _buffers.end());
+   }
+ }
+ // Overwrite up to len bytes at offset off with the contents of bl, one
+ // source ptr at a time.  Stops early if bl runs out of ptrs first.
+ void copy_in(unsigned off, unsigned len, const list& bl) {
+   unsigned remaining = len;
+   std::list<ptr>::const_iterator src = bl._buffers.begin();
+   while (src != bl._buffers.end()) {
+     // never take more from this ptr than is still wanted
+     const unsigned chunk = (remaining < src->length()) ? remaining : src->length();
+     copy_in(off, chunk, src->c_str());
+     remaining -= chunk;
+     if (remaining == 0)
+       break;
+     off += chunk;
+     ++src;
+   }
+ }
+
+
+ // Append a copy of the len bytes at data.
+ // Bytes first go into any unused space reported by the last ptr
+ // (unused_tail_length()); whatever is left over is stored in a newly
+ // created ptr that carries some extra slack for later appends.
+ void append(const char *data, unsigned len) {
+   if (len == 0) return;
+
+   unsigned alen = 0;
+
+   // copy into the tail buffer?
+   if (!_buffers.empty()) {
+     unsigned avail = _buffers.back().unused_tail_length();
+     if (avail > 0) {
+       if (avail > len)
+         avail = len;
+       unsigned blen = _buffers.back().length();
+       memcpy(_buffers.back().c_str() + blen, data, avail);
+       blen += avail;
+       _buffers.back().set_length(blen);
+       _len += avail;
+       data += avail;
+       len -= avail;
+     }
+     alen = _buffers.back().length();
+   }
+   if (len == 0) return;
+
+   // just add another buffer.
+   // alloc a bit extra, in case we do a bunch of appends.   FIXME be smarter!
+   // The allocation has to hold the len bytes stored now *plus* the slack;
+   // creating only alen bytes overran the buffer whenever len > alen.
+   if (alen < 1024) alen = 1024;
+   ptr bp = create(len + alen);
+   bp.set_length(len);
+   bp.copy_in(0, len, data);
+   push_back(bp);
+ }
+ // Append bp itself (no byte copy); same effect as push_back(bp).
+ void append(ptr& bp) {
+   _buffers.push_back(bp);
+   _len += bp.length();
+ }
+ // Append a ptr built from bp restricted to (off, len).
+ // Asserts that the requested range fits inside bp.
+ void append(ptr& bp, unsigned off, unsigned len) {
+   assert(len+off <= bp.length());
+   ptr piece(bp, off, len);
+   push_back(piece);
+ }
+ // Append everything in bl, leaving bl itself untouched: a temporary
+ // copy of bl is made and that copy's ptrs are moved onto our tail.
+ void append(const list& bl) {
+   list scratch(bl);
+   claim_append(scratch);
+ }
+
+
+ /*
+ * return a contiguous ptr to whole bufferlist contents.
+ */
+ // Returns 0 for a list with no ptrs.  With exactly one ptr its c_str()
+ // is returned directly; otherwise the contents are first copied into
+ // one newly created ptr, which then replaces all the existing ones.
+ char *c_str() {
+   if (_buffers.empty())
+     return 0;                            // no buffers
+
+   if (_buffers.size() == 1)
+     return _buffers.front().c_str();     // good, we're already contiguous.
+
+   ptr flat = create(length());           // make one new contiguous buffer.
+   copy(0, length(), flat.c_str());       // copy myself into it.
+   clear();
+   push_back(flat);
+   return flat.c_str();                   // now it'll work.
+ }
+
+ // Make this list describe bytes [off, off+len) of other.
+ // Our previous contents are dropped; each piece added is a ptr
+ // constructed as a sub-range of the corresponding ptr in other.
+ // Asserts that the range lies within other.length().
+ void substr_of(list& other, unsigned off, unsigned len) {
+   assert(off + len <= other.length());
+   clear();
+
+   // skip off
+   std::list<ptr>::iterator curbuf = other._buffers.begin();
+   while (off > 0) {
+     // curbuf walks other's ptrs, so it must be checked against other's
+     // end(); comparing with our own end() could never fire.
+     assert(curbuf != other._buffers.end());
+     if (off >= (*curbuf).length()) {
+       // skip this buffer
+       off -= (*curbuf).length();
+       curbuf++;
+     } else {
+       // somewhere in this buffer!
+       break;
+     }
+   }
+
+   while (len > 0) {
+     assert(curbuf != other._buffers.end());
+
+     // partial?
+     if (off + len < (*curbuf).length()) {
+       _buffers.push_back( ptr( *curbuf, off, len ) );
+       _len += len;
+       break;
+     }
+
+     // through end
+     unsigned howmuch = (*curbuf).length() - off;
+     _buffers.push_back( ptr( *curbuf, off, howmuch ) );
+     _len += howmuch;
+     len -= howmuch;
+     off = 0;
+     curbuf++;
+   }
+ }
+
+
+ // funky modifier
+ // Remove bytes [off, off+len) from this list.  If claim_by is non-null,
+ // ptrs covering the removed range are appended to *claim_by.
+ // Requires len > 0 and off+len <= length() (asserted).
+ void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */) {    // fixme?
+   assert(off < length());
+   assert(len > 0);
+   // without this bound the loop below would walk past the last ptr
+   assert(off + len <= length());
+
+   // skip off
+   std::list<ptr>::iterator curbuf = _buffers.begin();
+   while (off > 0) {
+     assert(curbuf != _buffers.end());
+     if (off >= (*curbuf).length()) {
+       // skip this buffer
+       off -= (*curbuf).length();
+       curbuf++;
+     } else {
+       // somewhere in this buffer!
+       break;
+     }
+   }
+
+   if (off) {
+     // add a reference to the front bit
+     //  insert it before curbuf (which we'll hose)
+     _buffers.insert( curbuf, ptr( *curbuf, 0, off ) );
+     // counted twice for now; the loop below subtracts the whole of
+     // curbuf (or off+len of it), which brings _len back in line.
+     _len += off;
+   }
+
+   while (len > 0) {
+     assert(curbuf != _buffers.end());
+
+     // partial?
+     if (off + len < (*curbuf).length()) {
+       // keep the end of curbuf, lose its first off+len bytes
+       if (claim_by)
+         claim_by->append( *curbuf, off, len );
+       (*curbuf).set_offset( off+len + (*curbuf).offset() );    // ignore beginning bit
+       (*curbuf).set_length( (*curbuf).length() - (len+off) );
+       _len -= off+len;
+       break;
+     }
+
+     // hose through the end
+     unsigned howmuch = (*curbuf).length() - off;
+     if (claim_by)
+       claim_by->append( *curbuf, off, howmuch );
+     _len -= (*curbuf).length();
+     _buffers.erase( curbuf++ );
+     len -= howmuch;
+     off = 0;
+   }
+
+   // splice in *replace (implement me later?)
+ }
+
+ };
- friend ostream& operator<<(ostream& out, buffer& b);
};
-inline ostream& operator<<(ostream& out, buffer& b) {
- return out << "buffer(this=" << &b << " len=" << b._len << ", alloc=" << b._alloc_len << ", data=" << (void*)b._dataptr << " ref=" << b._ref << ")";
+typedef buffer::ptr bufferptr;
+typedef buffer::list bufferlist;
+
+inline std::ostream& operator<<(std::ostream& out, const buffer::raw &r) {
+ return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref << ")";
}
+// Print a ptr as its offset~length, its data address, and the address,
+// length and reference count reported for the raw buffer behind it.
+inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) {
+  return out << "buffer::ptr(" << bp.offset() << "~" << bp.length()
+             << " " << (void*)bp.c_str()
+             << " in raw " << (void*)bp.raw_c_str()
+             << " len " << bp.raw_length()
+             << " nref " << bp.raw_nref() << ")";
+}
-/*
- * smart pointer class for buffer
- *
- * we reference count the actual buffer.
- * we also let you refer to a subset of a buffer.
- * we implement the high-level buffer accessor methods.
- *
- * some invariants:
- * _off < _buffer->_len
- * _off + _len <= _buffer->_len
- */
-class bufferptr {
- private:
- buffer *_buffer;
- unsigned _len, _off;
-
- public:
- // empty cons
- bufferptr() :
- _buffer(0),
- _len(0),
- _off(0) { }
- // main cons - the entire buffer
- bufferptr(buffer *b) :
- _buffer(b),
- _len(b->_len),
- _off(0) {
- assert(_buffer->_ref == 0);
- _buffer->_get(); // this is always the first one.
- }
- // subset cons - a subset of another bufferptr (subset)
- bufferptr(const bufferptr& bp, unsigned len, unsigned off) {
- bufferlock.Lock();
- _buffer = bp._buffer;
- _len = len;
- _off = bp._off + off;
- _buffer->_get();
- assert(_off < _buffer->_len); // sanity checks
- assert(_off + _len <= _buffer->_len);
- bufferlock.Unlock();
- }
+inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) {
+ out << "buffer::list(len=" << bl.length() << "," << std::endl;
- // copy cons
- bufferptr(const bufferptr &other) {
- bufferlock.Lock();
- _buffer = other._buffer;
- _len = other._len;
- _off = other._off;
- if (_buffer) _buffer->_get();
- bufferlock.Unlock();
+ std::list<buffer::ptr>::const_iterator it = bl.buffers().begin();
+ while (it != bl.buffers().end()) {
+ out << "\t" << *it;
+ if (++it == bl.buffers().end()) break;
+ out << "," << std::endl;
}
+ out << std::endl << ")";
+ return out;
+}
- // assignment operator
- bufferptr& operator=(const bufferptr& other) {
- //assert(0);
- // discard old
- discard_buffer();
- // point to other
- bufferlock.Lock();
- _buffer = other._buffer;
- _len = other._len;
- _off = other._off;
- if (_buffer) _buffer->_get();
- bufferlock.Unlock();
- return *this;
- }
- ~bufferptr() {
- discard_buffer();
- }
- void discard_buffer() {
- if (_buffer) {
- bufferlock.Lock();
- if (_buffer->_put() == 0)
- delete _buffer;
- _buffer = 0;
- bufferlock.Unlock();
- }
- }
+// encoder/decode helpers
+// string
+// Append s to bl including its terminating NUL byte.
+inline void _encode(const std::string& s, bufferlist& bl)
+{
+  const char *text = s.c_str();
+  bl.append(text, s.length()+1);
+}
+// Read a NUL-terminated string that starts at byte off of bl's contiguous
+// contents, and advance off past the string and its terminator.
+inline void _decode(std::string& s, bufferlist& bl, int& off)
+{
+  const char *start = bl.c_str() + off;
+  s = start;
+  off += s.length() + 1;
+}
- // dereference to get the actual buffer
- buffer& operator*() {
- return *_buffer;
- }
+// bufferptr (encapsulated)
+// Append bp's length as a raw size_t, followed by bp itself.
+inline void _encode(bufferptr& bp, bufferlist& bl)
+{
+  size_t nbytes = bp.length();
+  bl.append((char*)&nbytes, sizeof(nbytes));
+  bl.append(bp);
+}
+// Read a raw size_t length at off, then that many payload bytes, advancing
+// off past both.  bp becomes the single ptr covering the payload when there
+// is exactly one; otherwise it becomes a copy of the payload bytes.
+inline void _decode(bufferptr& bp, bufferlist& bl, int& off)
+{
+  size_t nbytes;
+  bl.copy(off, sizeof(nbytes), (char*)&nbytes);
+  off += sizeof(nbytes);
+
+  bufferlist payload;
+  payload.substr_of(bl, off, nbytes);
+  off += nbytes;
+
+  if (payload.buffers().size() != 1)
+    bp = buffer::copy(payload.c_str(), payload.length());
+  else
+    bp = payload.buffers().front();
+}
+// bufferlist (encapsulated)
+// Append s's length as a raw size_t, followed by the contents of s.
+inline void _encode(const bufferlist& s, bufferlist& bl)
+{
+  size_t nbytes = s.length();
+  bl.append((char*)&nbytes, sizeof(nbytes));
+  bl.append(s);
+}
+// Read a raw size_t length at off, then make s the substr_of bl that
+// covers that many following bytes; off is advanced past both.
+inline void _decode(bufferlist& s, bufferlist& bl, int& off)
+{
+  size_t nbytes;
+  bl.copy(off, sizeof(nbytes), (char*)&nbytes);
+  off += sizeof(nbytes);
+
+  s.substr_of(bl, off, nbytes);
+  off += nbytes;
+}
- bool at_buffer_head() const {
- return _off == 0;
+#include <set>
+#include <map>
+#include <vector>
+#include <string>
+
+// set<T>
+template<class T>
+inline void _encode(std::set<T>& s, bufferlist& bl)
+{
+ int n = s.size();
+ bl.append((char*)&n, sizeof(n));
+ for (typename std::set<T>::iterator it = s.begin();
+ it != s.end();
+ it++) {
+ T v = *it;
+ bl.append((char*)&v, sizeof(v));
+ n--;
}
- bool at_buffer_tail() const {
- return _off + _len == _buffer->_len;
+ assert(n==0);
+}
+template<class T>
+inline void _decode(std::set<T>& s, bufferlist& bl, int& off)
+{
+ s.clear();
+ int n;
+ bl.copy(off, sizeof(n), (char*)&n);
+ off += sizeof(n);
+ for (int i=0; i<n; i++) {
+ T v;
+ bl.copy(off, sizeof(v), (char*)&v);
+ off += sizeof(v);
+ s.insert(v);
}
+ assert(s.size() == (unsigned)n);
+}
- // accessors for my subset
- char *c_str() {
- return _buffer->c_str() + _off;
+// vector<T>
+template<class T>
+inline void _encode(std::vector<T>& s, bufferlist& bl)
+{
+ int n = s.size();
+ bl.append((char*)&n, sizeof(n));
+ for (typename std::vector<T>::iterator it = s.begin();
+ it != s.end();
+ it++) {
+ T v = *it;
+ bl.append((char*)&v, sizeof(v));
+ n--;
}
- unsigned length() const {
- return _len;
+ assert(n==0);
+}
+template<class T>
+inline void _decode(std::vector<T>& s, bufferlist& bl, int& off)
+{
+ s.clear();
+ int n;
+ bl.copy(off, sizeof(n), (char*)&n);
+ off += sizeof(n);
+ s = std::vector<T>(n);
+ for (int i=0; i<n; i++) {
+ T v;
+ bl.copy(off, sizeof(v), (char*)&v);
+ off += sizeof(v);
+ s[i] = v;
}
- unsigned offset() const {
- return _off;
+ assert(s.size() == (unsigned)n);
+}
+
+// list<T>
+template<class T>
+inline void _encode(const std::list<T>& s, bufferlist& bl)
+{
+ int n = s.size();
+ bl.append((char*)&n, sizeof(n));
+ for (typename std::list<T>::const_iterator it = s.begin();
+ it != s.end();
+ it++) {
+ T v = *it;
+ bl.append((char*)&v, sizeof(v));
+ n--;
}
- unsigned unused_tail_length() {
- if (!at_buffer_tail()) return 0;
- return _buffer->unused_tail_length();
+ assert(n==0);
+}
+template<class T>
+inline void _decode(std::list<T>& s, bufferlist& bl, int& off)
+{
+ s.clear();
+ int n;
+ bl.copy(off, sizeof(n), (char*)&n);
+ off += sizeof(n);
+ for (int i=0; i<n; i++) {
+ T v;
+ bl.copy(off, sizeof(v), (char*)&v);
+ off += sizeof(v);
+ s.push_back(v);
}
+ assert(s.size() == (unsigned)n);
+}
-
- // modifiers
- void set_offset(unsigned off) {
- assert(off <= _buffer->_alloc_len);
- _off = off;
- }
- void set_length(unsigned len) {
- assert(len >= 0 && _off + len <= _buffer->_alloc_len);
- if (_buffer->_len < _off + len)
- _buffer->_len = _off + len; // set new buffer len (_IF_ i'm expanding it)
- _len = len; // my len too
+// map<string,bufferptr>
+inline void _encode(std::map<std::string, bufferptr>& s, bufferlist& bl)
+{
+ int n = s.size();
+ bl.append((char*)&n, sizeof(n));
+ for (std::map<std::string, bufferptr>::iterator it = s.begin();
+ it != s.end();
+ it++) {
+ _encode(it->first, bl);
+ _encode(it->second, bl);
+ n--;
}
- void zero() {
- //bzero((void*)c_str(), _len);
- memset((void*)c_str(), 0, _len);
+ assert(n==0);
+}
+inline void _decode(std::map<std::string,bufferptr>& s, bufferlist& bl, int& off)
+{
+ s.clear();
+ int n;
+ bl.copy(off, sizeof(n), (char*)&n);
+ off += sizeof(n);
+ for (int i=0; i<n; i++) {
+ std::string k;
+ _decode(k, bl, off);
+ _decode(s[k], bl, off);
}
+ assert(s.size() == (unsigned)n);
+}
- // crope lookalikes
- void append(const char *p, unsigned len) {
- assert(len + _len + _off <= _buffer->_alloc_len); // FIXME later for auto-expansion?
-
- // copy
- memcpy(c_str() + _len, p, len);
- _buffer->_len += len;
- _len += len;
+// map<T,bufferlist>
+template<class T>
+inline void _encode(const std::map<T, bufferlist>& s, bufferlist& bl)
+{
+ int n = s.size();
+ bl.append((char*)&n, sizeof(n));
+ for (typename std::map<T, bufferlist>::const_iterator it = s.begin();
+ it != s.end();
+ it++) {
+ T k = it->first;
+ bl.append((char*)&k, sizeof(k));
+ _encode(it->second, bl);
+ n--;
}
- void copy_out(unsigned off, unsigned len, char *dest) {
- assert(off >= 0 && off <= _len);
- assert(len >= 0 && off + len <= _len);
- memcpy(dest, c_str() + off, len);
- }
- void copy_in(unsigned off, unsigned len, const char *src) {
- assert(off >= 0 && off <= _len);
- assert(len >= 0 && off + len <= _len);
- memcpy(c_str() + off, src, len);
+ assert(n==0);
+}
+template<class T>
+inline void _decode(std::map<T,bufferlist>& s, bufferlist& bl, int& off)
+{
+ s.clear();
+ int n;
+ bl.copy(off, sizeof(n), (char*)&n);
+ off += sizeof(n);
+ for (int i=0; i<n; i++) {
+ T k;
+ bl.copy(off, sizeof(k), (char*)&k);
+ off += sizeof(k);
+ bufferlist b;
+ _decode(b, bl, off);
+ s[k] = b;
}
+ assert(s.size() == (unsigned)n);
+}
- friend ostream& operator<<(ostream& out, bufferptr& bp);
-};
-
-
-inline ostream& operator<<(ostream& out, bufferptr& bp) {
- return out << "bufferptr(len=" << bp._len << " off=" << bp._off
- << " cstr=" << (void*)bp.c_str()
- << " buf=" << *bp._buffer
- << ")";
+// map<T,U>
+template<class T, class U>
+inline void _encode(const std::map<T, U>& s, bufferlist& bl)
+{
+ int n = s.size();
+ bl.append((char*)&n, sizeof(n));
+ for (typename std::map<T, U>::const_iterator it = s.begin();
+ it != s.end();
+ it++) {
+ T k = it->first;
+ U v = it->second;
+ bl.append((char*)&k, sizeof(k));
+ bl.append((char*)&v, sizeof(v));
+ n--;
+ }
+ assert(n==0);
}
+// Rebuild s from bl starting at off: an int entry count, then for each
+// entry the raw bytes of a T key followed by the raw bytes of a U value.
+// off is advanced past everything read.  Asserts that s ends up with
+// exactly the encoded number of entries.
+template<class T, class U>
+inline void _decode(std::map<T,U>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  for (int left = count; left > 0; --left) {
+    T key;
+    U val;
+
+    bl.copy(off, sizeof(key), (char*)&key);
+    off += sizeof(key);
+
+    bl.copy(off, sizeof(val), (char*)&val);
+    off += sizeof(val);
+
+    s[key] = val;
+  }
+
+  assert(s.size() == (unsigned)count);
+}
+
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __BUFFERLIST_H
-#define __BUFFERLIST_H
-
-#include "buffer.h"
-
-#include <list>
-#include <map>
-#include <set>
-#include <vector>
-using namespace std;
-
-#include <ext/rope>
-using namespace __gnu_cxx;
-
-
-// debug crap
-#include "config.h"
-#define bdbout(x) if (x <= g_conf.debug_buffer) cout
-
-
-
-class bufferlist {
- private:
- /* local state limited to _buffers, and _len.
- * we maintain _len ourselves, so we must be careful when fiddling with buffers!
- */
- list<bufferptr> _buffers;
- unsigned _len;
-
- public:
- // cons/des
- bufferlist() : _len(0) {
- bdbout(1) << "bufferlist.cons " << this << endl;
- }
- bufferlist(const bufferlist& bl) : _len(0) {
- //assert(0); // o(n) and stupid
- bdbout(1) << "bufferlist.cons " << this << endl;
- _buffers = bl._buffers;
- _len = bl._len;
- }
- ~bufferlist() {
- bdbout(1) << "bufferlist.des " << this << endl;
- }
-
- bufferlist& operator=(bufferlist& bl) {
- //assert(0); // actually, this should be fine, just slow (O(n)) and stupid.
- bdbout(1) << "bufferlist.= " << this << endl;
- _buffers = bl._buffers;
- _len = bl._len;
- return *this;
- }
-
-
- // accessors
- list<bufferptr>& buffers() {
- return _buffers;
- }
- //list<buffer*>::iterator begin() { return _buffers.begin(); }
- //list<buffer*>::iterator end() { return _buffers.end(); }
-
- unsigned length() const {
-#if 0
- { // DEBUG: verify _len
- int len = 0;
- for (list<bufferptr>::iterator it = _buffers.begin();
- it != _buffers.end();
- it++) {
- len += (*it).length();
- }
- assert(len == _len);
- }
-#endif
- return _len;
- }
-
- void _rope(crope& r) {
- for (list<bufferptr>::iterator it = _buffers.begin();
- it != _buffers.end();
- it++)
- r.append((*it).c_str(), (*it).length());
- }
-
- // modifiers
- void clear() {
- _buffers.clear();
- _len = 0;
- }
- void push_front(bufferptr& bp) {
- _buffers.push_front(bp);
- _len += bp.length();
- }
- void push_front(buffer *b) {
- bufferptr bp(b);
- _buffers.push_front(bp);
- _len += bp.length();
- }
- void push_back(bufferptr& bp) {
- _buffers.push_back(bp);
- _len += bp.length();
- }
- void push_back(buffer *b) {
- bufferptr bp(b);
-
- _buffers.push_back(bp);
- _len += bp.length();
-
- }
- void zero() {
- for (list<bufferptr>::iterator it = _buffers.begin();
- it != _buffers.end();
- it++)
- it->zero();
- }
-
- // sort-of-like-assignment-op
- void claim(bufferlist& bl) {
- // free my buffers
- clear();
- claim_append(bl);
- }
- void claim_append(bufferlist& bl) {
- // steal the other guy's buffers
- _len += bl._len;
- _buffers.splice( _buffers.end(), bl._buffers );
- bl._len = 0;
- }
-
-
-
-
- // crope lookalikes
- void copy(unsigned off, unsigned len, char *dest) {
- assert(off >= 0);
- assert(off + len <= length());
- /*assert(off < length());
- if (off + len > length())
- len = length() - off;
- */
- // advance to off
- list<bufferptr>::iterator curbuf = _buffers.begin();
-
- // skip off
- while (off > 0) {
- assert(curbuf != _buffers.end());
- if (off >= (*curbuf).length()) {
- // skip this buffer
- off -= (*curbuf).length();
- curbuf++;
- } else {
- // somewhere in this buffer!
- break;
- }
- }
-
- // copy
- while (len > 0) {
- // is the rest ALL in this buffer?
- if (off + len <= (*curbuf).length()) {
- (*curbuf).copy_out(off, len, dest); // yup, last bit!
- break;
- }
-
- // get as much as we can from this buffer.
- unsigned howmuch = (*curbuf).length() - off;
- (*curbuf).copy_out(off, howmuch, dest);
-
- dest += howmuch;
- len -= howmuch;
- off = 0;
- curbuf++;
- assert(curbuf != _buffers.end());
- }
- }
-
- void copy_in(unsigned off, unsigned len, const char *src) {
- assert(off >= 0);
- assert(off + len <= length());
-
- // advance to off
- list<bufferptr>::iterator curbuf = _buffers.begin();
-
- // skip off
- while (off > 0) {
- assert(curbuf != _buffers.end());
- if (off >= (*curbuf).length()) {
- // skip this buffer
- off -= (*curbuf).length();
- curbuf++;
- } else {
- // somewhere in this buffer!
- break;
- }
- }
-
- // copy
- while (len > 0) {
- // is the rest ALL in this buffer?
- if (off + len <= (*curbuf).length()) {
- (*curbuf).copy_in(off, len, src); // yup, last bit!
- break;
- }
-
- // get as much as we can from this buffer.
- unsigned howmuch = (*curbuf).length() - off;
- (*curbuf).copy_in(off, howmuch, src);
-
- src += howmuch;
- len -= howmuch;
- off = 0;
- curbuf++;
- assert(curbuf != _buffers.end());
- }
- }
- void copy_in(unsigned off, unsigned len, bufferlist& bl) {
- unsigned left = len;
- for (list<bufferptr>::iterator i = bl._buffers.begin();
- i != bl._buffers.end();
- i++) {
- unsigned l = (*i).length();
- if (left < l) l = left;
- copy_in(off, l, (*i).c_str());
- left -= l;
- if (left == 0) break;
- off += l;
- }
- }
-
-
- void append(const char *data, unsigned len) {
- if (len == 0) return;
-
- unsigned alen = 0;
-
- // copy into the tail buffer?
- if (!_buffers.empty()) {
- unsigned avail = _buffers.back().unused_tail_length();
- if (avail > 0) {
- //cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf" << endl;
- if (avail > len)
- avail = len;
- unsigned blen = _buffers.back().length();
- memcpy(_buffers.back().c_str() + blen, data, avail);
- blen += avail;
- _buffers.back().set_length(blen);
- _len += avail;
- data += avail;
- len -= avail;
- }
- alen = _buffers.back().length();
- }
- if (len == 0) return;
-
- // just add another buffer.
- // alloc a bit extra, in case we do a bunch of appends. FIXME be smarter!
- if (alen < 1024) alen = 1024;
- push_back(new buffer(data, len, BUFFER_MODE_DEFAULT, len+alen));
- }
- void append(bufferptr& bp) {
- push_back(bp);
- }
- void append(bufferptr& bp, unsigned len, unsigned off) {
- bufferptr tempbp(bp, len, off);
- push_back(tempbp);
- }
- void append(const bufferlist& bl) {
- bufferlist temp = bl; // copy list
- claim_append(temp); // and append
- }
-
-
- /*
- * return a contiguous ptr to whole bufferlist contents.
- */
- char *c_str() {
- if (_buffers.size() == 1) {
- return _buffers.front().c_str(); // good, we're already contiguous.
- }
- else if (_buffers.size() == 0) {
- return 0; // no buffers
- }
- else {
- // make one new contiguous buffer.
- bufferptr newbuf = new buffer(length());
- unsigned off = 0;
-
- for (list<bufferptr>::iterator it = _buffers.begin();
- it != _buffers.end();
- it++) {
- //assert((*(*it)).has_free_func() == false); // not allowed if there's a funky free_func.. -sage ...for debugging at least!
- memcpy(newbuf.c_str() + off,
- (*it).c_str(), (*it).length());
- off += (*it).length();
- }
- assert(off == newbuf.length());
-
- _buffers.clear();
- _buffers.push_back( newbuf );
-
- // now it'll work.
- return c_str();
- }
- }
-
-
- void substr_of(bufferlist& other, unsigned off, unsigned len) {
- assert(off + len <= other.length());
- clear();
-
- // skip off
- list<bufferptr>::iterator curbuf = other._buffers.begin();
- while (off > 0) {
- assert(curbuf != _buffers.end());
- if (off >= (*curbuf).length()) {
- // skip this buffer
- //cout << "skipping over " << *curbuf << endl;
- off -= (*curbuf).length();
- curbuf++;
- } else {
- // somewhere in this buffer!
- //cout << "somewhere in " << *curbuf << endl;
- break;
- }
- }
-
- while (len > 0) {
- // partial?
- if (off + len < (*curbuf).length()) {
- //cout << "copying partial of " << *curbuf << endl;
- _buffers.push_back( bufferptr( *curbuf, len, off ) );
- _len += len;
- break;
- }
-
- // through end
- //cout << "copying end (all?) of " << *curbuf << endl;
- unsigned howmuch = (*curbuf).length() - off;
- _buffers.push_back( bufferptr( *curbuf, howmuch, off ) );
- _len += howmuch;
- len -= howmuch;
- off = 0;
- curbuf++;
- }
- }
-
- // funky modifer
- void splice(unsigned off, unsigned len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) { // fixme?
- assert(off < length());
- assert(len > 0);
- //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << endl;
-
- // skip off
- list<bufferptr>::iterator curbuf = _buffers.begin();
- while (off > 0) {
- assert(curbuf != _buffers.end());
- if (off >= (*curbuf).length()) {
- // skip this buffer
- //cout << "off = " << off << " skipping over " << *curbuf << endl;
- off -= (*curbuf).length();
- curbuf++;
- } else {
- // somewhere in this buffer!
- //cout << "off = " << off << " somewhere in " << *curbuf << endl;
- break;
- }
- }
- assert(off >= 0);
-
- if (off) {
- // add a reference to the front bit
- // insert it before curbuf (which we'll hose)
- //cout << "keeping front " << off << " of " << *curbuf << endl;
- _buffers.insert( curbuf, bufferptr( *curbuf, off, 0 ) );
- _len += off;
- }
-
- while (len > 0) {
- // partial?
- if (off + len < (*curbuf).length()) {
- //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl;
- if (claim_by)
- claim_by->append( *curbuf, len, off );
- (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big
- (*curbuf).set_length( (*curbuf).length() - (len+off) );
- _len -= off+len;
- //cout << " now " << *curbuf << endl;
- break;
- }
-
- // hose though the end
- unsigned howmuch = (*curbuf).length() - off;
- //cout << "discarding " << howmuch << " of " << *curbuf << endl;
- if (claim_by)
- claim_by->append( *curbuf, howmuch, off );
- _len -= (*curbuf).length();
- _buffers.erase( curbuf++ );
- len -= howmuch;
- off = 0;
- }
-
- // splice in *replace (implement me later?)
- }
-
- friend ostream& operator<<(ostream& out, bufferlist& bl);
-
-};
-
-inline ostream& operator<<(ostream& out, bufferlist& bl) {
- out << "bufferlist(len=" << bl.length() << endl;
- for (list<bufferptr>::iterator it = bl._buffers.begin();
- it != bl._buffers.end();
- it++)
- out << "\t" << *it << endl;
- out << ")" << endl;
- return out;
-}
-
-
-
-// encoder/decode helpers
-
-// string
-inline void _encode(const string& s, bufferlist& bl)
-{
- bl.append(s.c_str(), s.length()+1);
-}
-inline void _decode(string& s, bufferlist& bl, int& off)
-{
- s = bl.c_str() + off;
- off += s.length() + 1;
-}
-
-// bufferptr (encapsulated)
-inline void _encode(bufferptr& bp, bufferlist& bl)
-{
- size_t len = bp.length();
- bl.append((char*)&len, sizeof(len));
- bl.append(bp);
-}
-inline void _decode(bufferptr& bp, bufferlist& bl, int& off)
-{
- size_t len;
- bl.copy(off, sizeof(len), (char*)&len);
- off += sizeof(len);
- bufferlist s;
- s.substr_of(bl, off, len);
- off += len;
-
- if (s.buffers().size() == 1)
- bp = s.buffers().front();
- else
- bp = new buffer(s.c_str(), s.length());
-}
-
-// bufferlist (encapsulated)
-inline void _encode(const bufferlist& s, bufferlist& bl)
-{
- size_t len = s.length();
- bl.append((char*)&len, sizeof(len));
- bl.append(s);
-}
-inline void _decode(bufferlist& s, bufferlist& bl, int& off)
-{
- size_t len;
- bl.copy(off, sizeof(len), (char*)&len);
- off += sizeof(len);
- s.substr_of(bl, off, len);
- off += len;
-}
-
-
-// set<T>
-template<class T>
-inline void _encode(set<T>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (typename set<T>::iterator it = s.begin();
- it != s.end();
- it++) {
- T v = *it;
- bl.append((char*)&v, sizeof(v));
- n--;
- }
- assert(n==0);
-}
-template<class T>
-inline void _decode(set<T>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- for (int i=0; i<n; i++) {
- T v;
- bl.copy(off, sizeof(v), (char*)&v);
- off += sizeof(v);
- s.insert(v);
- }
- assert(s.size() == (unsigned)n);
-}
-
-// vector<T>
-template<class T>
-inline void _encode(vector<T>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (typename vector<T>::iterator it = s.begin();
- it != s.end();
- it++) {
- T v = *it;
- bl.append((char*)&v, sizeof(v));
- n--;
- }
- assert(n==0);
-}
-template<class T>
-inline void _decode(vector<T>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- s = vector<T>(n);
- for (int i=0; i<n; i++) {
- T v;
- bl.copy(off, sizeof(v), (char*)&v);
- off += sizeof(v);
- s[i] = v;
- }
- assert(s.size() == (unsigned)n);
-}
-
-// list<T>
-template<class T>
-inline void _encode(const list<T>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (typename list<T>::const_iterator it = s.begin();
- it != s.end();
- it++) {
- T v = *it;
- bl.append((char*)&v, sizeof(v));
- n--;
- }
- assert(n==0);
-}
-template<class T>
-inline void _decode(list<T>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- for (int i=0; i<n; i++) {
- T v;
- bl.copy(off, sizeof(v), (char*)&v);
- off += sizeof(v);
- s.push_back(v);
- }
- assert(s.size() == (unsigned)n);
-}
-
-// map<string,bufferptr>
-inline void _encode(map<string, bufferptr>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (map<string, bufferptr>::iterator it = s.begin();
- it != s.end();
- it++) {
- _encode(it->first, bl);
- _encode(it->second, bl);
- n--;
- }
- assert(n==0);
-}
-inline void _decode(map<string,bufferptr>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- for (int i=0; i<n; i++) {
- string k;
- _decode(k, bl, off);
- _decode(s[k], bl, off);
- }
- assert(s.size() == (unsigned)n);
-}
-
-
-// map<T,bufferlist>
-template<class T>
-inline void _encode(const map<T, bufferlist>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (typename map<T, bufferlist>::const_iterator it = s.begin();
- it != s.end();
- it++) {
- T k = it->first;
- bl.append((char*)&k, sizeof(k));
- _encode(it->second, bl);
- n--;
- }
- assert(n==0);
-}
-template<class T>
-inline void _decode(map<T,bufferlist>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- for (int i=0; i<n; i++) {
- T k;
- bl.copy(off, sizeof(k), (char*)&k);
- off += sizeof(k);
- bufferlist b;
- _decode(b, bl, off);
- s[k] = b;
- }
- assert(s.size() == (unsigned)n);
-}
-
-// map<T,U>
-template<class T, class U>
-inline void _encode(const map<T, U>& s, bufferlist& bl)
-{
- int n = s.size();
- bl.append((char*)&n, sizeof(n));
- for (typename map<T, U>::const_iterator it = s.begin();
- it != s.end();
- it++) {
- T k = it->first;
- U v = it->second;
- bl.append((char*)&k, sizeof(k));
- bl.append((char*)&v, sizeof(v));
- n--;
- }
- assert(n==0);
-}
-template<class T, class U>
-inline void _decode(map<T,U>& s, bufferlist& bl, int& off)
-{
- s.clear();
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += sizeof(n);
- for (int i=0; i<n; i++) {
- T k;
- U v;
- bl.copy(off, sizeof(k), (char*)&k);
- off += sizeof(k);
- bl.copy(off, sizeof(v), (char*)&v);
- off += sizeof(v);
- s[k] = v;
- }
- assert(s.size() == (unsigned)n);
-}
-
-
-
-
-#endif
#include <ext/rope>
using namespace __gnu_cxx;
-#include "bufferlist.h"
+#include "buffer.h"
class filepath {
if (p->first < start) {
if (p->first + p->second != start) {
- cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl;
+ //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl;
assert(0);
}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __BUFFER_H
+#define __BUFFER_H
+
+#include <cassert>
+#include <string.h>
+
+#include <iostream>
+using namespace std;
+
+// bit masks
+#define BUFFER_MODE_NOCOPY 0
+#define BUFFER_MODE_COPY 1 // copy on create, my buffer
+
+#define BUFFER_MODE_NOFREE 0
+#define BUFFER_MODE_FREE 2
+
+#define BUFFER_MODE_CUSTOMFREE 4
+
+#define BUFFER_MODE_DEFAULT 3//(BUFFER_MODE_COPY|BUFFER_MODE_FREE)
+
+
+// debug crap
+#include "config.h"
+#define bdbout(x) if (x <= g_conf.debug_buffer) cout
+
+#include "common/Mutex.h"
+
+// HACK: in config.cc
+/*
+ * WARNING: bufferlock placements are tricky for efficiency. note that only bufferptr and
+ * buffer ever use buffer._ref, and only bufferptr should call ~buffer().
+ *
+ * So, I only need to protect:
+ * - buffer()'s modification of buffer_total_alloc
+ * - ~bufferptr() check of buffer._ref, and ~buffer's mod of buffer_total_alloc
+ *
+ * I don't protect
+ * - buffer._get() .. increment is atomic on any sane architecture
+ * - buffer._put() .. only called by ~bufferptr.
+ * - ~buffer .. only called by ~bufferptr *** I HOPE!!
+ */
+extern Mutex bufferlock;
+extern long buffer_total_alloc;
+
+
+typedef void (buffer_free_func_t)(void*,char*,unsigned);
+
+
+/*
+ * buffer - the underlying buffer container.  with a reference count.
+ *
+ * holds one contiguous char array, either allocated here or handed in by
+ * the caller.  the reference count is private; bufferptr (a friend) is the
+ * only class that touches it.
+ *
+ * the buffer never shrinks.
+ *
+ * some invariants:
+ *  _len never shrinks
+ *  _len <= _alloc_len
+ */
+class buffer {
+ protected:
+  //wtf
+  //static Mutex bufferlock;
+  //static long buffer_total_alloc;// = 0;
+
+ private:
+  // raw buffer alloc
+  char *_dataptr;        // start of the data
+  bool _myptr;           // true if ~buffer should delete[] _dataptr
+  unsigned _len;         // bytes in use
+  unsigned _alloc_len;   // bytes available at _dataptr
+
+  // ref counts
+  unsigned _ref;
+  // take a reference; returns the new count.  takes no lock itself.
+  int _get() {
+    bdbout(1) << "buffer.get " << *this << " get " << _ref+1 << endl;
+    return ++_ref;
+  }
+  // drop a reference; returns the new count.  never deletes anything itself.
+  int _put() {
+    bdbout(1) << "buffer.put " << *this << " put " << _ref-1 << endl;
+    assert(_ref > 0);
+    return --_ref;
+  }
+
+  // custom (de!)allocator
+  buffer_free_func_t *free_func;
+  void *free_func_arg;
+
+  friend class bufferptr;
+
+ public:
+  // constructors
+
+  // empty buffer: no data, zero length.
+  buffer() : _dataptr(0), _myptr(true), _len(0), _alloc_len(0), _ref(0), free_func(0), free_func_arg(0) {
+    bdbout(1) << "buffer.cons " << *this << endl;
+  }
+  // allocate a bytes (uninitialized); length and capacity are both a.
+  buffer(unsigned a) : _dataptr(0), _myptr(true), _len(a), _alloc_len(a), _ref(0), free_func(0), free_func_arg(0) {
+    bdbout(1) << "buffer.cons " << *this << endl;
+    _dataptr = new char[a];
+    bufferlock.Lock();
+    buffer_total_alloc += _alloc_len;
+    bufferlock.Unlock();
+    bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl;
+  }
+  // release the data: via free_func if one was registered, else delete[]
+  // if the array is ours.  takes no lock itself.
+  // NOTE(review): buffer_total_alloc is decremented here whenever _myptr is
+  // set, but the pointer-claiming constructor path never increments it --
+  // confirm whether that accounting is intended.
+  ~buffer() {
+    bdbout(1) << "buffer.des " << *this << " " << (void*)free_func << endl;
+    if (free_func) {
+      bdbout(1) << "buffer.custom_free_func " << free_func_arg << " " << (void*)_dataptr << endl;
+      free_func( free_func_arg, _dataptr, _alloc_len );
+    }
+    else if (_dataptr && _myptr) {
+      bdbout(1) << "buffer.free " << (void*)_dataptr << endl;
+      delete[] _dataptr;
+      buffer_total_alloc -= _alloc_len;
+    }
+  }
+
+  // build a buffer from l bytes at p.
+  //   mode & BUFFER_MODE_COPY        copy p into a fresh array; otherwise point at p directly
+  //   mode & BUFFER_MODE_FREE        delete[] the data in ~buffer
+  //   mode & BUFFER_MODE_CUSTOMFREE  with a non-null free_func: call
+  //                                  free_func(free_func_arg, data, alloc_len) in ~buffer instead
+  //   alloc_len                      capacity of the array; 0 means "same as l".
+  //                                  must not be smaller than l.
+  buffer(const char *p, int l, int mode=BUFFER_MODE_DEFAULT, int alloc_len=0,
+         buffer_free_func_t free_func=0, void* free_func_arg=0) :
+    _dataptr(0),
+    _myptr(false),
+    _len(l),
+    _alloc_len(alloc_len ? alloc_len : l),
+    _ref(0),
+    free_func(0), free_func_arg(0) {
+
+    // a capacity smaller than the payload would let the memcpy below run
+    // past the new array, and breaks the _len <= _alloc_len invariant.
+    assert(_len <= _alloc_len);
+
+    _myptr = mode & BUFFER_MODE_FREE ? true:false;
+    bdbout(1) << "buffer.cons " << *this << " mode = " << mode << ", myptr=" << _myptr << endl;
+    if (mode & BUFFER_MODE_COPY) {
+      _dataptr = new char[_alloc_len];
+      bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl;
+      bufferlock.Lock();
+      buffer_total_alloc += _alloc_len;
+      bufferlock.Unlock();
+      memcpy(_dataptr, p, l);
+      bdbout(1) << "buffer.copy " << *this << endl;
+    } else {
+      _dataptr = (char*)p;                              // ugly
+      bdbout(1) << "buffer.claim " << *this << " myptr=" << _myptr << endl;
+    }
+
+    if (mode & BUFFER_MODE_CUSTOMFREE && free_func) {
+      this->free_func = free_func;
+      this->free_func_arg = free_func_arg;
+    }
+  }
+
+  // operators
+  buffer& operator=(buffer& other) {
+    assert(0);  // not implemented, no reasonable assignment semantics.
+    return *this;
+  }
+
+  // raw pointer to the start of the data.
+  char *c_str() {
+    return _dataptr;
+  }
+
+  bool has_free_func() { return free_func != 0; }
+
+  // accessor
+  unsigned alloc_length() {
+    return _alloc_len;
+  }
+  // set the in-use length; must fit within the allocation.
+  void set_length(unsigned l) {
+    assert(l <= _alloc_len);
+    _len = l;
+  }
+  unsigned length() { return _len; }
+  // bytes allocated but not yet in use.
+  unsigned unused_tail_length() { return _alloc_len - _len; }
+
+  friend ostream& operator<<(ostream& out, buffer& b);
+};
+
+// one-line debug dump of a buffer: address, lengths, data pointer, ref count.
+inline ostream& operator<<(ostream& out, buffer& b) {
+  out << "buffer(this=" << &b
+      << " len=" << b._len
+      << ", alloc=" << b._alloc_len
+      << ", data=" << (void*)b._dataptr
+      << " ref=" << b._ref
+      << ")";
+  return out;
+}
+
+
+/*
+ * smart pointer class for buffer
+ *
+ * we reference count the actual buffer.
+ * we also let you refer to a subset of a buffer.
+ * we implement the high-level buffer accessor methods.
+ *
+ * some invariants:
+ *  _off < _buffer->_len
+ *  _off + _len <= _buffer->_len
+ */
+class bufferptr {
+ private:
+  buffer *_buffer;       // shared, ref-counted target (0 when empty)
+  unsigned _len, _off;   // my window: _len bytes, starting _off bytes into *_buffer
+
+ public:
+  // empty cons
+  bufferptr() :
+    _buffer(0),
+    _len(0),
+    _off(0) { }
+  // main cons - the entire buffer
+  // b must be non-null and not yet referenced by any other bufferptr.
+  bufferptr(buffer *b) :
+    _buffer(b),
+    _len(b->_len),
+    _off(0) {
+    assert(_buffer->_ref == 0);
+    _buffer->_get();  // this is always the first one.
+  }
+  // subset cons - a subset of another bufferptr (subset)
+  // off is relative to bp's own window; bp must not be empty.
+  bufferptr(const bufferptr& bp, unsigned len, unsigned off) {
+    bufferlock.Lock();
+    _buffer = bp._buffer;
+    _len = len;
+    _off = bp._off + off;
+    _buffer->_get();
+    assert(_off < _buffer->_len);  // sanity checks
+    assert(_off + _len <= _buffer->_len);
+    bufferlock.Unlock();
+  }
+
+  // copy cons
+  bufferptr(const bufferptr &other) {
+    bufferlock.Lock();
+    _buffer = other._buffer;
+    _len = other._len;
+    _off = other._off;
+    if (_buffer) _buffer->_get();
+    bufferlock.Unlock();
+  }
+
+  // assignment operator
+  bufferptr& operator=(const bufferptr& other) {
+    // self-assignment must be a no-op: discarding first would drop what may
+    // be the last reference, delete the buffer, and leave us empty.
+    if (this == &other)
+      return *this;
+
+    // discard old
+    discard_buffer();
+
+    // point to other
+    bufferlock.Lock();
+    _buffer = other._buffer;
+    _len = other._len;
+    _off = other._off;
+    if (_buffer) _buffer->_get();
+    bufferlock.Unlock();
+    return *this;
+  }
+
+  ~bufferptr() {
+    discard_buffer();
+  }
+
+  // drop my reference (deleting the buffer if it was the last one) and
+  // forget the buffer.  _len and _off are left as they were.
+  void discard_buffer() {
+    if (_buffer) {
+      bufferlock.Lock();
+      if (_buffer->_put() == 0)
+        delete _buffer;
+      _buffer = 0;
+      bufferlock.Unlock();
+    }
+  }
+
+
+  // dereference to get the actual buffer
+  buffer& operator*() {
+    return *_buffer;
+  }
+
+
+  bool at_buffer_head() const {
+    return _off == 0;
+  }
+  // true if my window ends exactly where the buffer's in-use data ends.
+  bool at_buffer_tail() const {
+    return _off + _len == _buffer->_len;
+  }
+
+  // accessors for my subset
+  char *c_str() {
+    return _buffer->c_str() + _off;
+  }
+  unsigned length() const {
+    return _len;
+  }
+  unsigned offset() const {
+    return _off;
+  }
+  // free space directly after my window; 0 unless i end at the buffer tail.
+  unsigned unused_tail_length() {
+    if (!at_buffer_tail()) return 0;
+    return _buffer->unused_tail_length();
+  }
+
+
+
+  // modifiers
+  void set_offset(unsigned off) {
+    assert(off <= _buffer->_alloc_len);
+    _off = off;
+  }
+  void set_length(unsigned len) {
+    assert(len >= 0 && _off + len <= _buffer->_alloc_len);
+    if (_buffer->_len < _off + len)
+      _buffer->_len = _off + len;    // set new buffer len (_IF_ i'm expanding it)
+    _len = len;                      // my len too
+  }
+  void zero() {
+    //bzero((void*)c_str(), _len);
+    memset((void*)c_str(), 0, _len);
+  }
+
+
+  // crope lookalikes
+
+  // copy len bytes from p to just past the end of my window and grow the
+  // window to cover them.  the bytes must fit in the existing allocation.
+  void append(const char *p, unsigned len) {
+    assert(len + _len + _off <= _buffer->_alloc_len);  // FIXME later for auto-expansion?
+
+    // copy
+    memcpy(c_str() + _len, p, len);
+    _len += len;
+
+    // grow the buffer's own length only as far as my window now reaches
+    // (same rule as set_length).  blindly adding len would push it past
+    // _alloc_len whenever i am not at the buffer tail.
+    if (_buffer->_len < _off + _len)
+      _buffer->_len = _off + _len;
+  }
+  // copy len bytes starting at off (relative to my window) into dest.
+  void copy_out(unsigned off, unsigned len, char *dest) {
+    assert(off >= 0 && off <= _len);
+    assert(len >= 0 && off + len <= _len);
+    memcpy(dest, c_str() + off, len);
+  }
+  // overwrite len bytes starting at off (relative to my window) from src.
+  void copy_in(unsigned off, unsigned len, const char *src) {
+    assert(off >= 0 && off <= _len);
+    assert(len >= 0 && off + len <= _len);
+    memcpy(c_str() + off, src, len);
+  }
+
+  friend ostream& operator<<(ostream& out, bufferptr& bp);
+};
+
+
+// debug dump of a bufferptr: its window, data pointer and underlying buffer.
+// dereferences the underlying buffer, so bp must not be empty.
+inline ostream& operator<<(ostream& out, bufferptr& bp) {
+  out << "bufferptr(len=" << bp._len;
+  out << " off=" << bp._off;
+  out << " cstr=" << (void*)bp.c_str();
+  out << " buf=" << *bp._buffer;
+  out << ")";
+  return out;
+}
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __BUFFERLIST_H
+#define __BUFFERLIST_H
+
+#include "buffer.h"
+
+#include <list>
+#include <map>
+#include <set>
+#include <vector>
+using namespace std;
+
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+
+// debug crap
+#include "config.h"
+#define bdbout(x) if (x <= g_conf.debug_buffer) cout
+
+
+
+// bufferlist - an ordered list of bufferptrs treated as one logical byte
+// string.  the total length is cached in _len.
+class bufferlist {
+ private:
+  /* local state limited to _buffers, and _len.
+   * we maintain _len ourselves, so we must be careful when fiddling with buffers!
+   */
+  list<bufferptr> _buffers;
+  unsigned _len;
+
+ public:
+  // cons/des
+  bufferlist() : _len(0) {
+    bdbout(1) << "bufferlist.cons " << this << endl;
+  }
+  // copies the list of bufferptrs; the underlying buffers are shared.
+  bufferlist(const bufferlist& bl) : _len(0) {
+    //assert(0); // o(n) and stupid
+    bdbout(1) << "bufferlist.cons " << this << endl;
+    _buffers = bl._buffers;
+    _len = bl._len;
+  }
+  ~bufferlist() {
+    bdbout(1) << "bufferlist.des " << this << endl;
+  }
+
+  // takes a const reference so const lists and temporaries can be assigned
+  // from as well; the source is only read.
+  bufferlist& operator=(const bufferlist& bl) {
+    //assert(0);  // actually, this should be fine, just slow (O(n)) and stupid.
+    bdbout(1) << "bufferlist.= " << this << endl;
+    _buffers = bl._buffers;
+    _len = bl._len;
+    return *this;
+  }
+
+
+  // accessors
+  // NOTE: callers that modify the returned list bypass the _len bookkeeping.
+  list<bufferptr>& buffers() {
+    return _buffers;
+  }
+  //list<buffer*>::iterator begin() { return _buffers.begin(); }
+  //list<buffer*>::iterator end() { return _buffers.end(); }
+
+  unsigned length() const {
+#if 0
+    { // DEBUG: verify _len
+      int len = 0;
+      for (list<bufferptr>::iterator it = _buffers.begin();
+           it != _buffers.end();
+           it++) {
+        len += (*it).length();
+      }
+      assert(len == _len);
+    }
+#endif
+    return _len;
+  }
+
+  // append our entire contents to the rope r.
+  void _rope(crope& r) {
+    for (list<bufferptr>::iterator it = _buffers.begin();
+         it != _buffers.end();
+         it++)
+      r.append((*it).c_str(), (*it).length());
+  }
+
+  // modifiers
+  void clear() {
+    _buffers.clear();
+    _len = 0;
+  }
+  void push_front(bufferptr& bp) {
+    _buffers.push_front(bp);
+    _len += bp.length();
+  }
+  // wraps b in a new bufferptr; b must not be referenced yet.
+  void push_front(buffer *b) {
+    bufferptr bp(b);
+    _buffers.push_front(bp);
+    _len += bp.length();
+  }
+  void push_back(bufferptr& bp) {
+    _buffers.push_back(bp);
+    _len += bp.length();
+  }
+  // wraps b in a new bufferptr; b must not be referenced yet.
+  void push_back(buffer *b) {
+    bufferptr bp(b);
+
+    _buffers.push_back(bp);
+    _len += bp.length();
+
+  }
+  // zero the bytes covered by every bufferptr in the list.
+  void zero() {
+    for (list<bufferptr>::iterator it = _buffers.begin();
+         it != _buffers.end();
+         it++)
+      it->zero();
+  }
+
+  // sort-of-like-assignment-op
+  // replace our contents with bl's; bl is left empty.
+  void claim(bufferlist& bl) {
+    // free my buffers
+    clear();
+    claim_append(bl);
+  }
+  // move bl's bufferptrs onto our tail; bl is left empty.
+  void claim_append(bufferlist& bl) {
+    // steal the other guy's buffers
+    _len += bl._len;
+    _buffers.splice( _buffers.end(), bl._buffers );
+    bl._len = 0;
+  }
+
+
+
+
+  // crope lookalikes
+
+  // copy len bytes starting at logical offset off into dest.
+  // the range must lie entirely within the list.
+  void copy(unsigned off, unsigned len, char *dest) {
+    assert(off >= 0);
+    assert(off + len <= length());
+    /*assert(off < length());
+      if (off + len > length())
+      len = length() - off;
+    */
+    // advance to off
+    list<bufferptr>::iterator curbuf = _buffers.begin();
+
+    // skip off
+    while (off > 0) {
+      assert(curbuf != _buffers.end());
+      if (off >= (*curbuf).length()) {
+        // skip this buffer
+        off -= (*curbuf).length();
+        curbuf++;
+      } else {
+        // somewhere in this buffer!
+        break;
+      }
+    }
+
+    // copy
+    while (len > 0) {
+      // is the rest ALL in this buffer?
+      if (off + len <= (*curbuf).length()) {
+        (*curbuf).copy_out(off, len, dest);        // yup, last bit!
+        break;
+      }
+
+      // get as much as we can from this buffer.
+      unsigned howmuch = (*curbuf).length() - off;
+      (*curbuf).copy_out(off, howmuch, dest);
+
+      dest += howmuch;
+      len -= howmuch;
+      off = 0;
+      curbuf++;
+      assert(curbuf != _buffers.end());
+    }
+  }
+
+  // overwrite len bytes starting at logical offset off with bytes from src.
+  // the range must lie entirely within the list; nothing grows.
+  void copy_in(unsigned off, unsigned len, const char *src) {
+    assert(off >= 0);
+    assert(off + len <= length());
+
+    // advance to off
+    list<bufferptr>::iterator curbuf = _buffers.begin();
+
+    // skip off
+    while (off > 0) {
+      assert(curbuf != _buffers.end());
+      if (off >= (*curbuf).length()) {
+        // skip this buffer
+        off -= (*curbuf).length();
+        curbuf++;
+      } else {
+        // somewhere in this buffer!
+        break;
+      }
+    }
+
+    // copy
+    while (len > 0) {
+      // is the rest ALL in this buffer?
+      if (off + len <= (*curbuf).length()) {
+        (*curbuf).copy_in(off, len, src);        // yup, last bit!
+        break;
+      }
+
+      // get as much as we can from this buffer.
+      unsigned howmuch = (*curbuf).length() - off;
+      (*curbuf).copy_in(off, howmuch, src);
+
+      src += howmuch;
+      len -= howmuch;
+      off = 0;
+      curbuf++;
+      assert(curbuf != _buffers.end());
+    }
+  }
+  // overwrite up to len bytes at off with the leading bytes of bl,
+  // one source bufferptr at a time.
+  void copy_in(unsigned off, unsigned len, bufferlist& bl) {
+    unsigned left = len;
+    for (list<bufferptr>::iterator i = bl._buffers.begin();
+         i != bl._buffers.end();
+         i++) {
+      unsigned l = (*i).length();
+      if (left < l) l = left;
+      copy_in(off, l, (*i).c_str());
+      left -= l;
+      if (left == 0) break;
+      off += l;
+    }
+  }
+
+
+  // append a copy of len bytes at data.  fills any unused space behind the
+  // last bufferptr first, then allocates a new buffer with some slack.
+  void append(const char *data, unsigned len) {
+    if (len == 0) return;
+
+    unsigned alen = 0;
+
+    // copy into the tail buffer?
+    if (!_buffers.empty()) {
+      unsigned avail = _buffers.back().unused_tail_length();
+      if (avail > 0) {
+        //cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf" << endl;
+        if (avail > len)
+          avail = len;
+        unsigned blen = _buffers.back().length();
+        memcpy(_buffers.back().c_str() + blen, data, avail);
+        blen += avail;
+        _buffers.back().set_length(blen);
+        _len += avail;
+        data += avail;
+        len -= avail;
+      }
+      alen = _buffers.back().length();
+    }
+    if (len == 0) return;
+
+    // just add another buffer.
+    // alloc a bit extra, in case we do a bunch of appends.   FIXME be smarter!
+    if (alen < 1024) alen = 1024;
+    push_back(new buffer(data, len, BUFFER_MODE_DEFAULT, len+alen));
+  }
+  // append another reference to bp's data (no copy).
+  void append(bufferptr& bp) {
+    push_back(bp);
+  }
+  // append a reference to the len bytes at off within bp (no copy).
+  void append(bufferptr& bp, unsigned len, unsigned off) {
+    bufferptr tempbp(bp, len, off);
+    push_back(tempbp);
+  }
+  // append references to all of bl's buffers; bl itself is untouched.
+  void append(const bufferlist& bl) {
+    bufferlist temp = bl;  // copy list
+    claim_append(temp);    // and append
+  }
+
+
+  /*
+   * return a contiguous ptr to whole bufferlist contents.
+   * returns 0 if the list holds no buffers.  if it holds more than one,
+   * they are first replaced by a single freshly allocated copy.
+   */
+  char *c_str() {
+    if (_buffers.size() == 1) {
+      return _buffers.front().c_str();  // good, we're already contiguous.
+    }
+    else if (_buffers.size() == 0) {
+      return 0;                         // no buffers
+    }
+    else {
+      // make one new contiguous buffer.
+      bufferptr newbuf = new buffer(length());
+      unsigned off = 0;
+
+      for (list<bufferptr>::iterator it = _buffers.begin();
+           it != _buffers.end();
+           it++) {
+        //assert((*(*it)).has_free_func() == false);   // not allowed if there's a funky free_func.. -sage   ...for debugging at least!
+        memcpy(newbuf.c_str() + off,
+               (*it).c_str(), (*it).length());
+        off += (*it).length();
+      }
+      assert(off == newbuf.length());
+
+      _buffers.clear();
+      _buffers.push_back( newbuf );
+
+      // now it'll work.
+      return c_str();
+    }
+  }
+
+
+  // become a view of other[off, off+len): our old contents are dropped and
+  // we reference other's buffers (no copy).
+  void substr_of(bufferlist& other, unsigned off, unsigned len) {
+    assert(off + len <= other.length());
+    clear();
+
+    // skip off
+    list<bufferptr>::iterator curbuf = other._buffers.begin();
+    while (off > 0) {
+      // curbuf walks other's list, so it must be checked against other's
+      // end(); comparing against our own end() is never a valid test.
+      assert(curbuf != other._buffers.end());
+      if (off >= (*curbuf).length()) {
+        // skip this buffer
+        //cout << "skipping over " << *curbuf << endl;
+        off -= (*curbuf).length();
+        curbuf++;
+      } else {
+        // somewhere in this buffer!
+        //cout << "somewhere in " << *curbuf << endl;
+        break;
+      }
+    }
+
+    while (len > 0) {
+      // partial?
+      if (off + len < (*curbuf).length()) {
+        //cout << "copying partial of " << *curbuf << endl;
+        _buffers.push_back( bufferptr( *curbuf, len, off ) );
+        _len += len;
+        break;
+      }
+
+      // through end
+      //cout << "copying end (all?) of " << *curbuf << endl;
+      unsigned howmuch = (*curbuf).length() - off;
+      _buffers.push_back( bufferptr( *curbuf, howmuch, off ) );
+      _len += howmuch;
+      len -= howmuch;
+      off = 0;
+      curbuf++;
+    }
+  }
+
+  // funky modifier
+  // remove len bytes starting at off.  if claim_by is given, references to
+  // the removed bytes are appended to it.
+  void splice(unsigned off, unsigned len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) {    // fixme?
+    assert(off < length());
+    assert(len > 0);
+    //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << endl;
+
+    // skip off
+    list<bufferptr>::iterator curbuf = _buffers.begin();
+    while (off > 0) {
+      assert(curbuf != _buffers.end());
+      if (off >= (*curbuf).length()) {
+        // skip this buffer
+        //cout << "off = " << off << " skipping over " << *curbuf << endl;
+        off -= (*curbuf).length();
+        curbuf++;
+      } else {
+        // somewhere in this buffer!
+        //cout << "off = " << off << " somewhere in " << *curbuf << endl;
+        break;
+      }
+    }
+    assert(off >= 0);
+
+    if (off) {
+      // add a reference to the front bit
+      //  insert it before curbuf (which we'll hose)
+      //cout << "keeping front " << off << " of " << *curbuf << endl;
+      _buffers.insert( curbuf, bufferptr( *curbuf, off, 0 ) );
+      _len += off;
+    }
+
+    while (len > 0) {
+      // only off < length() was checked on entry; reaching end() here means
+      // off+len ran past our contents, so stop before dereferencing it.
+      assert(curbuf != _buffers.end());
+
+      // partial?
+      if (off + len < (*curbuf).length()) {
+        //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl;
+        if (claim_by)
+          claim_by->append( *curbuf, len, off );
+        (*curbuf).set_offset( off+len + (*curbuf).offset() );    // ignore beginning bit
+        (*curbuf).set_length( (*curbuf).length() - (len+off) );
+        _len -= off+len;
+        //cout << " now " << *curbuf << endl;
+        break;
+      }
+
+      // hose though the end
+      unsigned howmuch = (*curbuf).length() - off;
+      //cout << "discarding " << howmuch << " of " << *curbuf << endl;
+      if (claim_by)
+        claim_by->append( *curbuf, howmuch, off );
+      _len -= (*curbuf).length();
+      _buffers.erase( curbuf++ );
+      len -= howmuch;
+      off = 0;
+    }
+
+    // splice in *replace (implement me later?)
+  }
+
+  friend ostream& operator<<(ostream& out, bufferlist& bl);
+
+};
+
+// multi-line debug dump: a header with the total length, one tab-indented
+// line per bufferptr, then a closing paren.
+inline ostream& operator<<(ostream& out, bufferlist& bl) {
+  out << "bufferlist(len=" << bl.length() << endl;
+
+  list<bufferptr>::iterator cur = bl._buffers.begin();
+  const list<bufferptr>::iterator last = bl._buffers.end();
+  while (cur != last) {
+    out << "\t" << *cur << endl;
+    cur++;
+  }
+
+  out << ")" << endl;
+  return out;
+}
+
+
+
+// encoder/decode helpers
+
+// string
+// string: written as its characters followed by the terminating NUL.
+inline void _encode(const string& s, bufferlist& bl)
+{
+  const char *bytes = s.c_str();
+  bl.append(bytes, s.length()+1);   // +1 takes the trailing '\0' along
+}
+// string: reads a NUL-terminated string at off and advances off past the NUL.
+// bl is made contiguous by the c_str() call.
+inline void _decode(string& s, bufferlist& bl, int& off)
+{
+  const char *start = bl.c_str() + off;
+  s = start;
+  off += s.length() + 1;
+}
+
+// bufferptr (encapsulated)
+// bufferptr: a size_t byte count, then the data itself (appended by
+// reference, not copied).
+inline void _encode(bufferptr& bp, bufferlist& bl)
+{
+  size_t nbytes = bp.length();
+  bl.append((char*)&nbytes, sizeof(nbytes));
+  bl.append(bp);
+}
+// bufferptr: reads a size_t byte count and then that many bytes, advancing off.
+inline void _decode(bufferptr& bp, bufferlist& bl, int& off)
+{
+  size_t nbytes;
+  bl.copy(off, sizeof(nbytes), (char*)&nbytes);
+  off += sizeof(nbytes);
+
+  // view the payload in place
+  bufferlist payload;
+  payload.substr_of(bl, off, nbytes);
+  off += nbytes;
+
+  if (payload.buffers().size() != 1)
+    bp = new buffer(payload.c_str(), payload.length());   // not a single piece: copy into one buffer
+  else
+    bp = payload.buffers().front();                       // single piece: share it
+}
+
+// bufferlist (encapsulated)
+// bufferlist: a size_t byte count, then the list's buffers (appended by
+// reference, not copied).
+inline void _encode(const bufferlist& s, bufferlist& bl)
+{
+  size_t nbytes = s.length();
+  bl.append((char*)&nbytes, sizeof(nbytes));
+  bl.append(s);
+}
+// bufferlist: reads a size_t byte count, then makes s a view of that many
+// bytes of bl (no copy), advancing off.
+inline void _decode(bufferlist& s, bufferlist& bl, int& off)
+{
+  size_t nbytes;
+  bl.copy(off, sizeof(nbytes), (char*)&nbytes);
+  off += sizeof(nbytes);
+
+  s.substr_of(bl, off, nbytes);
+  off += nbytes;
+}
+
+
+// set<T>
+// writes an int element count followed by the raw bytes of each element,
+// in set order.  T must be safe to copy byte-wise.
+// takes the set by const reference, like the list<T> and map<T,U> encoders,
+// so const sets can be encoded too.
+template<class T>
+inline void _encode(const set<T>& s, bufferlist& bl)
+{
+  int n = s.size();
+  bl.append((char*)&n, sizeof(n));
+  for (typename set<T>::const_iterator it = s.begin();
+       it != s.end();
+       it++) {
+    T v = *it;
+    bl.append((char*)&v, sizeof(v));
+    n--;
+  }
+  assert(n==0);   // we wrote exactly as many elements as we announced
+}
+// reads an int element count and then that many raw T values into s
+// (which is cleared first), advancing off.
+template<class T>
+inline void _decode(set<T>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  for (int left = count; left > 0; --left) {
+    T item;
+    bl.copy(off, sizeof(item), (char*)&item);
+    off += sizeof(item);
+    s.insert(item);
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// vector<T>
+// writes an int element count followed by the raw bytes of each element,
+// in order.  T must be safe to copy byte-wise.
+// takes the vector by const reference, like the list<T> and map<T,U>
+// encoders, so const vectors can be encoded too.
+template<class T>
+inline void _encode(const vector<T>& s, bufferlist& bl)
+{
+  int n = s.size();
+  bl.append((char*)&n, sizeof(n));
+  for (typename vector<T>::const_iterator it = s.begin();
+       it != s.end();
+       it++) {
+    T v = *it;
+    bl.append((char*)&v, sizeof(v));
+    n--;
+  }
+  assert(n==0);   // we wrote exactly as many elements as we announced
+}
+// reads an int element count, resizes s to that many elements, and fills
+// each slot from the raw bytes that follow, advancing off.
+template<class T>
+inline void _decode(vector<T>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  s = vector<T>(count);
+  int slot = 0;
+  while (slot < count) {
+    T item;
+    bl.copy(off, sizeof(item), (char*)&item);
+    off += sizeof(item);
+    s[slot] = item;
+    slot++;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// list<T>
+// writes an int element count followed by the raw bytes of each element,
+// in list order.
+template<class T>
+inline void _encode(const list<T>& s, bufferlist& bl)
+{
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+
+  typename list<T>::const_iterator cur = s.begin();
+  while (cur != s.end()) {
+    T item = *cur;
+    bl.append((char*)&item, sizeof(item));
+    remaining--;
+    cur++;
+  }
+  assert(remaining==0);
+}
+// reads an int element count and then that many raw T values, appending
+// each to s (which is cleared first) and advancing off.
+template<class T>
+inline void _decode(list<T>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  for (int left = count; left > 0; --left) {
+    T item;
+    bl.copy(off, sizeof(item), (char*)&item);
+    off += sizeof(item);
+    s.push_back(item);
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// map<string,bufferptr>
+// writes an int entry count, then for each entry the key (string encoder)
+// followed by the value (bufferptr encoder), in map order.
+inline void _encode(map<string, bufferptr>& s, bufferlist& bl)
+{
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+
+  map<string, bufferptr>::iterator cur = s.begin();
+  while (cur != s.end()) {
+    _encode(cur->first, bl);
+    _encode(cur->second, bl);
+    remaining--;
+    cur++;
+  }
+  assert(remaining==0);
+}
+// reads an int entry count, then that many (string key, bufferptr value)
+// pairs into s (which is cleared first), advancing off.
+inline void _decode(map<string,bufferptr>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  for (int left = count; left > 0; --left) {
+    string key;
+    _decode(key, bl, off);
+    _decode(s[key], bl, off);   // decode straight into the map slot
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+
+// map<T,bufferlist>
+// writes an int entry count, then for each entry the raw bytes of the key
+// followed by the value (bufferlist encoder), in map order.
+template<class T>
+inline void _encode(const map<T, bufferlist>& s, bufferlist& bl)
+{
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+
+  typename map<T, bufferlist>::const_iterator cur = s.begin();
+  while (cur != s.end()) {
+    T key = cur->first;
+    bl.append((char*)&key, sizeof(key));
+    _encode(cur->second, bl);
+    remaining--;
+    cur++;
+  }
+  assert(remaining==0);
+}
+// reads an int entry count, then that many (raw T key, bufferlist value)
+// pairs into s (which is cleared first), advancing off.
+template<class T>
+inline void _decode(map<T,bufferlist>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  for (int left = count; left > 0; --left) {
+    T key;
+    bl.copy(off, sizeof(key), (char*)&key);
+    off += sizeof(key);
+
+    bufferlist value;
+    _decode(value, bl, off);
+    s[key] = value;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// map<T,U>
+// writes an int entry count, then for each entry the raw bytes of the key
+// followed by the raw bytes of the value, in map order.
+template<class T, class U>
+inline void _encode(const map<T, U>& s, bufferlist& bl)
+{
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+
+  typename map<T, U>::const_iterator cur = s.begin();
+  while (cur != s.end()) {
+    T key = cur->first;
+    U value = cur->second;
+    bl.append((char*)&key, sizeof(key));
+    bl.append((char*)&value, sizeof(value));
+    remaining--;
+    cur++;
+  }
+  assert(remaining==0);
+}
+// reads an int entry count, then that many (raw T key, raw U value) pairs
+// into s (which is cleared first), advancing off.
+template<class T, class U>
+inline void _decode(map<T,U>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  for (int left = count; left > 0; --left) {
+    T key;
+    U value;
+    bl.copy(off, sizeof(key), (char*)&key);
+    off += sizeof(key);
+    bl.copy(off, sizeof(value), (char*)&value);
+    off += sizeof(value);
+    s[key] = value;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+
+
+
+#endif
using std::string;
#include "include/types.h"
-#include "include/bufferlist.h"
+#include "include/buffer.h"
class Anchor {
public:
int npins;
multiset<Message*> pinset;
+ friend class Migrator;
friend class MDCache;
friend class MDS;
friend class CInode;
#define __CDIR_H
#include "include/types.h"
-#include "include/bufferlist.h"
+#include "include/buffer.h"
#include "config.h"
#include "common/DecayCounter.h"
meta_load_t popularity[MDS_NPOP];
// friends
+ friend class Migrator;
friend class CInode;
friend class MDCache;
friend class MDiscover;
// cached inode wrapper
-class CInode : LRUObject {
+class CInode : public LRUObject {
public:
inode_t inode; // the inode itself
meta_load_t popularity[MDS_NPOP];
// friends
+ friend class Migrator;
friend class MDCache;
friend class CDir;
friend class CInodeExport;
#ifndef __CAPABILITY_H
#define __CAPABILITY_H
-#include "include/bufferlist.h"
+#include "include/buffer.h"
#include <map>
using namespace std;
#include "include/types.h"
#include "include/rangeset.h"
-#include "include/bufferlist.h"
+#include "include/buffer.h"
#include "include/Context.h"
class MDS;
#include <set>
using namespace std;
-#include "include/bufferlist.h"
+#include "include/buffer.h"
#include "Capability.h"
*/
friend class MDCache;
+ friend class Migrator;
};
//ostream& operator<<(ostream& out, CLock& l);
#include <string>
using namespace std;
-#include "include/bufferlist.h"
+#include "include/buffer.h"
#include "include/Context.h"
class MDS;
#include "CInode.h"
#include "CDir.h"
#include "MDCache.h"
+#include "Migrator.h"
#include "include/Context.h"
#include "msg/Messenger.h"
CDir *dir = *it;
if (!dir->inode->is_root() && dir->get_size() == 0)
- mds->mdcache->export_empty_import(dir);
+ mds->mdcache->migrator->export_empty_import(dir);
}
}
if (!dir->is_auth()) continue;
dout(0) << "do_hashing hashing " << *dir << endl;
- mds->mdcache->hash_dir(dir);
+ mds->mdcache->migrator->hash_dir(dir);
}
hash_queue.clear();
}
dout(-5) << " exporting idle import " << **it
<< " back to mds" << (*it)->inode->authority()
<< endl;
- mds->mdcache->export_dir(*it, (*it)->inode->authority());
+ mds->mdcache->migrator->export_dir(*it, (*it)->inode->authority());
continue;
}
import_pop_map[ pop ] = *it;
dout(-5) << "reexporting " << *dir
<< " pop " << pop
<< " back to mds" << target << endl;
- mds->mdcache->export_dir(dir, target);
+ mds->mdcache->migrator->export_dir(dir, target);
have += pop;
import_from_map.erase(plast);
import_pop_map.erase(pop);
<< " back to mds" << imp->inode->authority()
<< endl;
have += pop;
- mds->mdcache->export_dir(imp, imp->inode->authority());
+ mds->mdcache->migrator->export_dir(imp, imp->inode->authority());
}
if (amount-have < MIN_OFFLOAD) break;
}
<< " fragment " << **it
<< " pop " << (*it)->popularity[MDS_POP_CURDOM].meta_load()
<< endl;
- mds->mdcache->export_dir(*it, target);
+ mds->mdcache->migrator->export_dir(*it, target);
}
}
#include "MDLog.h"
#include "MDBalancer.h"
#include "AnchorClient.h"
+#include "Migrator.h"
#include "include/filepath.h"
#include "messages/MDiscover.h"
#include "messages/MDiscoverReply.h"
-#include "messages/MExportDirDiscover.h"
-#include "messages/MExportDirDiscoverAck.h"
-#include "messages/MExportDirPrep.h"
-#include "messages/MExportDirPrepAck.h"
-#include "messages/MExportDirWarning.h"
-#include "messages/MExportDir.h"
-#include "messages/MExportDirNotify.h"
-#include "messages/MExportDirNotifyAck.h"
-#include "messages/MExportDirFinish.h"
-
-#include "messages/MHashDirDiscover.h"
-#include "messages/MHashDirDiscoverAck.h"
-#include "messages/MHashDirPrep.h"
-#include "messages/MHashDirPrepAck.h"
-#include "messages/MHashDir.h"
-#include "messages/MHashDirNotify.h"
-#include "messages/MHashDirAck.h"
-
-#include "messages/MUnhashDirPrep.h"
-#include "messages/MUnhashDirPrepAck.h"
-#include "messages/MUnhashDir.h"
-#include "messages/MUnhashDirAck.h"
-#include "messages/MUnhashDirNotify.h"
-#include "messages/MUnhashDirNotifyAck.h"
-
//#include "messages/MInodeUpdate.h"
#include "messages/MDirUpdate.h"
#include "messages/MCacheExpire.h"
MDCache::MDCache(MDS *m)
{
mds = m;
+ migrator = new Migrator(mds, this);
root = NULL;
lru.lru_set_max(g_conf.mds_cache_size);
lru.lru_set_midpoint(g_conf.mds_cache_mid);
show_imports();
}
-class C_MDC_EmptyImport : public Context {
- MDCache *mdc;
- CDir *dir;
-public:
- C_MDC_EmptyImport(MDCache *mdc, CDir *dir) {
- this->mdc = mdc;
- this->dir = dir;
- }
- void finish(int r) {
- mdc->export_empty_import(dir);
- }
-};
-
-void MDCache::export_empty_import(CDir *dir)
-{
- dout(7) << "export_empty_import " << *dir << endl;
-
- return; // hack fixme
-
- if (!dir->is_import()) {
- dout(7) << "not import (anymore?)" << endl;
- return;
- }
- if (dir->inode->is_root()) {
- dout(7) << "root" << endl;
- return;
- }
-
- if (dir->get_size() > 0) {
- dout(7) << "not actually empty" << endl;
- return;
- }
-
- // is it really empty?
- if (!dir->is_complete()) {
- dout(7) << "not complete, fetching." << endl;
- mds->mdstore->fetch_dir(dir,
- new C_MDC_EmptyImport(this,dir));
- return;
- }
-
- int dest = dir->inode->authority();
-
- // comment this out ot wreak havoc?
- //if (mds->is_shutting_down()) dest = 0; // this is more efficient.
-
- dout(7) << "really empty, exporting to " << dest << endl;
- assert (dest != mds->get_nodeid());
-
- dout(-7) << "exporting to mds" << dest
- << " empty import " << *dir << endl;
- export_dir( dir, dest );
-}
bool MDCache::trim(int max) {
if (diri->dir->is_import() && // import
diri->dir->get_size() == 0 && // no children
!diri->is_root()) // not root
- export_empty_import(diri->dir);
+ migrator->export_empty_import(diri->dir);
}
CDir *dir = *it;
if (!dir->is_auth()) continue;
if (dir->is_unhashing()) continue;
- unhash_dir(dir);
+ migrator->unhash_dir(dir);
}
dout(7) << "waiting for dirs to unhash" << endl;
if (im->is_frozen() || im->is_freezing()) continue;
dout(7) << "sending " << *im << " back to mds0" << endl;
- export_dir(im,0);
+ migrator->export_dir(im,0);
}
did_shutdown_exports = true;
}
-/*
- * some import/export helpers
- */
-
-/** con = get_auth_container(dir)
- * Returns the directory in which authority is delegated for *dir.
- * This may be because a directory is an import, or because it is hashed
- * and we are nested underneath an inode in that dir (that hashes to us).
- * Thus do not assume con->is_auth()! It is_auth() || is_hashed().
- */
-CDir *MDCache::get_auth_container(CDir *dir)
-{
- CDir *imp = dir; // might be *dir
-
- // find the underlying import or hash that delegates dir
- while (true) {
- if (imp->is_import()) break; // import
- imp = imp->get_parent_dir();
- assert(imp);
- if (imp->is_hashed()) break; // hash
- }
-
- return imp;
-}
-
-
-void MDCache::find_nested_exports(CDir *dir, set<CDir*>& s)
-{
- CDir *import = get_auth_container(dir);
- find_nested_exports_under(import, dir, s);
-}
-
-void MDCache::find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s)
-{
- dout(10) << "find_nested_exports for " << *dir << endl;
- dout(10) << "find_nested_exports_under import " << *import << endl;
-
- if (import == dir) {
- // yay, my job is easy!
- for (set<CDir*>::iterator p = nested_exports[import].begin();
- p != nested_exports[import].end();
- p++) {
- CDir *nested = *p;
- s.insert(nested);
- dout(10) << "find_nested_exports " << *dir << " " << *nested << endl;
- }
- return;
- }
-
- // ok, my job is annoying.
- for (set<CDir*>::iterator p = nested_exports[import].begin();
- p != nested_exports[import].end();
- p++) {
- CDir *nested = *p;
-
- dout(12) << "find_nested_exports checking " << *nested << endl;
-
- // trace back to import, or dir
- CDir *cur = nested->get_parent_dir();
- while (!cur->is_import() || cur == dir) {
- if (cur == dir) {
- s.insert(nested);
- dout(10) << "find_nested_exports " << *dir << " " << *nested << endl;
- break;
- } else {
- cur = cur->get_parent_dir();
- }
- }
- }
-}
-
-
-
-
break;
- // import
- case MSG_MDS_EXPORTDIRDISCOVER:
- handle_export_dir_discover((MExportDirDiscover*)m);
- break;
- case MSG_MDS_EXPORTDIRPREP:
- handle_export_dir_prep((MExportDirPrep*)m);
- break;
- case MSG_MDS_EXPORTDIR:
- handle_export_dir((MExportDir*)m);
- break;
- case MSG_MDS_EXPORTDIRFINISH:
- handle_export_dir_finish((MExportDirFinish*)m);
- break;
-
- // export
- case MSG_MDS_EXPORTDIRDISCOVERACK:
- handle_export_dir_discover_ack((MExportDirDiscoverAck*)m);
- break;
- case MSG_MDS_EXPORTDIRPREPACK:
- handle_export_dir_prep_ack((MExportDirPrepAck*)m);
- break;
- case MSG_MDS_EXPORTDIRNOTIFYACK:
- handle_export_dir_notify_ack((MExportDirNotifyAck*)m);
- break;
-
- // export 3rd party (inode authority)
- case MSG_MDS_EXPORTDIRWARNING:
- handle_export_dir_warning((MExportDirWarning*)m);
- break;
- case MSG_MDS_EXPORTDIRNOTIFY:
- handle_export_dir_notify((MExportDirNotify*)m);
- break;
-
-
- // hashing
- case MSG_MDS_HASHDIRDISCOVER:
- handle_hash_dir_discover((MHashDirDiscover*)m);
- break;
- case MSG_MDS_HASHDIRDISCOVERACK:
- handle_hash_dir_discover_ack((MHashDirDiscoverAck*)m);
- break;
- case MSG_MDS_HASHDIRPREP:
- handle_hash_dir_prep((MHashDirPrep*)m);
- break;
- case MSG_MDS_HASHDIRPREPACK:
- handle_hash_dir_prep_ack((MHashDirPrepAck*)m);
- break;
- case MSG_MDS_HASHDIR:
- handle_hash_dir((MHashDir*)m);
- break;
- case MSG_MDS_HASHDIRACK:
- handle_hash_dir_ack((MHashDirAck*)m);
- break;
- case MSG_MDS_HASHDIRNOTIFY:
- handle_hash_dir_notify((MHashDirNotify*)m);
- break;
-
- // unhashing
- case MSG_MDS_UNHASHDIRPREP:
- handle_unhash_dir_prep((MUnhashDirPrep*)m);
- break;
- case MSG_MDS_UNHASHDIRPREPACK:
- handle_unhash_dir_prep_ack((MUnhashDirPrepAck*)m);
- break;
- case MSG_MDS_UNHASHDIR:
- handle_unhash_dir((MUnhashDir*)m);
- break;
- case MSG_MDS_UNHASHDIRACK:
- handle_unhash_dir_ack((MUnhashDirAck*)m);
- break;
- case MSG_MDS_UNHASHDIRNOTIFY:
- handle_unhash_dir_notify((MUnhashDirNotify*)m);
- break;
- case MSG_MDS_UNHASHDIRNOTIFYACK:
- handle_unhash_dir_notify_ack((MUnhashDirNotifyAck*)m);
- break;
-
default:
mds->logger->set("ctop", lru.lru_get_top());
mds->logger->set("cbot", lru.lru_get_bot());
mds->logger->set("cptail", lru.lru_get_pintail());
- mds->logger->set("buf",buffer_total_alloc);
+ //mds->logger->set("buf",buffer_total_alloc);
if (g_conf.log_pins) {
// pin
// did i empty out an imported dir?
if (dir->is_import() && !dir->inode->is_root() && dir->get_size() == 0)
- export_empty_import(dir);
+ migrator->export_empty_import(dir);
// wake up any waiters
dir->take_waiting(CDIR_WAIT_ANY, dname, mds->finished_queue);
// did i empty out an imported dir? FIXME this check should go somewhere else???
if (srcdir->is_import() && !srcdir->inode->is_root() && srcdir->get_size() == 0)
- export_empty_import(srcdir);
+ migrator->export_empty_import(srcdir);
// finish our caller
if (c) {
// encode and export inode state
bufferlist inode_state;
- encode_export_inode(in, inode_state, destauth);
+ migrator->encode_export_inode(in, inode_state, destauth);
// send
MRename *m = new MRename(initiator,
// HACK
bufferlist bufstate;
bufstate.claim_append(m->get_inode_state());
- decode_import_inode(destdn, bufstate, off, MSG_ADDR_NUM(m->get_source()));
+ migrator->decode_import_inode(destdn, bufstate, off, MSG_ADDR_NUM(m->get_source()));
CInode *in = destdn->inode;
assert(in);
+/*
+ * some import/export helpers
+ */
-
-
-
-
-
-// ==========================================================
-// IMPORT/EXPORT
-
-
-class C_MDC_ExportFreeze : public Context {
- MDS *mds;
- CDir *ex; // dir i'm exporting
- int dest;
-
-public:
- C_MDC_ExportFreeze(MDS *mds, CDir *ex, int dest) {
- this->mds = mds;
- this->ex = ex;
- this->dest = dest;
- }
- virtual void finish(int r) {
- mds->mdcache->export_dir_frozen(ex, dest);
- }
-};
-
-
-
-/** export_dir(dir, dest)
- * public method to initiate an export.
- * will fail if the directory is freezing, frozen, unpinnable, or root.
+/** con = get_auth_container(dir)
+ * Returns the directory in which authority is delegated for *dir.
+ * This may be because a directory is an import, or because it is hashed
+ * and we are nested underneath an inode in that dir (that hashes to us).
+ * Thus do not assume con->is_auth()! It is_auth() || is_hashed().
+ *
+ * The search starts at dir and climbs parent dirs.  It stops at the first
+ * dir that is an import (this can be dir itself) or at the first ancestor
+ * that is hashed.  Only ancestors are tested with is_hashed(); dir itself
+ * is not.  Each step up is asserted to have a parent dir.
*/
-void MDCache::export_dir(CDir *dir,
- int dest)
+CDir *MDCache::get_auth_container(CDir *dir)
{
- dout(7) << "export_dir " << *dir << " to " << dest << endl;
- assert(dest != mds->get_nodeid());
- assert(!dir->is_hashed());
-
- if (dir->inode->is_root()) {
- dout(7) << "i won't export root" << endl;
- assert(0);
- return;
- }
-
- if (dir->is_frozen() ||
- dir->is_freezing()) {
- dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << endl;
- return;
- }
- if (dir->is_hashed()) {
- dout(7) << "can't export hashed dir right now. implement me carefully later." << endl;
- return;
- }
-
+ CDir *imp = dir; // current candidate; starts at dir, which may itself be the container
- // pin path?
- vector<CDentry*> trace;
- make_trace(trace, dir->inode);
- if (!path_pin(trace, 0, 0)) {
- dout(7) << "export_dir couldn't pin path, failing." << endl;
- return;
+ // climb toward the root until we find the import or hashed dir that delegates dir
+ while (true) {
+ if (imp->is_import()) break; // an import is the container; stop here
+ imp = imp->get_parent_dir();
+ assert(imp); // a non-import dir is expected to have a parent to climb to
+ if (imp->is_hashed()) break; // a hashed ancestor is the container; stop here
}
- // ok, let's go.
-
- // send ExportDirDiscover (ask target)
- export_gather[dir].insert(dest);
- mds->send_message_mds(new MExportDirDiscover(dir->inode), dest, MDS_PORT_CACHE);
- dir->auth_pin(); // pin dir, to hang up our freeze (unpin on prep ack)
-
- // take away the popularity we're sending. FIXME: do this later?
- mds->balancer->subtract_export(dir);
-
-
- // freeze the subtree
- dir->freeze_tree(new C_MDC_ExportFreeze(mds, dir, dest));
+ return imp;
}
-/*
- * called on receipt of MExportDirDiscoverAck
- * the importer now has the directory's _inode_ in memory, and pinned.
- */
-void MDCache::handle_export_dir_discover_ack(MExportDirDiscoverAck *m)
+void MDCache::find_nested_exports(CDir *dir, set<CDir*>& s) // adds to s the nested exports found beneath dir
{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- int from = MSG_ADDR_NUM(m->get_source());
- assert(export_gather[dir].count(from));
- export_gather[dir].erase(from);
-
- if (export_gather[dir].empty()) {
- dout(7) << "export_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
- dir->auth_unpin(); // unpin to allow freeze to complete
- } else {
- dout(7) << "export_dir_discover_ack " << *dir << ", still waiting for " << export_gather[dir] << endl;
- }
-
- delete m; // done
+ CDir *import = get_auth_container(dir); // the import or hashed dir that delegates authority for dir
+ find_nested_exports_under(import, dir, s); // narrow that container's nested exports down to those under dir
}
-
-void MDCache::export_dir_frozen(CDir *dir,
- int dest)
+void MDCache::find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s) // adds to s each entry of nested_exports[import] that lies beneath dir
{
- // subtree is now frozen!
- dout(7) << "export_dir_frozen on " << *dir << " to " << dest << endl;
-
- show_imports();
+ dout(10) << "find_nested_exports for " << *dir << endl;
+ dout(10) << "find_nested_exports_under import " << *import << endl;
- MExportDirPrep *prep = new MExportDirPrep(dir->inode);
+ if (import == dir) {
+ // easy case: dir is the container itself, so every export nested under it qualifies.
+ for (set<CDir*>::iterator p = nested_exports[import].begin();
+ p != nested_exports[import].end();
+ p++) {
+ CDir *nested = *p;
+ s.insert(nested);
+ dout(10) << "find_nested_exports " << *dir << " " << *nested << endl;
+ }
+ return;
+ }
- // include spanning tree for all nested exports.
- // these need to be on the destination _before_ the final export so that
- // dir_auth updates on any nested exports are properly absorbed.
-
- set<inodeno_t> inodes_added;
-
- // include base dir
- prep->add_dir( new CDirDiscover(dir, dir->open_by_add(dest)) );
-
- // also include traces to all nested exports.
- set<CDir*> my_nested;
- find_nested_exports(dir, my_nested);
- for (set<CDir*>::iterator it = my_nested.begin();
- it != my_nested.end();
- it++) {
- CDir *exp = *it;
+ // harder case: dir differs from the container, so keep only the exports whose parent chain passes through dir.
+ for (set<CDir*>::iterator p = nested_exports[import].begin();
+ p != nested_exports[import].end();
+ p++) {
+ CDir *nested = *p;
- dout(7) << " including nested export " << *exp << " in prep" << endl;
-
- prep->add_export( exp->ino() );
-
- /* first assemble each trace, in trace order, and put in message */
- list<CInode*> inode_trace;
-
- // trace to dir
- CDir *cur = exp;
- while (cur != dir) {
- // don't repeat ourselves
- if (inodes_added.count(cur->ino())) break; // did already!
- inodes_added.insert(cur->ino());
-
- CDir *parent_dir = cur->get_parent_dir();
+ dout(12) << "find_nested_exports checking " << *nested << endl;
- // inode?
- assert(cur->inode->is_auth());
- inode_trace.push_front(cur->inode);
- dout(7) << " will add " << *cur->inode << endl;
-
- // include dir? note: this'll include everything except the nested exports themselves,
- // since someone else is obviously auth.
- if (cur->is_auth()) {
- prep->add_dir( new CDirDiscover(cur, cur->open_by_add(dest)) ); // yay!
- dout(7) << " added " << *cur << endl;
+ // walk up from the export's parent: reaching dir keeps the export, reaching an import other than dir drops it.
+ CDir *cur = nested->get_parent_dir();
+ while (!cur->is_import() || cur == dir) {
+ if (cur == dir) {
+ s.insert(nested);
+ dout(10) << "find_nested_exports " << *dir << " " << *nested << endl;
+ break;
+ } else {
+ cur = cur->get_parent_dir(); // NOTE(review): result is not NULL-checked; presumably an import is always reached first -- confirm
}
-
- cur = parent_dir;
- }
-
- for (list<CInode*>::iterator it = inode_trace.begin();
- it != inode_trace.end();
- it++) {
- CInode *in = *it;
- dout(7) << " added " << *in << endl;
- prep->add_inode( in->parent->dir->ino(),
- in->parent->name,
- in->replicate_to(dest) );
}
-
}
-
- // send it!
- mds->send_message_mds(prep, dest, MDS_PORT_CACHE);
-}
-
-void MDCache::handle_export_dir_prep_ack(MExportDirPrepAck *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "export_dir_prep_ack " << *dir << ", starting export" << endl;
-
- // start export.
- export_dir_go(dir, MSG_ADDR_NUM(m->get_source()));
-
- // done
- delete m;
}
-void MDCache::export_dir_go(CDir *dir,
- int dest)
-{
- dout(7) << "export_dir_go " << *dir << " to " << dest << endl;
-
- show_imports();
-
-
- // build export message
- MExportDir *req = new MExportDir(dir->inode); // include pop
-
-
- // update imports/exports
- CDir *containing_import = get_auth_container(dir);
- if (containing_import == dir) {
- dout(7) << " i'm rexporting a previous import" << endl;
- assert(dir->is_import());
- imports.erase(dir);
- dir->state_clear(CDIR_STATE_IMPORT);
- dir->put(CDIR_PIN_IMPORT); // unpin, no longer an import
-
- // discard nested exports (that we're handing off
- for (set<CDir*>::iterator p = nested_exports[dir].begin();
- p != nested_exports[dir].end(); ) {
- CDir *nested = *p;
- p++;
- // add to export message
- req->add_export(nested);
-
- // nested beneath our new export *in; remove!
- dout(7) << " export " << *nested << " was nested beneath us; removing from export list(s)" << endl;
- assert(exports.count(nested) == 1);
- nested_exports[dir].erase(nested);
- }
-
- } else {
- dout(7) << " i'm a subdir nested under import " << *containing_import << endl;
- exports.insert(dir);
- nested_exports[containing_import].insert(dir);
-
- dir->state_set(CDIR_STATE_EXPORT);
- dir->get(CDIR_PIN_EXPORT); // i must keep it pinned
-
- // discard nested exports (that we're handing off)
- for (set<CDir*>::iterator p = nested_exports[containing_import].begin();
- p != nested_exports[containing_import].end(); ) {
- CDir *nested = *p;
- p++;
- if (nested == dir) continue; // ignore myself
-
- // container of parent; otherwise we get ourselves.
- CDir *containing_export = nested->get_parent_dir();
- while (containing_export && !containing_export->is_export())
- containing_export = containing_export->get_parent_dir();
- if (!containing_export) continue;
-
- if (containing_export == dir) {
- // nested beneath our new export *in; remove!
- dout(7) << " export " << *nested << " was nested beneath us; removing from nested_exports" << endl;
- nested_exports[containing_import].erase(nested);
- // exports.erase(nested); _walk does this
-
- // add to msg
- req->add_export(nested);
- } else {
- dout(12) << " export " << *nested << " is under other export " << *containing_export << ", which is unrelated" << endl;
- assert(get_auth_container(containing_export) != containing_import);
- }
- }
- }
-
- // note new authority (locally)
- if (dir->inode->authority() == dest)
- dir->set_dir_auth( CDIR_AUTH_PARENT );
- else
- dir->set_dir_auth( dest );
-
- // make list of nodes i expect an export_dir_notify_ack from
- // (everyone w/ this dir open, but me!)
- assert(export_notify_ack_waiting[dir].empty());
- for (set<int>::iterator it = dir->open_by.begin();
- it != dir->open_by.end();
- it++) {
- if (*it == mds->get_nodeid()) continue;
- export_notify_ack_waiting[dir].insert( *it );
-
- // send warning to all but dest
- if (*it != dest) {
- dout(10) << " sending export_dir_warning to mds" << *it << endl;
- mds->send_message_mds(new MExportDirWarning( dir->ino() ), *it, MDS_PORT_CACHE);
- }
- }
- assert(export_notify_ack_waiting[dir].count( dest ));
-
- // fill export message with cache data
- C_Contexts *fin = new C_Contexts;
- int num_exported_inodes = export_dir_walk( req,
- fin,
- dir, // base
- dir, // recur start point
- dest );
-
- // send the export data!
- mds->send_message_mds(req, dest, MDS_PORT_CACHE);
-
- // queue up the finisher
- dir->add_waiter( CDIR_WAIT_UNFREEZE, fin );
-
-
- // stats
- mds->logger->inc("ex");
- mds->logger->inc("iex", num_exported_inodes);
-
- show_imports();
-}
-
-
-/** encode_export_inode
- * update our local state for this inode to export.
- * encode relevant state to be sent over the wire.
- * used by: export_dir_walk, file_rename (if foreign)
- */
-void MDCache::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth)
-{
- in->version++; // so local log entries are ignored, etc. (FIXME ??)
-
- // tell (all) clients about migrating caps.. mark STALE
- for (map<int, Capability>::iterator it = in->client_caps.begin();
- it != in->client_caps.end();
- it++) {
- dout(7) << "encode_export_inode " << *in << " telling client" << it->first << " stale caps" << endl;
- MClientFileCaps *m = new MClientFileCaps(in->inode,
- it->second.get_last_seq(),
- it->second.pending(),
- it->second.wanted(),
- MClientFileCaps::FILECAP_STALE);
- mds->messenger->send_message(m, MSG_ADDR_CLIENT(it->first), mds->clientmap.get_inst(it->first),
- 0, MDS_PORT_CACHE);
- }
-
- // relax locks?
- if (!in->is_cached_by_anyone())
- in->replicate_relax_locks();
-
- // add inode
- assert(in->cached_by.count(mds->get_nodeid()) == 0);
- CInodeExport istate( in );
- istate._encode( enc_state );
-
- // we're export this inode; fix inode state
- dout(7) << "encode_export_inode " << *in << endl;
-
- if (in->is_dirty()) in->mark_clean();
-
- // clear/unpin cached_by (we're no longer the authority)
- in->cached_by_clear();
-
- // twiddle lock states for auth -> replica transition
- // hard
- in->hardlock.clear_gather();
- if (in->hardlock.get_state() == LOCK_GLOCKR)
- in->hardlock.set_state(LOCK_LOCK);
-
- // file : we lost all our caps, so move to stable state!
- in->filelock.clear_gather();
- if (in->filelock.get_state() == LOCK_GLOCKR ||
- in->filelock.get_state() == LOCK_GLOCKM ||
- in->filelock.get_state() == LOCK_GLOCKL ||
- in->filelock.get_state() == LOCK_GLONERR ||
- in->filelock.get_state() == LOCK_GLONERM ||
- in->filelock.get_state() == LOCK_LONER)
- in->filelock.set_state(LOCK_LOCK);
- if (in->filelock.get_state() == LOCK_GMIXEDR)
- in->filelock.set_state(LOCK_MIXED);
- // this looks like a step backwards, but it's what we want!
- if (in->filelock.get_state() == LOCK_GSYNCM)
- in->filelock.set_state(LOCK_MIXED);
- if (in->filelock.get_state() == LOCK_GSYNCL)
- in->filelock.set_state(LOCK_LOCK);
- if (in->filelock.get_state() == LOCK_GMIXEDL)
- in->filelock.set_state(LOCK_LOCK);
- //in->filelock.set_state(LOCK_MIXED);
-
- // mark auth
- assert(in->is_auth());
- in->set_auth(false);
- in->replica_nonce = CINODE_EXPORT_NONCE;
-
- // *** other state too?
-
- // move to end of LRU so we drop it out of cache quickly!
- lru.lru_bottouch(in);
-}
-
-
-int MDCache::export_dir_walk(MExportDir *req,
- C_Contexts *fin,
- CDir *basedir,
- CDir *dir,
- int newauth)
-{
- int num_exported = 0;
-
- dout(7) << "export_dir_walk " << *dir << " " << dir->nitems << " items" << endl;
-
- // dir
- bufferlist enc_dir;
-
- CDirExport dstate(dir);
- dstate._encode( enc_dir );
-
- // release open_by
- dir->open_by_clear();
-
- // mark
- assert(dir->is_auth());
- dir->state_clear(CDIR_STATE_AUTH);
- dir->replica_nonce = CDIR_NONCE_EXPORT;
-
- // proxy
- dir->state_set(CDIR_STATE_PROXY);
- dir->get(CDIR_PIN_PROXY);
- export_proxy_dirinos[basedir].push_back(dir->ino());
-
- list<CDir*> subdirs;
-
- if (dir->is_hashed()) {
- // fix state
- dir->state_clear( CDIR_STATE_AUTH );
-
- } else {
-
- if (dir->is_dirty())
- dir->mark_clean();
-
- // discard most dir state
- dir->state &= CDIR_MASK_STATE_EXPORT_KEPT; // i only retain a few things.
-
- // suck up all waiters
- list<Context*> waiting;
- dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters
- fin->take(waiting);
-
- // inodes
-
- CDir_map_t::iterator it;
- for (it = dir->begin(); it != dir->end(); it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- num_exported++;
-
- // -- dentry
- dout(7) << "export_dir_walk exporting " << *dn << endl;
- _encode(it->first, enc_dir);
-
- if (dn->is_dirty())
- enc_dir.append("D", 1); // dirty
- else
- enc_dir.append("C", 1); // clean
-
- // null dentry?
- if (dn->is_null()) {
- enc_dir.append("N", 1); // null dentry
- assert(dn->is_sync());
- continue;
- }
-
- if (dn->is_remote()) {
- // remote link
- enc_dir.append("L", 1); // remote link
-
- inodeno_t ino = dn->get_remote_ino();
- enc_dir.append((char*)&ino, sizeof(ino));
- continue;
- }
-
- // primary link
- // -- inode
- enc_dir.append("I", 1); // inode dentry
-
- encode_export_inode(in, enc_dir, newauth); // encode, and (update state for) export
-
- // directory?
- if (in->is_dir() && in->dir) {
- if (in->dir->is_auth()) {
- // nested subdir
- assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT);
- subdirs.push_back(in->dir); // it's ours, recurse (later)
-
- } else {
- // nested export
- assert(in->dir->get_dir_auth() >= 0);
- dout(7) << " encountered nested export " << *in->dir << " dir_auth " << in->dir->get_dir_auth() << "; removing from exports" << endl;
- assert(exports.count(in->dir) == 1);
- exports.erase(in->dir); // discard nested export (nested_exports updated above)
-
- in->dir->state_clear(CDIR_STATE_EXPORT);
- in->dir->put(CDIR_PIN_EXPORT);
-
- // simplify dir_auth?
- if (in->dir->get_dir_auth() == newauth)
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
- }
-
- // add to proxy
- export_proxy_inos[basedir].push_back(in->ino());
- in->state_set(CINODE_STATE_PROXY);
- in->get(CINODE_PIN_PROXY);
-
- // waiters
- list<Context*> waiters;
- in->take_waiting(CINODE_WAIT_ANY, waiters);
- fin->take(waiters);
- }
- }
-
- req->add_dir( enc_dir );
-
- // subdirs
- for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); it++)
- num_exported += export_dir_walk(req, fin, basedir, *it, newauth);
-
- return num_exported;
-}
-
-
-/*
- * i should get an export_dir_notify_ack from every mds that had me open, including the new auth (an ack)
- */
-void MDCache::handle_export_dir_notify_ack(MExportDirNotifyAck *m)
-{
- CInode *diri = get_inode(m->get_ino());
- CDir *dir = diri->dir;
- assert(dir);
- assert(dir->is_frozen_tree_root()); // i'm exporting!
-
- // remove from waiting list
- int from = MSG_ADDR_NUM(m->get_source());
- assert(export_notify_ack_waiting[dir].count(from));
- export_notify_ack_waiting[dir].erase(from);
-
- // done?
- if (!export_notify_ack_waiting[dir].empty()) {
- dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from
- << ", still waiting for " << export_notify_ack_waiting[dir] << endl;
-
- } else {
- dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from
- << ", last one!" << endl;
-
- // ok, we're finished!
- export_notify_ack_waiting.erase(dir);
-
- // finish export (unfreeze, trigger finish context, etc.)
- export_dir_finish(dir);
-
- // unpin proxies
- // inodes
- for (list<inodeno_t>::iterator it = export_proxy_inos[dir].begin();
- it != export_proxy_inos[dir].end();
- it++) {
- CInode *in = get_inode(*it);
- in->put(CINODE_PIN_PROXY);
- assert(in->state_test(CINODE_STATE_PROXY));
- in->state_clear(CINODE_STATE_PROXY);
- }
- export_proxy_inos.erase(dir);
-
- // dirs
- for (list<inodeno_t>::iterator it = export_proxy_dirinos[dir].begin();
- it != export_proxy_dirinos[dir].end();
- it++) {
- CDir *dir = get_inode(*it)->dir;
- dir->put(CDIR_PIN_PROXY);
- assert(dir->state_test(CDIR_STATE_PROXY));
- dir->state_clear(CDIR_STATE_PROXY);
-
- // hose neg dentries, too, since we're no longer auth
- CDir_map_t::iterator it;
- for (it = dir->begin(); it != dir->end(); ) {
- CDentry *dn = it->second;
- it++;
- if (dn->is_null()) {
- assert(dn->is_sync());
- dir->remove_dentry(dn);
- } else {
- //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl;
- if (dn->is_dirty())
- dn->mark_clean();
- }
- }
- }
- export_proxy_dirinos.erase(dir);
-
- }
-
- delete m;
-}
-
-
-/*
- * once i get all teh notify_acks i can finish
- */
-void MDCache::export_dir_finish(CDir *dir)
-{
- // exported!
-
-
- // FIXME log it
-
- // send finish to new auth
- mds->send_message_mds(new MExportDirFinish(dir->ino()), dir->authority(), MDS_PORT_CACHE);
-
- // unfreeze
- dout(7) << "export_dir_finish " << *dir << ", unfreezing" << endl;
- dir->unfreeze_tree();
-
- // unpin path
- dout(7) << "export_dir_finish unpinning path" << endl;
- vector<CDentry*> trace;
- make_trace(trace, dir->inode);
- path_unpin(trace, 0);
-
-
- // stats
- mds->logger->set("nex", exports.size());
-
- show_imports();
-}
-
-
-
-
-
-
-
-
-
-
-
-
-// IMPORTS
-
-class C_MDC_ExportDirDiscover : public Context {
- MDCache *mdc;
- MExportDirDiscover *m;
-public:
- vector<CDentry*> trace;
- C_MDC_ExportDirDiscover(MDCache *mdc, MExportDirDiscover *m) {
- this->mdc = mdc;
- this->m = m;
- }
- void finish(int r) {
- CInode *in = 0;
- if (r >= 0) in = trace[trace.size()-1]->get_inode();
- mdc->handle_export_dir_discover_2(m, in, r);
- }
-};
-
-void MDCache::handle_export_dir_discover(MExportDirDiscover *m)
-{
- assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid());
-
- dout(7) << "handle_export_dir_discover on " << m->get_path() << endl;
-
- // must discover it!
- C_MDC_ExportDirDiscover *onfinish = new C_MDC_ExportDirDiscover(this, m);
- filepath fpath(m->get_path());
- path_traverse(fpath, onfinish->trace, true,
- m, new C_MDS_RetryMessage(mds,m), // on delay/retry
- MDS_TRAVERSE_DISCOVER,
- onfinish); // on completion|error
-}
-
-void MDCache::handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r)
-{
- // yay!
- if (in) {
- dout(7) << "handle_export_dir_discover_2 has " << *in << endl;
- }
-
- if (r < 0 || !in->is_dir()) {
- dout(7) << "handle_export_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
-
- assert(0); // this shouldn't happen if the auth pins his path properly!!!!
-
- mds->send_message_mds(new MExportDirDiscoverAck(m->get_ino(), false),
- m->get_source().num(), MDS_PORT_CACHE);
- delete m;
- return;
- }
-
- assert(in->is_dir());
-
- if (in->is_frozen()) {
- dout(7) << "frozen, waiting." << endl;
- in->add_waiter(CINODE_WAIT_AUTHPINNABLE,
- new C_MDS_RetryMessage(mds,m));
- return;
- }
-
- // pin inode in the cache (for now)
- in->get(CINODE_PIN_IMPORTING);
-
- // pin auth too, until the import completes.
- in->auth_pin();
-
- // reply
- dout(7) << " sending export_dir_discover_ack on " << *in << endl;
- mds->send_message_mds(new MExportDirDiscoverAck(in->ino()),
- m->get_source().num(), MDS_PORT_CACHE);
- delete m;
-}
-
-
-
-void MDCache::handle_export_dir_prep(MExportDirPrep *m)
-{
- assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid());
-
- CInode *diri = get_inode(m->get_ino());
- assert(diri);
-
- list<Context*> finished;
-
- // assimilate root dir.
- CDir *dir = diri->dir;
- if (dir) {
- dout(7) << "handle_export_dir_prep on " << *dir << " (had dir)" << endl;
-
- if (!m->did_assim())
- m->get_dir(diri->ino())->update_dir(dir);
- } else {
- assert(!m->did_assim());
-
- // open dir i'm importing.
- diri->set_dir( new CDir(diri, mds, false) );
- dir = diri->dir;
- m->get_dir(diri->ino())->update_dir(dir);
-
- dout(7) << "handle_export_dir_prep on " << *dir << " (opening dir)" << endl;
-
- diri->take_waiting(CINODE_WAIT_DIR, finished);
- }
- assert(dir->is_auth() == false);
-
- show_imports();
-
- // assimilate contents?
- if (!m->did_assim()) {
- dout(7) << "doing assim on " << *dir << endl;
- m->mark_assim(); // only do this the first time!
-
- // move pin to dir
- diri->put(CINODE_PIN_IMPORTING);
- dir->get(CDIR_PIN_IMPORTING);
-
- // auth pin too
- dir->auth_pin();
- diri->auth_unpin();
-
- // assimilate traces to exports
- for (list<CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- // inode
- CInode *in = get_inode( (*it)->get_ino() );
- if (in) {
- (*it)->update_inode(in);
- dout(7) << " updated " << *in << endl;
- } else {
- in = new CInode(false);
- (*it)->update_inode(in);
-
- // link to the containing dir
- CInode *condiri = get_inode( m->get_containing_dirino(in->ino()) );
- assert(condiri && condiri->dir);
- add_inode( in );
- condiri->dir->add_dentry( m->get_dentry(in->ino()), in );
-
- dout(7) << " added " << *in << endl;
- }
-
- assert( in->get_parent_dir()->ino() == m->get_containing_dirino(in->ino()) );
-
- // dir
- if (m->have_dir(in->ino())) {
- if (in->dir) {
- m->get_dir(in->ino())->update_dir(in->dir);
- dout(7) << " updated " << *in->dir << endl;
- } else {
- in->set_dir( new CDir(in, mds, false) );
- m->get_dir(in->ino())->update_dir(in->dir);
- dout(7) << " added " << *in->dir << endl;
- in->take_waiting(CINODE_WAIT_DIR, finished);
- }
- }
- }
-
- // open export dirs?
- for (list<inodeno_t>::iterator it = m->get_exports().begin();
- it != m->get_exports().end();
- it++) {
- dout(7) << " checking dir " << hex << *it << dec << endl;
- CInode *in = get_inode(*it);
- assert(in);
-
- if (!in->dir) {
- dout(7) << " opening nested export on " << *in << endl;
- open_remote_dir(in,
- new C_MDS_RetryMessage(mds, m));
-
- // pin it!
- in->get(CINODE_PIN_OPENINGDIR);
- in->state_set(CINODE_STATE_OPENINGDIR);
- }
- }
- } else {
- dout(7) << " not doing assim on " << *dir << endl;
- }
-
-
- // verify we have all exports
- int waiting_for = 0;
- for (list<inodeno_t>::iterator it = m->get_exports().begin();
- it != m->get_exports().end();
- it++) {
- inodeno_t ino = *it;
- CInode *in = get_inode(ino);
- if (!in) dout(0) << "** missing ino " << hex << ino << dec << endl;
- assert(in);
- if (in->dir) {
- if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
- dout(7) << " pinning nested export " << *in->dir << endl;
- in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
- in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
-
- if (in->state_test(CINODE_STATE_OPENINGDIR)) {
- in->put(CINODE_PIN_OPENINGDIR);
- in->state_clear(CINODE_STATE_OPENINGDIR);
- }
- } else {
- dout(7) << " already pinned nested export " << *in << endl;
- }
- } else {
- dout(7) << " waiting for nested export dir on " << *in << endl;
- waiting_for++;
- }
- }
- if (waiting_for) {
- dout(7) << " waiting for " << waiting_for << " nested export dir opens" << endl;
- } else {
- // ok!
- dout(7) << " all ready, sending export_dir_prep_ack on " << *dir << endl;
- mds->send_message_mds(new MExportDirPrepAck(dir->ino()),
- m->get_source().num(), MDS_PORT_CACHE);
-
- // done
- delete m;
- }
-
- // finish waiters
- finish_contexts(finished, 0);
-}
-
-
-
-
-/* this guy waits for the pre-import discovers on hashed directory dir inodes to finish.
- * if it's the last one on the dir, it reprocessed the import.
- */
-/*
-class C_MDS_ImportPrediscover : public Context {
-public:
- MDS *mds;
- MExportDir *m;
- inodeno_t dir_ino;
- string dentry;
- C_MDS_ImportPrediscover(MDS *mds, MExportDir *m, inodeno_t dir_ino, const string& dentry) {
- this->mds = mds;
- this->m = m;
- this->dir_ino = dir_ino;
- this->dentry = dentry;
- }
- virtual void finish(int r) {
- assert(r == 0); // should never fail!
-
- m->remove_prediscover(dir_ino, dentry);
-
- if (!m->any_prediscovers())
- mds->mdcache->handle_export_dir(m);
- }
-};
-*/
-
-
-
-void MDCache::handle_export_dir(MExportDir *m)
-{
- CInode *diri = get_inode(m->get_ino());
- assert(diri);
- CDir *dir = diri->dir;
- assert(dir);
-
- int oldauth = MSG_ADDR_NUM(m->get_source());
- dout(7) << "handle_export_dir, import " << *dir << " from " << oldauth << endl;
- assert(dir->is_auth() == false);
-
-
-
- show_imports();
-
- // note new authority (locally)
- if (dir->inode->is_auth())
- dir->set_dir_auth( CDIR_AUTH_PARENT );
- else
- dir->set_dir_auth( mds->get_nodeid() );
- dout(10) << " set dir_auth to " << dir->get_dir_auth() << endl;
-
- // update imports/exports
- CDir *containing_import;
- if (exports.count(dir)) {
- // reimporting
- dout(7) << " i'm reimporting " << *dir << endl;
- exports.erase(dir);
-
- dir->state_clear(CDIR_STATE_EXPORT);
- dir->put(CDIR_PIN_EXPORT); // unpin, no longer an export
-
- containing_import = get_auth_container(dir);
- dout(7) << " it is nested under import " << *containing_import << endl;
- nested_exports[containing_import].erase(dir);
- } else {
- // new import
- imports.insert(dir);
- dir->state_set(CDIR_STATE_IMPORT);
- dir->get(CDIR_PIN_IMPORT); // must keep it pinned
-
- containing_import = dir; // imported exports nested under *in
-
- dout(7) << " new import at " << *dir << endl;
- }
-
-
- // take out my temp pin
- dir->put(CDIR_PIN_IMPORTING);
-
- // add any inherited exports
- for (list<inodeno_t>::iterator it = m->get_exports().begin();
- it != m->get_exports().end();
- it++) {
- CInode *exi = get_inode(*it);
- assert(exi && exi->dir);
- CDir *ex = exi->dir;
-
- dout(15) << " nested export " << *ex << endl;
-
- // remove our pin
- ex->put(CDIR_PIN_IMPORTINGEXPORT);
- ex->state_clear(CDIR_STATE_IMPORTINGEXPORT);
-
-
- // add...
- if (ex->is_import()) {
- dout(7) << " importing my import " << *ex << endl;
- imports.erase(ex);
- ex->state_clear(CDIR_STATE_IMPORT);
-
- mds->logger->inc("imex");
-
- // move nested exports under containing_import
- for (set<CDir*>::iterator it = nested_exports[ex].begin();
- it != nested_exports[ex].end();
- it++) {
- dout(7) << " moving nested export " << **it << " under " << *containing_import << endl;
- nested_exports[containing_import].insert(*it);
- }
- nested_exports.erase(ex); // de-list under old import
-
- ex->set_dir_auth( CDIR_AUTH_PARENT );
- ex->put(CDIR_PIN_IMPORT); // imports are pinned, no longer import
-
- } else {
- dout(7) << " importing export " << *ex << endl;
-
- // add it
- ex->state_set(CDIR_STATE_EXPORT);
- ex->get(CDIR_PIN_EXPORT); // all exports are pinned
- exports.insert(ex);
- nested_exports[containing_import].insert(ex);
- mds->logger->inc("imex");
- }
-
- }
-
-
- // add this crap to my cache
- list<inodeno_t> imported_subdirs;
- bufferlist dir_state;
- dir_state.claim( m->get_state() );
- int off = 0;
- int num_imported_inodes = 0;
-
- for (int i = 0; i < m->get_ndirs(); i++) {
- num_imported_inodes +=
- import_dir_block(dir_state,
- off,
- oldauth,
- dir, // import root
- imported_subdirs);
- }
- dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl;
- dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl;
-
-
- // adjust popularity
- mds->balancer->add_import(dir);
-
- // send notify's etc.
- dout(7) << "sending notifyack for " << *dir << " to old auth " << MSG_ADDR_NUM(m->get_source()) << endl;
- mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()),
- m->get_source().num(), MDS_PORT_CACHE);
-
- dout(7) << "sending notify to others" << endl;
- for (set<int>::iterator it = dir->open_by.begin();
- it != dir->open_by.end();
- it++) {
- assert( *it != mds->get_nodeid() );
- if ( *it == MSG_ADDR_NUM(m->get_source()) ) continue; // not to old auth.
-
- MExportDirNotify *notify = new MExportDirNotify(dir->ino(), MSG_ADDR_NUM(m->get_source()), mds->get_nodeid());
- notify->copy_exports(m->get_exports());
-
- if (g_conf.mds_verify_export_dirauth)
- notify->copy_subdirs(imported_subdirs); // copy subdir list (DEBUG)
-
- mds->send_message_mds(notify, *it, MDS_PORT_CACHE);
- }
-
- // done
- delete m;
-
- show_imports();
-
-
- // is it empty?
- if (dir->get_size() == 0 &&
- !dir->inode->is_auth()) {
- // reexport!
- export_empty_import(dir);
- }
-
-
- // some stats
- mds->logger->inc("im");
- mds->logger->inc("iim", num_imported_inodes);
- mds->logger->set("nim", imports.size());
-
-
- // FIXME LOG IT
-
- /*
- stupid hashing crap, FIXME
-
- // wait for replicas in hashed dirs?
- if (import_hashed_replicate_waiting.count(m->get_ino())) {
- // it'll happen later!, when i get my inodegetreplicaack's back
- } else {
- // finish now
- //not anymoreimport_dir_finish(dir);
- }
- */
-
-}
-
-
-
-void MDCache::handle_export_dir_finish(MExportDirFinish *m)
-{
- CInode *diri = get_inode(m->get_ino());
- CDir *dir = diri->dir;
- assert(dir);
-
- dout(7) << "handle_export_dir_finish on " << *dir << endl;
- assert(dir->is_auth());
-
- dout(5) << "done with import of " << *dir << endl;
- show_imports();
- mds->logger->set("nex", exports.size());
- mds->logger->set("nim", imports.size());
-
- // un auth pin (other exports can now proceed)
- dir->auth_unpin();
-
- // ok now finish contexts
- dout(5) << "finishing any waiters on imported data" << endl;
- dir->finish_waiting(CDIR_WAIT_IMPORTED);
-
- delete m;
-}
-
-
-void MDCache::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth)
-{
- CInodeExport istate;
- off = istate._decode(bl, off);
- dout(15) << "got a cinodeexport " << endl;
-
- bool added = false;
- CInode *in = get_inode(istate.get_ino());
- if (!in) {
- in = new CInode;
- added = true;
- } else {
- in->set_auth(true);
- }
-
- // link before state
- if (dn->inode != in) {
- assert(!dn->inode);
- dn->dir->link_inode(dn, in);
- }
-
- // state after link
- set<int> merged_client_caps;
- istate.update_inode(in, merged_client_caps);
-
-
- // add inode?
- if (added) {
- add_inode(in);
- dout(10) << "added " << *in << endl;
- } else {
- dout(10) << " had " << *in << endl;
- }
-
-
- // cached_by
- assert(!in->is_cached_by(oldauth));
- in->cached_by_add( oldauth, CINODE_EXPORT_NONCE );
- if (in->is_cached_by(mds->get_nodeid()))
- in->cached_by_remove(mds->get_nodeid());
-
- // twiddle locks
- // hard
- if (in->hardlock.get_state() == LOCK_GLOCKR) {
- in->hardlock.gather_set.erase(mds->get_nodeid());
- in->hardlock.gather_set.erase(oldauth);
- if (in->hardlock.gather_set.empty())
- inode_hard_eval(in);
- }
-
- // caps
- for (set<int>::iterator it = merged_client_caps.begin();
- it != merged_client_caps.end();
- it++) {
- MClientFileCaps *caps = new MClientFileCaps(in->inode,
- in->client_caps[*it].get_last_seq(),
- in->client_caps[*it].pending(),
- in->client_caps[*it].wanted(),
- MClientFileCaps::FILECAP_REAP);
- caps->set_mds( oldauth ); // reap from whom?
- mds->messenger->send_message(caps,
- MSG_ADDR_CLIENT(*it), mds->clientmap.get_inst(*it),
- 0, MDS_PORT_CACHE);
- }
-
- // filelock
- if (!in->filelock.is_stable()) {
- // take me and old auth out of gather set
- in->filelock.gather_set.erase(mds->get_nodeid());
- in->filelock.gather_set.erase(oldauth);
- if (in->filelock.gather_set.empty()) // necessary but not suffient...
- inode_file_eval(in);
- }
-
- // other
- if (in->is_dirty()) {
- dout(10) << "logging dirty import " << *in << endl;
- mds->mdlog->submit_entry(new EInodeUpdate(in));
- }
-}
-
-
-int MDCache::import_dir_block(bufferlist& bl,
- int& off,
- int oldauth,
- CDir *import_root,
- list<inodeno_t>& imported_subdirs)
-{
- // set up dir
- CDirExport dstate;
- off = dstate._decode(bl, off);
-
- CInode *diri = get_inode(dstate.get_ino());
- assert(diri);
- CDir *dir = diri->get_or_open_dir(mds);
- assert(dir);
-
- dout(7) << " import_dir_block " << *dir << " have " << dir->nitems << " items, importing " << dstate.get_nden() << " dentries" << endl;
-
- // add to list
- if (dir != import_root)
- imported_subdirs.push_back(dir->ino());
-
- // assimilate state
- dstate.update_dir( dir );
- if (diri->is_auth())
- dir->set_dir_auth( CDIR_AUTH_PARENT ); // update_dir may hose dir_auth
-
- // mark (may already be marked from get_or_open_dir() above)
- if (!dir->is_auth())
- dir->state_set(CDIR_STATE_AUTH);
-
- // open_by
- assert(!dir->is_open_by(oldauth));
- dir->open_by_add(oldauth);
- if (dir->is_open_by(mds->get_nodeid()))
- dir->open_by_remove(mds->get_nodeid());
-
- if (dir->is_hashed()) {
-
- // do nothing; dir is hashed
- return 0;
- } else {
- // take all waiters on this dir
- // NOTE: a pass of imported data is guaranteed to get all of my waiters because
- // a replica's presense in my cache implies/forces it's presense in authority's.
- list<Context*> waiters;
-
- dir->take_waiting(CDIR_WAIT_ANY, waiters);
- for (list<Context*>::iterator it = waiters.begin();
- it != waiters.end();
- it++)
- import_root->add_waiter(CDIR_WAIT_IMPORTED, *it);
-
- dout(15) << "doing contents" << endl;
-
- // contents
- int num_imported = 0;
- long nden = dstate.get_nden();
-
- for (; nden>0; nden--) {
-
- num_imported++;
-
- // dentry
- string dname;
- _decode(dname, bl, off);
- dout(15) << "dname is " << dname << endl;
-
- char dirty;
- bl.copy(off, 1, &dirty);
- off++;
-
- char icode;
- bl.copy(off, 1, &icode);
- off++;
-
- CDentry *dn = dir->lookup(dname);
- if (!dn)
- dn = dir->add_dentry(dname); // null
-
- // mark dn dirty _after_ we link the inode (scroll down)
-
- if (icode == 'N') {
- // null dentry
- assert(dn->is_null());
-
- // fall thru
- }
- else if (icode == 'L') {
- // remote link
- inodeno_t ino;
- bl.copy(off, sizeof(ino), (char*)&ino);
- off += sizeof(ino);
- dir->link_inode(dn, ino);
- }
- else if (icode == 'I') {
- // inode
- decode_import_inode(dn, bl, off, oldauth);
- }
-
- // mark dentry dirty? (only _after_ we link the inode!)
- if (dirty == 'D') dn->mark_dirty();
-
- }
-
- if (dir->is_dirty())
- mds->mdlog->submit_entry(new EDirUpdate(dir));
-
- return num_imported;
- }
-}
-
-
-
-
-
-// authority bystander
-
-void MDCache::handle_export_dir_warning(MExportDirWarning *m)
-{
- // add to warning list
- stray_export_warnings.insert( m->get_ino() );
-
- // did i already see the notify?
- if (stray_export_notifies.count(m->get_ino())) {
- // i did, we're good.
- dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". already got notify." << endl;
-
- // process the notify
- map<inodeno_t, MExportDirNotify*>::iterator it = stray_export_notifies.find(m->get_ino());
- handle_export_dir_notify(it->second);
- stray_export_notifies.erase(it);
- } else {
- dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". waiting for notify." << endl;
- }
-
- // done
- delete m;
-}
-
-
-void MDCache::handle_export_dir_notify(MExportDirNotify *m)
-{
- CDir *dir = 0;
- CInode *in = get_inode(m->get_ino());
- if (in) dir = in->dir;
-
- // did i see the warning yet?
- if (!stray_export_warnings.count(m->get_ino())) {
- // wait for it.
- dout(7) << "export_dir_notify on " << m->get_ino() << ", waiting for warning." << endl;
- stray_export_notifies.insert(pair<inodeno_t, MExportDirNotify*>( m->get_ino(), m ));
- return;
- }
-
- // i did, we're all good.
- dout(7) << "export_dir_notify on " << m->get_ino() << ", already saw warning." << endl;
-
- // update dir_auth!
- if (dir) {
- dout(7) << "export_dir_notify on " << *dir << " new_auth " << m->get_new_auth() << " (old_auth " << m->get_old_auth() << ")" << endl;
-
- // update bounds first
- for (list<inodeno_t>::iterator it = m->get_exports().begin();
- it != m->get_exports().end();
- it++) {
- CInode *n = get_inode(*it);
- if (!n) continue;
- CDir *ndir = n->dir;
- if (!ndir) continue;
-
- int boundauth = ndir->authority();
- dout(7) << "export_dir_notify bound " << *ndir << " was dir_auth " << ndir->get_dir_auth() << " (" << boundauth << ")" << endl;
- if (ndir->get_dir_auth() == CDIR_AUTH_PARENT) {
- if (boundauth != m->get_new_auth())
- ndir->set_dir_auth( boundauth );
- else assert(dir->authority() == m->get_new_auth()); // apparently we already knew!
- } else {
- if (boundauth == m->get_new_auth())
- ndir->set_dir_auth( CDIR_AUTH_PARENT );
- }
- }
-
- // update dir_auth
- if (in->authority() == m->get_new_auth()) {
- dout(7) << "handle_export_dir_notify on " << *in << ": inode auth is the same, setting dir_auth -1" << endl;
- dir->set_dir_auth( CDIR_AUTH_PARENT );
- assert(!in->is_auth());
- assert(!dir->is_auth());
- } else {
- dir->set_dir_auth( m->get_new_auth() );
- }
- assert(dir->authority() != mds->get_nodeid());
- assert(!dir->is_auth());
-
- // DEBUG: verify subdirs
- if (g_conf.mds_verify_export_dirauth) {
-
- dout(7) << "handle_export_dir_notify on " << *dir << " checking " << m->num_subdirs() << " subdirs" << endl;
- for (list<inodeno_t>::iterator it = m->subdirs_begin();
- it != m->subdirs_end();
- it++) {
- CInode *diri = get_inode(*it);
- if (!diri) continue; // don't have it, don't care
- if (!diri->dir) continue;
- dout(10) << "handle_export_dir_notify checking subdir " << *diri->dir << " is auth " << diri->dir->get_dir_auth() << endl;
- assert(diri->dir != dir); // base shouldn't be in subdir list
- if (diri->dir->get_dir_auth() != CDIR_AUTH_PARENT) {
- dout(7) << "*** weird value for dir_auth " << diri->dir->get_dir_auth() << " on " << *diri->dir << ", should have been -1 probably??? ******************" << endl;
- assert(0); // bad news!
- //dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
- assert(diri->dir->authority() == m->get_new_auth());
- }
- }
- }
-
- // send notify ack to old auth
- dout(7) << "handle_export_dir_notify sending ack to old_auth " << m->get_old_auth() << endl;
- mds->send_message_mds(new MExportDirNotifyAck(m->get_ino()),
- m->get_old_auth(), MDS_PORT_CACHE);
-
-
- // done
- stray_export_warnings.erase( m->get_ino() );
- delete m;
-}
-
-
-
-
-
-// =======================================================================
-// HASHING
-
-
-void MDCache::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth)
-{
- int off = 0;
-
- for (; nden>0; nden--) {
- // dentry
- string dname;
- _decode(dname, bl, off);
- dout(15) << "dname is " << dname << endl;
-
- char icode;
- bl.copy(off, 1, &icode);
- off++;
-
- CDentry *dn = dir->lookup(dname);
- if (!dn)
- dn = dir->add_dentry(dname); // null
-
- // mark dn dirty _after_ we link the inode (scroll down)
-
- if (icode == 'N') {
-
- // null dentry
- assert(dn->is_null());
-
- // fall thru
- }
- else if (icode == 'L') {
- // remote link
- inodeno_t ino;
- bl.copy(off, sizeof(ino), (char*)&ino);
- off += sizeof(ino);
- dir->link_inode(dn, ino);
- }
- else if (icode == 'I') {
- // inode
- decode_import_inode(dn, bl, off, oldauth);
-
- // fix up subdir export?
- if (dn->inode->dir) {
- assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTINGEXPORT));
- dn->inode->dir->put(CDIR_PIN_IMPORTINGEXPORT);
- dn->inode->dir->state_clear(CDIR_STATE_IMPORTINGEXPORT);
-
- if (dn->inode->dir->is_auth()) {
- // mine. must have been an import.
- assert(dn->inode->dir->is_import());
- dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl;
- dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT );
- imports.erase(dn->inode->dir);
- dn->inode->dir->put(CDIR_PIN_IMPORT);
- dn->inode->dir->state_clear(CDIR_STATE_IMPORT);
-
- // move nested under hashdir
- for (set<CDir*>::iterator it = nested_exports[dn->inode->dir].begin();
- it != nested_exports[dn->inode->dir].end();
- it++)
- nested_exports[dir].insert(*it);
- nested_exports.erase(dn->inode->dir);
-
- // now it matches the inode
- dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
- else {
- // not mine. make it an export.
- dout(7) << "making subdir into export " << *dn->inode->dir << endl;
- dn->inode->dir->get(CDIR_PIN_EXPORT);
- dn->inode->dir->state_set(CDIR_STATE_EXPORT);
- exports.insert(dn->inode->dir);
- nested_exports[dir].insert(dn->inode->dir);
-
- if (dn->inode->dir->get_dir_auth() == CDIR_AUTH_PARENT)
- dn->inode->dir->set_dir_auth( oldauth ); // no longer matches inode
- assert(dn->inode->dir->get_dir_auth() >= 0);
- }
- }
- }
-
- // mark dentry dirty? (only _after_ we link the inode!)
- dn->mark_dirty();
- }
-}
-
-/*
-
- notes on interaction of hashing and export/import:
-
- - dir->is_auth() is completely independent of hashing. for a hashed dir,
- - all nodes are partially authoritative
- - all nodes dir->is_hashed() == true
- - all nodes dir->inode->dir_is_hashed() == true
- - one node dir->is_auth() == true, the rest == false
- - dir_auth for all subdirs in a hashed dir will (likely?) be explicit.
-
- - remember simple rule: dir auth follows inode, unless dir_auth is explicit.
-
- - export_dir_walk and import_dir_block take care with dir_auth: (for import/export)
- - on export, -1 is changed to mds->get_nodeid()
- - on import, nothing special, actually.
-
- - hashed dir files aren't included in export; subdirs are converted to imports
- or exports as necessary.
- - hashed dir subdirs are discovered on export. this is important
- because dirs are needed to tie together auth hierarchy, for auth to know about
- imports/exports, etc.
-
- - dir state is maintained on auth.
- - COMPLETE and HASHED are transfered to importers.
- - DIRTY is set everywhere.
-
- - hashed dir is like an import: hashed dir used for nested_exports map.
- - nested_exports is updated appropriately on auth and replicas.
- - a subtree terminates as a hashed dir, since the hashing explicitly
- redelegates all inodes. thus export_dir_walk includes hashed dirs, but
- not their inodes.
-*/
-
-// HASH on auth
-
-class C_MDC_HashFreeze : public Context {
-public:
- MDS *mds;
- CDir *dir;
- C_MDC_HashFreeze(MDS *mds, CDir *dir) {
- this->mds = mds;
- this->dir = dir;
- }
- virtual void finish(int r) {
- mds->mdcache->hash_dir_frozen(dir);
- }
-};
-
-class C_MDC_HashComplete : public Context {
-public:
- MDS *mds;
- CDir *dir;
- C_MDC_HashComplete(MDS *mds, CDir *dir) {
- this->mds = mds;
- this->dir = dir;
- }
- virtual void finish(int r) {
- mds->mdcache->hash_dir_complete(dir);
- }
-};
-
-
-/** hash_dir(dir)
- * start hashing a directory.
- */
-void MDCache::hash_dir(CDir *dir)
-{
- dout(-7) << "hash_dir " << *dir << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
-
- if (dir->is_frozen() ||
- dir->is_freezing()) {
- dout(7) << " can't hash, freezing|frozen." << endl;
- return;
- }
-
- // pin path?
- vector<CDentry*> trace;
- make_trace(trace, dir->inode);
- if (!path_pin(trace, 0, 0)) {
- dout(7) << "hash_dir couldn't pin path, failing." << endl;
- return;
- }
-
- // ok, go
- dir->state_set(CDIR_STATE_HASHING);
- dir->get(CDIR_PIN_HASHING);
- assert(dir->hashed_subset.empty());
-
- // discover on all mds
- assert(hash_gather.count(dir) == 0);
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue; // except me
- hash_gather[dir].insert(i);
- mds->send_message_mds(new MHashDirDiscover(dir->inode), i, MDS_PORT_CACHE);
- }
- dir->auth_pin(); // pin until discovers are all acked.
-
- // start freeze
- dir->freeze_dir(new C_MDC_HashFreeze(mds, dir));
-
- // make complete
- if (!dir->is_complete()) {
- dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl;
- mds->mdstore->fetch_dir(dir,
- new C_MDC_HashComplete(mds, dir));
- } else
- hash_dir_complete(dir);
-}
-
-
-/*
- * wait for everybody to discover and open the hashing dir
- * then auth_unpin, to let the freeze happen
- */
-void MDCache::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- int from = MSG_ADDR_NUM(m->get_source());
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- hash_gather.erase(dir);
- dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
- dir->auth_unpin(); // unpin to allow freeze to complete
- } else {
- dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl;
- }
-
- delete m; // done
-}
-
-
-
-/*
- * once the dir is completely in memory,
- * mark all migrating inodes dirty (to pin in cache)
- */
-void MDCache::hash_dir_complete(CDir *dir)
-{
- dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
-
- // mark dirty to pin in cache
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CInode *in = it->second->inode;
- in->mark_dirty();
- }
-
- if (dir->is_frozen_dir())
- hash_dir_go(dir);
-}
-
-
-/*
- * once the dir is frozen,
- * make sure it's complete
- * send the prep messages!
- */
-void MDCache::hash_dir_frozen(CDir *dir)
-{
- dout(7) << "hash_dir_frozen " << *dir << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
-
- if (!dir->is_complete()) {
- dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl;
- return;
- }
-
- // send prep messages w/ export directories to open
- vector<MHashDirPrep*> msgs(mds->get_mds_map()->get_num_mds());
-
- // check for subdirs
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- if (!in->is_dir()) continue;
- if (!in->dir) continue;
-
- int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode == mds->get_nodeid()) continue;
-
- // msg?
- if (msgs[dentryhashcode] == 0) {
- msgs[dentryhashcode] = new MHashDirPrep(dir->ino());
- }
- msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode));
- }
-
- // send them!
- assert(hash_gather[dir].empty());
- for (unsigned i=0; i<msgs.size(); i++) {
- if (msgs[i]) {
- mds->send_message_mds(msgs[i], i, MDS_PORT_CACHE);
- hash_gather[dir].insert(i);
- }
- }
-
- if (hash_gather[dir].empty()) {
- // no subdirs! continue!
- hash_gather.erase(dir);
- hash_dir_go(dir);
- } else {
- // wait!
- }
-}
-
-/*
- * wait for peers to open all subdirs
- */
-void MDCache::handle_hash_dir_prep_ack(MHashDirPrepAck *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- int from = MSG_ADDR_NUM(m->get_source());
-
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- hash_gather.erase(dir);
- dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl;
- hash_dir_go(dir);
- } else {
- dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
- }
-
- delete m;
-}
-
-
-/*
- * once the dir is frozen,
- * make sure it's complete
- * do the hashing!
- */
-void MDCache::hash_dir_go(CDir *dir)
-{
- dout(7) << "hash_dir_go " << *dir << endl;
-
- assert(!dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
-
- // get messages to other nodes ready
- vector<MHashDir*> msgs(mds->get_mds_map()->get_num_mds());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- msgs[i] = new MHashDir(dir->ino());
- }
-
- // pick a hash seed.
- dir->inode->inode.hash_seed = 1;//dir->ino();
-
- // suck up all waiters
- C_Contexts *fin = new C_Contexts;
- list<Context*> waiting;
- dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters
- fin->take(waiting);
-
- // get containing import. might be me.
- CDir *containing_import = get_auth_container(dir);
- assert(containing_import != dir || dir->is_import());
-
- // divy up contents
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode == mds->get_nodeid()) {
- continue; // still mine!
- }
-
- bufferlist *bl = msgs[dentryhashcode]->get_state_ptr();
- assert(bl);
-
- // -- dentry
- dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
- _encode(it->first, *bl);
-
- // null dentry?
- if (dn->is_null()) {
- bl->append("N", 1); // null dentry
- assert(dn->is_sync());
- continue;
- }
-
- if (dn->is_remote()) {
- // remote link
- bl->append("L", 1); // remote link
-
- inodeno_t ino = dn->get_remote_ino();
- bl->append((char*)&ino, sizeof(ino));
- continue;
- }
-
- // primary link
- // -- inode
- bl->append("I", 1); // inode dentry
-
- encode_export_inode(in, *bl, dentryhashcode); // encode, and (update state for) export
- msgs[dentryhashcode]->inc_nden();
-
- if (dn->is_dirty())
- dn->mark_clean();
-
- // add to proxy
- hash_proxy_inos[dir].push_back(in);
- in->state_set(CINODE_STATE_PROXY);
- in->get(CINODE_PIN_PROXY);
-
- // fix up subdirs
- if (in->dir) {
- if (in->dir->is_auth()) {
- // mine. make it into an import.
- dout(7) << "making subdir into import " << *in->dir << endl;
- in->dir->set_dir_auth( mds->get_nodeid() );
- imports.insert(in->dir);
- in->dir->get(CDIR_PIN_IMPORT);
- in->dir->state_set(CDIR_STATE_IMPORT);
-
- // fix nested bits
- for (set<CDir*>::iterator it = nested_exports[containing_import].begin();
- it != nested_exports[containing_import].end(); ) {
- CDir *ex = *it;
- it++;
- if (get_auth_container(ex) == in->dir) {
- dout(10) << "moving nested export " << *ex << endl;
- nested_exports[containing_import].erase(ex);
- nested_exports[in->dir].insert(ex);
- }
- }
- }
- else {
- // not mine.
- dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl;
- assert(in->dir->is_export());
- in->dir->put(CDIR_PIN_EXPORT);
- in->dir->state_clear(CDIR_STATE_EXPORT);
- exports.erase(in->dir);
- nested_exports[containing_import].erase(in->dir);
- if (in->dir->authority() == dentryhashcode)
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- else
- in->dir->set_dir_auth( in->dir->authority() );
- }
- }
-
- // waiters
- list<Context*> waiters;
- in->take_waiting(CINODE_WAIT_ANY, waiters);
- fin->take(waiters);
- }
-
- // dir state
- dir->state_set(CDIR_STATE_HASHED);
- dir->get(CDIR_PIN_HASHED);
- hashdirs.insert(dir);
- dir->mark_dirty();
- mds->mdlog->submit_entry(new EDirUpdate(dir));
-
- // inode state
- if (dir->inode->is_auth()) {
- dir->inode->mark_dirty();
- mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
- }
-
- // fix up nested_exports?
- if (containing_import != dir) {
- dout(7) << "moving nested exports under hashed dir" << endl;
- for (set<CDir*>::iterator it = nested_exports[containing_import].begin();
- it != nested_exports[containing_import].end(); ) {
- CDir *ex = *it;
- it++;
- if (get_auth_container(ex) == dir) {
- dout(7) << " moving nested export under hashed dir: " << *ex << endl;
- nested_exports[containing_import].erase(ex);
- nested_exports[dir].insert(ex);
- } else {
- dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl;
- }
- }
- }
-
- // send hash messages
- assert(hash_gather[dir].empty());
- assert(hash_notify_gather[dir].empty());
- assert(dir->hashed_subset.empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- // all nodes hashed locally..
- dir->hashed_subset.insert(i);
-
- if (i == mds->get_nodeid()) continue;
-
- // init hash_gather and hash_notify_gather sets
- hash_gather[dir].insert(i);
-
- assert(hash_notify_gather[dir][i].empty());
- for (int j=0; j<mds->get_mds_map()->get_num_mds(); j++) {
- if (j == mds->get_nodeid()) continue;
- if (j == i) continue;
- hash_notify_gather[dir][i].insert(j);
- }
-
- mds->send_message_mds(msgs[i], i, MDS_PORT_CACHE);
- }
-
- // wait for all the acks.
-}
-
-
-void MDCache::handle_hash_dir_ack(MHashDirAck *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- assert(dir->is_hashed());
- assert(dir->is_hashing());
-
- int from = MSG_ADDR_NUM(m->get_source());
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl;
-
- if (hash_notify_gather[dir].empty()) {
- dout(7) << "got notifies too, all done" << endl;
- hash_dir_finish(dir);
- } else {
- dout(7) << "waiting on notifies " << endl;
- }
-
- } else {
- dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
- }
-
- delete m;
-}
-
-
-void MDCache::hash_dir_finish(CDir *dir)
-{
- dout(7) << "hash_dir_finish finishing " << *dir << endl;
- assert(dir->is_hashed());
- assert(dir->is_hashing());
-
- // dir state
- hash_gather.erase(dir);
- dir->state_clear(CDIR_STATE_HASHING);
- dir->put(CDIR_PIN_HASHING);
- dir->hashed_subset.clear();
-
- // unproxy inodes
- // this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds.
- for (list<CInode*>::iterator it = hash_proxy_inos[dir].begin();
- it != hash_proxy_inos[dir].end();
- it++) {
- CInode *in = *it;
- assert(in->state_test(CINODE_STATE_PROXY));
- in->state_clear(CINODE_STATE_PROXY);
- in->put(CINODE_PIN_PROXY);
- }
- hash_proxy_inos.erase(dir);
-
- // unpin path
- vector<CDentry*> trace;
- make_trace(trace, dir->inode);
- path_unpin(trace, 0);
-
- // unfreeze
- dir->unfreeze_dir();
-
- show_imports();
- assert(hash_gather.count(dir) == 0);
-
- // stats
- //mds->logger->inc("nh", 1);
-
-}
-
-
-
-
-// HASH on auth and non-auth
-
-void MDCache::handle_hash_dir_notify(MHashDirNotify *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
- assert(dir->is_hashing());
-
- dout(5) << "handle_hash_dir_notify " << *dir << endl;
- int from = m->get_from();
-
- int source = MSG_ADDR_NUM(m->get_source());
- if (dir->is_auth()) {
- // gather notifies
- assert(dir->is_hashed());
-
- assert( hash_notify_gather[dir][from].count(source) );
- hash_notify_gather[dir][from].erase(source);
-
- if (hash_notify_gather[dir][from].empty()) {
- dout(7) << "last notify from " << from << endl;
- hash_notify_gather[dir].erase(from);
-
- if (hash_notify_gather[dir].empty()) {
- dout(7) << "last notify!" << endl;
- hash_notify_gather.erase(dir);
-
- if (hash_gather[dir].empty()) {
- dout(7) << "got acks too, all done" << endl;
- hash_dir_finish(dir);
- } else {
- dout(7) << "still waiting on acks from " << hash_gather[dir] << endl;
- }
- } else {
- dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl;
- }
- } else {
- dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl;
- }
-
- // delete msg
- delete m;
- } else {
- // update dir hashed_subset
- assert(dir->hashed_subset.count(from) == 0);
- dir->hashed_subset.insert(from);
-
- // update open subdirs
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->get_inode();
- if (!in) continue;
- if (!in->dir) continue;
-
- int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode != from) continue; // we'll import these in a minute
-
- if (in->dir->authority() != dentryhashcode)
- in->dir->set_dir_auth( in->dir->authority() );
- else
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
-
- // remove from notify gather set
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
-
- // last notify?
- if (hash_gather[dir].empty()) {
- dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl;
- hash_gather.erase(dir);
-
- dir->state_clear(CDIR_STATE_HASHING);
- dir->put(CDIR_PIN_HASHING);
- dir->hashed_subset.clear();
- } else {
- dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
- }
-
- // fw notify to auth
- mds->send_message_mds(m, dir->authority(), MDS_PORT_CACHE);
- }
-}
-
-
-
-
-// HASH on non-auth
-
-/*
- * discover step:
- * each peer needs to open up the directory and pin it before we start
- */
-class C_MDC_HashDirDiscover : public Context {
- MDCache *mdc;
- MHashDirDiscover *m;
-public:
- vector<CDentry*> trace;
- C_MDC_HashDirDiscover(MDCache *mdc, MHashDirDiscover *m) {
- this->mdc = mdc;
- this->m = m;
- }
- void finish(int r) {
- CInode *in = 0;
- if (r >= 0) {
- if (trace.size())
- in = trace[trace.size()-1]->get_inode();
- else
- in = mdc->get_root();
- }
- mdc->handle_hash_dir_discover_2(m, in, r);
- }
-};
-
-void MDCache::handle_hash_dir_discover(MHashDirDiscover *m)
-{
- assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid());
-
- dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl;
-
- // must discover it!
- C_MDC_HashDirDiscover *onfinish = new C_MDC_HashDirDiscover(this, m);
- filepath fpath(m->get_path());
- path_traverse(fpath, onfinish->trace, true,
- m, new C_MDS_RetryMessage(mds,m), // on delay/retry
- MDS_TRAVERSE_DISCOVER,
- onfinish); // on completion|error
-}
-
-void MDCache::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r)
-{
- // yay!
- if (in) {
- dout(7) << "handle_hash_dir_discover_2 has " << *in << endl;
- }
-
- if (r < 0 || !in->is_dir()) {
- dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
- assert(0); // this shouldn't happen if the auth pins his path properly!!!!
- }
- assert(in->is_dir());
-
- // is dir open?
- if (!in->dir) {
- dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl;
- open_remote_dir(in,
- new C_MDS_RetryMessage(mds, m));
- return;
- }
- CDir *dir = in->dir;
-
- // pin dir, set hashing flag
- dir->state_set(CDIR_STATE_HASHING);
- dir->get(CDIR_PIN_HASHING);
- assert(dir->hashed_subset.empty());
-
- // inode state
- dir->inode->inode.hash_seed = 1;// dir->ino();
- if (dir->inode->is_auth()) {
- dir->inode->mark_dirty();
- mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
- }
-
- // get gather set ready for notifies
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- if (i == dir->authority()) continue;
- hash_gather[dir].insert(i);
- }
-
- // reply
- dout(7) << " sending hash_dir_discover_ack on " << *dir << endl;
- mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()),
- m->get_source().num(), MDS_PORT_CACHE);
- delete m;
-}
-
-/*
- * prep step:
- * peers need to open up all subdirs of the hashed dir
- */
-
-void MDCache::handle_hash_dir_prep(MHashDirPrep *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_hash_dir_prep " << *dir << endl;
-
- if (!m->did_assim()) {
- m->mark_assim(); // only do this the first time!
-
- // assimilate dentry+inodes for exports
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = get_inode( it->second->get_ino() );
- if (in) {
- it->second->update_inode(in);
- dout(5) << " updated " << *in << endl;
- } else {
- in = new CInode(false);
- it->second->update_inode(in);
- add_inode(in);
-
- // link
- dir->add_dentry( it->first, in );
- dout(5) << " added " << *in << endl;
- }
-
- // open!
- if (!in->dir) {
- dout(5) << " opening nested export on " << *in << endl;
- open_remote_dir(in,
- new C_MDS_RetryMessage(mds, m));
- }
- }
- }
-
- // verify!
- int waiting_for = 0;
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = get_inode( it->second->get_ino() );
- assert(in);
-
- if (in->dir) {
- if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
- dout(5) << " pinning nested export " << *in->dir << endl;
- in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
- in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
- } else {
- dout(5) << " already pinned nested export " << *in << endl;
- }
- } else {
- dout(5) << " waiting for nested export dir on " << *in << endl;
- waiting_for++;
- }
- }
-
- if (waiting_for) {
- dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
- return;
- }
-
- // ack!
- mds->send_message_mds(new MHashDirPrepAck(dir->ino()),
- m->get_source().num(), MDS_PORT_CACHE);
-
- // done.
- delete m;
-}
-
-
-/*
- * hash step:
- */
-
-void MDCache::handle_hash_dir(MHashDir *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
- assert(!dir->is_auth());
- assert(!dir->is_hashed());
- assert(dir->is_hashing());
-
- dout(5) << "handle_hash_dir " << *dir << endl;
- int oldauth = MSG_ADDR_NUM(m->get_source());
-
- // content
- import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth);
-
- // dir state
- dir->state_set(CDIR_STATE_HASHED);
- dir->get(CDIR_PIN_HASHED);
- hashdirs.insert(dir);
- dir->hashed_subset.insert(mds->get_nodeid());
-
- // dir is complete
- dir->mark_complete();
- dir->mark_dirty();
- mds->mdlog->submit_entry(new EDirUpdate(dir));
-
- // commit
- mds->mdstore->commit_dir(dir, 0);
-
- // send notifies
- dout(7) << "sending notifies" << endl;
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- if (i == MSG_ADDR_NUM(m->get_source())) continue;
- mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()),
- i, MDS_PORT_CACHE);
- }
-
- // ack
- dout(7) << "acking" << endl;
- mds->send_message_mds(new MHashDirAck(dir->ino()),
- m->get_source().num(), MDS_PORT_CACHE);
-
- // done.
- delete m;
-
- show_imports();
-}
-
-
-
-
-
-// UNHASH on auth
-
-class C_MDC_UnhashFreeze : public Context {
-public:
- MDS *mds;
- CDir *dir;
- C_MDC_UnhashFreeze(MDS *mds, CDir *dir) {
- this->mds = mds;
- this->dir = dir;
- }
- virtual void finish(int r) {
- mds->mdcache->unhash_dir_frozen(dir);
- }
-};
-
-class C_MDC_UnhashComplete : public Context {
-public:
- MDS *mds;
- CDir *dir;
- C_MDC_UnhashComplete(MDS *mds, CDir *dir) {
- this->mds = mds;
- this->dir = dir;
- }
- virtual void finish(int r) {
- mds->mdcache->unhash_dir_complete(dir);
- }
-};
-
-
-void MDCache::unhash_dir(CDir *dir)
-{
- dout(-7) << "unhash_dir " << *dir << endl;
-
- assert(dir->is_hashed());
- assert(!dir->is_unhashing());
- assert(dir->is_auth());
- assert(hash_gather.count(dir)==0);
-
- // pin path?
- vector<CDentry*> trace;
- make_trace(trace, dir->inode);
- if (!path_pin(trace, 0, 0)) {
- dout(7) << "unhash_dir couldn't pin path, failing." << endl;
- return;
- }
-
- // twiddle state
- dir->state_set(CDIR_STATE_UNHASHING);
-
- // first, freeze the dir.
- dir->freeze_dir(new C_MDC_UnhashFreeze(mds, dir));
-
- // make complete
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
- mds->mdstore->fetch_dir(dir,
- new C_MDC_UnhashComplete(mds, dir));
- } else
- unhash_dir_complete(dir);
-
-}
-
-void MDCache::unhash_dir_frozen(CDir *dir)
-{
- dout(7) << "unhash_dir_frozen " << *dir << endl;
-
- assert(dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
-
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl;
- } else
- unhash_dir_prep(dir);
-}
-
-
-/*
- * ask peers to freeze and complete hashed dir
- */
-void MDCache::unhash_dir_prep(CDir *dir)
-{
- dout(7) << "unhash_dir_prep " << *dir << endl;
- assert(dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
- assert(dir->is_complete());
-
- if (!hash_gather[dir].empty()) return; // already been here..freeze must have been instantaneous
-
- // send unhash prep to all peers
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- hash_gather[dir].insert(i);
- mds->send_message_mds(new MUnhashDirPrep(dir->ino()),
- i, MDS_PORT_CACHE);
- }
-}
-
-/*
- * wait for peers to freeze and complete hashed dirs
- */
-void MDCache::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- int from = MSG_ADDR_NUM(m->get_source());
- dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl;
-
- if (!m->did_assim()) {
- m->mark_assim(); // only do this the first time!
-
- // assimilate dentry+inodes for exports
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = get_inode( it->second->get_ino() );
- if (in) {
- it->second->update_inode(in);
- dout(5) << " updated " << *in << endl;
- } else {
- in = new CInode(false);
- it->second->update_inode(in);
- add_inode(in);
-
- // link
- dir->add_dentry( it->first, in );
- dout(5) << " added " << *in << endl;
- }
-
- // open!
- if (!in->dir) {
- dout(5) << " opening nested export on " << *in << endl;
- open_remote_dir(in,
- new C_MDS_RetryMessage(mds, m));
- }
- }
- }
-
- // verify!
- int waiting_for = 0;
- for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
- it != m->get_inodes().end();
- it++) {
- CInode *in = get_inode( it->second->get_ino() );
- assert(in);
-
- if (in->dir) {
- if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
- dout(5) << " pinning nested export " << *in->dir << endl;
- in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
- in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
- } else {
- dout(5) << " already pinned nested export " << *in << endl;
- }
- } else {
- dout(5) << " waiting for nested export dir on " << *in << endl;
- waiting_for++;
- }
- }
-
- if (waiting_for) {
- dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
- return;
- }
-
- // ok, done with this PrepAck
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
-
- if (hash_gather[dir].empty()) {
- hash_gather.erase(dir);
- dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl;
- unhash_dir_go(dir);
- } else {
- dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
- }
-
- delete m;
-}
-
-
-/*
- * auth:
- * send out MHashDir's to peers
- */
-void MDCache::unhash_dir_go(CDir *dir)
-{
- dout(7) << "unhash_dir_go " << *dir << endl;
- assert(dir->is_hashed());
- assert(dir->is_auth());
- assert(dir->is_frozen_dir());
- assert(dir->is_complete());
-
- // send unhash prep to all peers
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
- hash_gather[dir].insert(i);
- mds->send_message_mds(new MUnhashDir(dir->ino()),
- i, MDS_PORT_CACHE);
- }
-}
-
-/*
- * auth:
- * assimilate unhashing content
- */
-void MDCache::handle_unhash_dir_ack(MUnhashDirAck *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_ack " << *dir << endl;
- assert(dir->is_hashed());
-
- // assimilate content
- int from = MSG_ADDR_NUM(m->get_source());
- import_hashed_content(dir, m->get_state(), m->get_nden(), from);
- delete m;
-
- // done?
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
-
- if (!hash_gather[dir].empty()) {
- dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl;
- return;
- }
-
- // done!
-
- // fix up nested_exports
- CDir *containing_import = get_auth_container(dir);
- if (containing_import != dir) {
- for (set<CDir*>::iterator it = nested_exports[dir].begin();
- it != nested_exports[dir].end();
- it++) {
- dout(7) << "moving nested export out from under hashed dir : " << **it << endl;
- nested_exports[containing_import].insert(*it);
- }
- nested_exports.erase(dir);
- }
-
- // dir state
- //dir->state_clear(CDIR_STATE_UNHASHING); //later
- dir->state_clear(CDIR_STATE_HASHED);
- dir->put(CDIR_PIN_HASHED);
- hashdirs.erase(dir);
-
- // commit!
- assert(dir->is_complete());
- //dir->mark_complete();
- dir->mark_dirty();
- mds->mdstore->commit_dir(dir, 0);
-
- // inode state
- dir->inode->inode.hash_seed = 0;
- if (dir->inode->is_auth()) {
- dir->inode->mark_dirty();
- mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
- }
-
- // notify
- assert(hash_gather[dir].empty());
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == mds->get_nodeid()) continue;
-
- hash_gather[dir].insert(i);
-
- mds->send_message_mds(new MUnhashDirNotify(dir->ino()),
- i, MDS_PORT_CACHE);
- }
-}
-
-
-/*
- * sent by peer to flush mds links. unfreeze when all gathered.
- */
-void MDCache::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_ack " << *dir << endl;
- assert(!dir->is_hashed());
- assert(dir->is_unhashing());
- assert(dir->is_frozen_dir());
-
- // done?
- int from = MSG_ADDR_NUM(m->get_source());
- assert(hash_gather[dir].count(from));
- hash_gather[dir].erase(from);
- delete m;
-
- if (!hash_gather[dir].empty()) {
- dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl;
- } else {
- unhash_dir_finish(dir);
- }
-}
-
-
-/*
- * all mds links are flushed. unfreeze dir!
- */
-void MDCache::unhash_dir_finish(CDir *dir)
-{
- dout(7) << "unhash_dir_finish " << *dir << endl;
- hash_gather.erase(dir);
-
- // unpin path
- vector<CDentry*> trace;
- make_trace(trace, dir->inode);
- path_unpin(trace, 0);
-
- // state
- dir->state_clear(CDIR_STATE_UNHASHING);
-
- // unfreeze
- dir->unfreeze_dir();
-
-}
-
-
-
-// UNHASH on all
-
-/*
- * hashed dir is complete.
- * mark all migrating inodes dirty (to pin in cache)
- * if frozen too, then go to next step (depending on auth)
- */
-void MDCache::unhash_dir_complete(CDir *dir)
-{
- dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl;
-
- assert(dir->is_hashed());
- assert(dir->is_complete());
-
- // mark dirty to pin in cache
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CInode *in = it->second->inode;
- if (in->is_auth()) {
- in->mark_dirty();
- mds->mdlog->submit_entry(new EInodeUpdate(in));
- }
- }
-
- if (!dir->is_frozen_dir()) {
- dout(7) << "dir complete but !frozen, waiting " << *dir << endl;
- } else {
- if (dir->is_auth())
- unhash_dir_prep(dir); // auth
- else
- unhash_dir_prep_finish(dir); // nonauth
- }
-}
-
-
-// UNHASH on non-auth
-
-class C_MDC_UnhashPrepFreeze : public Context {
-public:
- MDS *mds;
- CDir *dir;
- C_MDC_UnhashPrepFreeze(MDS *mds, CDir *dir) {
- this->mds = mds;
- this->dir = dir;
- }
- virtual void finish(int r) {
- mds->mdcache->unhash_dir_prep_frozen(dir);
- }
-};
-
-
-/*
- * peers need to freeze their dir and make them complete
- */
-void MDCache::handle_unhash_dir_prep(MUnhashDirPrep *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_prep " << *dir << endl;
- assert(dir->is_hashed());
-
- // freeze
- dir->freeze_dir(new C_MDC_UnhashPrepFreeze(mds, dir));
-
- // make complete
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
- mds->mdstore->fetch_dir(dir,
- new C_MDC_UnhashComplete(mds, dir));
- } else {
- unhash_dir_complete(dir);
- }
-
- delete m;
-}
-
-/*
- * peer has hashed dir frozen.
- * complete too?
- */
-void MDCache::unhash_dir_prep_frozen(CDir *dir)
-{
- dout(7) << "unhash_dir_prep_frozen " << *dir << endl;
-
- assert(dir->is_hashed());
- assert(dir->is_frozen_dir());
- assert(!dir->is_auth());
-
- if (!dir->is_complete()) {
- dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl;
- } else
- unhash_dir_prep_finish(dir);
-}
-
-/*
- * peer has hashed dir complete and frozen. ack.
- */
-void MDCache::unhash_dir_prep_finish(CDir *dir)
-{
- dout(7) << "unhash_dir_prep_finish " << *dir << endl;
- assert(dir->is_hashed());
- assert(!dir->is_auth());
- assert(dir->is_frozen());
- assert(dir->is_complete());
-
- // twiddle state
- if (dir->is_unhashing())
- return; // already replied.
- dir->state_set(CDIR_STATE_UNHASHING);
-
- // send subdirs back to auth
- MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino());
- int auth = dir->authority();
-
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- if (!in->is_dir()) continue;
- if (!in->dir) continue;
-
- int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode != mds->get_nodeid()) continue;
-
- // msg?
- ack->add_inode(it->first, in->replicate_to(auth));
- }
-
- // ack
- mds->send_message_mds(ack, auth, MDS_PORT_CACHE);
-}
-
-
-
-/*
- * peer needs to send hashed dir content back to auth.
- * unhash dir.
- */
-void MDCache::handle_unhash_dir(MUnhashDir *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir " << *dir << endl;//" .. hash_seed is " << dir->inode->inode.hash_seed << endl;
- assert(dir->is_hashed());
- assert(dir->is_unhashing());
- assert(!dir->is_auth());
-
- // get message ready
- bufferlist bl;
- int nden = 0;
-
- // suck up all waiters
- C_Contexts *fin = new C_Contexts;
- list<Context*> waiting;
- dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters
- fin->take(waiting);
-
- // divy up contents
- for (CDir_map_t::iterator it = dir->begin();
- it != dir->end();
- it++) {
- CDentry *dn = it->second;
- CInode *in = dn->inode;
-
- int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
- if (dentryhashcode != mds->get_nodeid()) {
- // not mine!
- // twiddle dir_auth?
- if (in->dir) {
- if (in->dir->authority() != dir->authority())
- in->dir->set_dir_auth( in->dir->authority() );
- else
- in->dir->set_dir_auth( CDIR_AUTH_PARENT );
- }
- continue;
- }
-
- // -- dentry
- dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
- _encode(it->first, bl);
-
- // null dentry?
- if (dn->is_null()) {
- bl.append("N", 1); // null dentry
- assert(dn->is_sync());
- continue;
- }
-
- if (dn->is_remote()) {
- // remote link
- bl.append("L", 1); // remote link
-
- inodeno_t ino = dn->get_remote_ino();
- bl.append((char*)&ino, sizeof(ino));
- continue;
- }
-
- // primary link
- // -- inode
- bl.append("I", 1); // inode dentry
-
- encode_export_inode(in, bl, dentryhashcode); // encode, and (update state for) export
- nden++;
-
- if (dn->is_dirty())
- dn->mark_clean();
-
- // proxy
- in->state_set(CINODE_STATE_PROXY);
- in->get(CINODE_PIN_PROXY);
- hash_proxy_inos[dir].push_back(in);
-
- if (in->dir) {
- if (in->dir->is_auth()) {
- // mine. make it into an import.
- dout(7) << "making subdir into import " << *in->dir << endl;
- in->dir->set_dir_auth( mds->get_nodeid() );
- imports.insert(in->dir);
- in->dir->get(CDIR_PIN_IMPORT);
- in->dir->state_set(CDIR_STATE_IMPORT);
- }
- else {
- // not mine.
- dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl;
- assert(in->dir->is_export());
- in->dir->put(CDIR_PIN_EXPORT);
- in->dir->state_clear(CDIR_STATE_EXPORT);
- exports.erase(in->dir);
- nested_exports[dir].erase(in->dir);
- }
- }
-
- // waiters
- list<Context*> waiters;
- in->take_waiting(CINODE_WAIT_ANY, waiters);
- fin->take(waiters);
- }
-
- // we should have no nested exports; we're not auth for the dir!
- assert(nested_exports[dir].empty());
- nested_exports.erase(dir);
-
- // dir state
- //dir->state_clear(CDIR_STATE_UNHASHING); // later
- dir->state_clear(CDIR_STATE_HASHED);
- dir->put(CDIR_PIN_HASHED);
- hashdirs.erase(dir);
- dir->mark_clean();
-
- // inode state
- dir->inode->inode.hash_seed = 0;
- if (dir->inode->is_auth()) {
- dir->inode->mark_dirty();
- mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
- }
-
- // init gather set
- hash_gather[dir] = mds->get_mds_map()->get_mds();
- hash_gather[dir].erase(mds->get_nodeid());
-
- // send unhash message
- mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden),
- dir->authority(), MDS_PORT_CACHE);
-}
-
-
-/*
- * first notify comes from auth.
- * send notifies to all other peers, with peer = self
- * if we get notify from peer=other, remove from our gather list.
- * when we've gotten notifies from everyone,
- * unpin proxies,
- * send notify_ack to auth.
- * this ensures that all mds links are flushed of cache_expire type messages.
- */
-void MDCache::handle_unhash_dir_notify(MUnhashDirNotify *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- CDir *dir = in->dir;
- assert(dir);
-
- dout(7) << "handle_unhash_dir_finish " << *dir << endl;
- assert(!dir->is_hashed());
- assert(dir->is_unhashing());
- assert(!dir->is_auth());
-
- int from = MSG_ADDR_NUM(m->get_source());
- assert(hash_gather[dir].count(from) == 1);
- hash_gather[dir].erase(from);
- delete m;
-
- // did we send our shout out?
- if (from == dir->authority()) {
- // send notify to everyone else in weird chatter storm
- for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
- if (i == from) continue;
- if (i == mds->get_nodeid()) continue;
- mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_CACHE);
- }
- }
-
- // are we done?
- if (!hash_gather[dir].empty()) {
- dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
- return;
- }
- hash_gather.erase(dir);
-
- // all done!
- dout(7) << "all mds links flushed, unpinning unhash proxies" << endl;
-
- // unpin proxies
- for (list<CInode*>::iterator it = hash_proxy_inos[dir].begin();
- it != hash_proxy_inos[dir].end();
- it++) {
- CInode *in = *it;
- assert(in->state_test(CINODE_STATE_PROXY));
- in->state_clear(CINODE_STATE_PROXY);
- in->put(CINODE_PIN_PROXY);
- }
-
- // unfreeze
- dir->unfreeze_dir();
-
- // ack
- dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl;
- mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_CACHE);
-
-}
class MDS;
+class Migrator;
+
class Message;
-class MExportDirDiscover;
-class MExportDirDiscoverAck;
-class MExportDirPrep;
-class MExportDirPrepAck;
-class MExportDirWarning;
-class MExportDir;
-class MExportDirNotify;
-class MExportDirNotifyAck;
-class MExportDirFinish;
+
class MDiscover;
class MDiscoverReply;
//class MInodeUpdate;
class MClientRequest;
-class MHashDirDiscover;
-class MHashDirDiscoverAck;
-class MHashDirPrep;
-class MHashDirPrepAck;
-class MHashDir;
-class MHashDirAck;
-class MHashDirNotify;
-
-class MUnhashDirPrep;
-class MUnhashDirPrepAck;
-class MUnhashDir;
-class MUnhashDirAck;
-class MUnhashDirNotify;
-class MUnhashDirNotifyAck;
-
// MDCache
set<CDir*> hashdirs;
map<CDir*,set<CDir*> > nested_exports; // exports nested under imports _or_ hashdirs
- // export fun
- map<CDir*, set<int> > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from
- map<CDir*, list<inodeno_t> > export_proxy_inos;
- map<CDir*, list<inodeno_t> > export_proxy_dirinos;
-
- set<inodeno_t> stray_export_warnings; // notifies i haven't seen
- map<inodeno_t, MExportDirNotify*> stray_export_notifies;
-
// rename fun
set<inodeno_t> stray_rename_warnings; // notifies i haven't seen
map<inodeno_t, MRenameNotify*> stray_rename_notifies;
- // hashing madness
- multimap<CDir*, int> unhash_waiting; // nodes i am waiting for UnhashDirAck's from
- multimap<inodeno_t, inodeno_t> import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir
- // maps frozen_dir_ino's to waiting-for-discover ino's.
- multimap<inodeno_t, inodeno_t> import_hashed_frozen_waiting; // dirs i froze (for the above)
- // maps import_root_ino's to frozen dir ino's (with pending discovers)
-
public:
+ Migrator *migrator;
+
// active MDS requests
//map<Message*, active_request_t> active_requests;
hash_map<Message*, active_request_t> active_requests;
int shutdown_commits;
bool did_shutdown_exports;
-
+ friend class Migrator;
friend class MDBalancer;
public:
return NULL;
}
- protected:
- CDir *get_auth_container(CDir *in);
- void find_nested_exports(CDir *dir, set<CDir*>& s);
- void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
-
// adding/removing
public:
}
public:
- void export_empty_import(CDir *dir);
protected:
+ // private methods
+ CDir *get_auth_container(CDir *in);
+ void find_nested_exports(CDir *dir, set<CDir*>& s);
+ void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
+
+
void rename_file(CDentry *srcdn, CDentry *destdn);
void fix_renamed_dir(CDir *srcdir,
CInode *in,
void do_dir_proxy(CDir *dir, Message *m);
- // -- import/export --
- // exporter
- public:
- void export_dir(CDir *dir,
- int mds);
- protected:
- map< CDir*, set<int> > export_gather;
- void handle_export_dir_discover_ack(MExportDirDiscoverAck *m);
- void export_dir_frozen(CDir *dir, int dest);
- void handle_export_dir_prep_ack(MExportDirPrepAck *m);
- void export_dir_go(CDir *dir,
- int dest);
- int export_dir_walk(MExportDir *req,
- class C_Contexts *fin,
- CDir *basedir,
- CDir *dir,
- int newauth);
- void export_dir_finish(CDir *dir);
- void handle_export_dir_notify_ack(MExportDirNotifyAck *m);
-
- void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth);
-
- friend class C_MDC_ExportFreeze;
-
- // importer
- void handle_export_dir_discover(MExportDirDiscover *m);
- void handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r);
- void handle_export_dir_prep(MExportDirPrep *m);
- void handle_export_dir(MExportDir *m);
- void import_dir_finish(CDir *dir);
- void handle_export_dir_finish(MExportDirFinish *m);
- int import_dir_block(bufferlist& bl,
- int& off,
- int oldauth,
- CDir *import_root,
- list<inodeno_t>& imported_subdirs);
- void got_hashed_replica(CDir *import,
- inodeno_t dir_ino,
- inodeno_t replica_ino);
-
- void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth);
-
- friend class C_MDC_ExportDirDiscover;
- // bystander
- void handle_export_dir_warning(MExportDirWarning *m);
- void handle_export_dir_notify(MExportDirNotify *m);
-
-
- // -- hashed directories --
-
- // HASH
- public:
- void hash_dir(CDir *dir); // on auth
- protected:
- map< CDir*, set<int> > hash_gather;
- map< CDir*, map< int, set<int> > > hash_notify_gather;
- map< CDir*, list<CInode*> > hash_proxy_inos;
-
- // hash on auth
- void handle_hash_dir_discover_ack(MHashDirDiscoverAck *m);
- void hash_dir_complete(CDir *dir);
- void hash_dir_frozen(CDir *dir);
- void handle_hash_dir_prep_ack(MHashDirPrepAck *m);
- void hash_dir_go(CDir *dir);
- void handle_hash_dir_ack(MHashDirAck *m);
- void hash_dir_finish(CDir *dir);
- friend class C_MDC_HashFreeze;
- friend class C_MDC_HashComplete;
-
- // auth and non-auth
- void handle_hash_dir_notify(MHashDirNotify *m);
-
- // hash on non-auth
- void handle_hash_dir_discover(MHashDirDiscover *m);
- void handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r);
- void handle_hash_dir_prep(MHashDirPrep *m);
- void handle_hash_dir(MHashDir *m);
- friend class C_MDC_HashDirDiscover;
-
- // UNHASH
- public:
- void unhash_dir(CDir *dir); // on auth
- protected:
- map< CDir*, list<MUnhashDirAck*> > unhash_content;
- void import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth);
-
- // unhash on auth
- void unhash_dir_frozen(CDir *dir);
- void unhash_dir_prep(CDir *dir);
- void handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m);
- void unhash_dir_go(CDir *dir);
- void handle_unhash_dir_ack(MUnhashDirAck *m);
- void handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m);
- void unhash_dir_finish(CDir *dir);
- friend class C_MDC_UnhashFreeze;
- friend class C_MDC_UnhashComplete;
-
- // unhash on all
- void unhash_dir_complete(CDir *dir);
-
- // unhash on non-auth
- void handle_unhash_dir_prep(MUnhashDirPrep *m);
- void unhash_dir_prep_frozen(CDir *dir);
- void unhash_dir_prep_finish(CDir *dir);
- void handle_unhash_dir(MUnhashDir *m);
- void handle_unhash_dir_notify(MUnhashDirNotify *m);
- friend class C_MDC_UnhashPrepFreeze;
// -- updates --
//int send_inode_updates(CInode *in);
#include "MDLog.h"
#include "MDBalancer.h"
#include "IdAllocator.h"
+#include "Migrator.h"
#include "AnchorTable.h"
#include "AnchorClient.h"
mdcache->proc_message(m);
break;
+ case MDS_PORT_MIGRATOR:
+ mdcache->migrator->dispatch(m);
+ break;
+
/*
case MSG_PORT_MDLOG:
mymds->logger->proc_message(m);
!(mdcache->get_root()->dir->is_hashed() ||
mdcache->get_root()->dir->is_hashing())) {
dout(0) << "hashing root" << endl;
- mdcache->hash_dir(mdcache->get_root()->dir);
+ mdcache->migrator->hash_dir(mdcache->get_root()->dir);
}
CInode *in = mdcache->get_inode(100000010);
if (in && in->dir) {
if (in->dir->is_auth())
- mdcache->hash_dir(in->dir);
+ mdcache->migrator->hash_dir(in->dir);
didhash[whoami] = 1;
}
}
CInode *in = mdcache->get_inode(100000010);
if (in && in->dir) {
if (in->dir->is_auth() && in->dir->is_hashed())
- mdcache->unhash_dir(in->dir);
+ mdcache->migrator->unhash_dir(in->dir);
didhash[whoami] = 2;
}
}
CDir *dir = in->get_or_open_dir(this);
if (dir->is_auth()) {
dout(1) << "FORCING EXPORT" << endl;
- mdcache->export_dir(dir,1);
+ mdcache->migrator->export_dir(dir,1);
didit = true;
}
}
if (!dir->is_hashed() &&
!dir->is_hashing() &&
dir->is_auth())
- mdcache->hash_dir(dir);
+ mdcache->migrator->hash_dir(dir);
}
// </HACK>
dout(7) << "handle_client_rmdir dir is auth, but not inode." << endl;
if (!in->dir->is_freezing() && in->dir->is_frozen()) {
assert(in->dir->is_import());
- mdcache->export_empty_import(in->dir);
+ mdcache->migrator->export_empty_import(in->dir);
} else {
dout(7) << "apparently already exporting" << endl;
}
int dest = rand() % mdsmap->get_num_mds();
if (dest != whoami) {
dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << endl;
- mdcache->export_dir(newdir, dest);
+ mdcache->migrator->export_dir(newdir, dest);
}
}
#define MDS_PORT_CACHE 101
#define MDS_PORT_STORE 102
#define MDS_PORT_BALANCER 103
+#define MDS_PORT_MIGRATOR 104
#define MDS_PORT_ANCHORCLIENT 200
#define MDS_PORT_ANCHORMGR 201
#define __MDSTORE_H
#include "include/types.h"
-#include "include/bufferlist.h"
+#include "include/buffer.h"
class MDS;
class CDir;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "MDS.h"
+#include "MDCache.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "Migrator.h"
+
+#include "MDBalancer.h"
+#include "MDLog.h"
+#include "MDSMap.h"
+
+#include "include/filepath.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MClientFileCaps.h"
+
+#include "messages/MExportDirDiscover.h"
+#include "messages/MExportDirDiscoverAck.h"
+#include "messages/MExportDirPrep.h"
+#include "messages/MExportDirPrepAck.h"
+#include "messages/MExportDirWarning.h"
+#include "messages/MExportDir.h"
+#include "messages/MExportDirNotify.h"
+#include "messages/MExportDirNotifyAck.h"
+#include "messages/MExportDirFinish.h"
+
+#include "messages/MHashDirDiscover.h"
+#include "messages/MHashDirDiscoverAck.h"
+#include "messages/MHashDirPrep.h"
+#include "messages/MHashDirPrepAck.h"
+#include "messages/MHashDir.h"
+#include "messages/MHashDirNotify.h"
+#include "messages/MHashDirAck.h"
+
+#include "messages/MUnhashDirPrep.h"
+#include "messages/MUnhashDirPrepAck.h"
+#include "messages/MUnhashDir.h"
+#include "messages/MUnhashDirAck.h"
+#include "messages/MUnhashDirNotify.h"
+#include "messages/MUnhashDirNotifyAck.h"
+
+
+
+/*
+ * dispatch(m)
+ * hand an incoming message to the handle_*() method that matches its type.
+ * a type that is not listed here trips assert(0).
+ */
+void Migrator::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+
+  // ---- import ----
+  case MSG_MDS_EXPORTDIRDISCOVER:
+    handle_export_dir_discover(static_cast<MExportDirDiscover*>(m));
+    break;
+  case MSG_MDS_EXPORTDIRPREP:
+    handle_export_dir_prep(static_cast<MExportDirPrep*>(m));
+    break;
+  case MSG_MDS_EXPORTDIR:
+    handle_export_dir(static_cast<MExportDir*>(m));
+    break;
+  case MSG_MDS_EXPORTDIRFINISH:
+    handle_export_dir_finish(static_cast<MExportDirFinish*>(m));
+    break;
+
+  // ---- export ----
+  case MSG_MDS_EXPORTDIRDISCOVERACK:
+    handle_export_dir_discover_ack(static_cast<MExportDirDiscoverAck*>(m));
+    break;
+  case MSG_MDS_EXPORTDIRPREPACK:
+    handle_export_dir_prep_ack(static_cast<MExportDirPrepAck*>(m));
+    break;
+  case MSG_MDS_EXPORTDIRNOTIFYACK:
+    handle_export_dir_notify_ack(static_cast<MExportDirNotifyAck*>(m));
+    break;
+
+  // ---- export, seen by a 3rd party (inode authority) ----
+  case MSG_MDS_EXPORTDIRWARNING:
+    handle_export_dir_warning(static_cast<MExportDirWarning*>(m));
+    break;
+  case MSG_MDS_EXPORTDIRNOTIFY:
+    handle_export_dir_notify(static_cast<MExportDirNotify*>(m));
+    break;
+
+  // ---- hashing ----
+  case MSG_MDS_HASHDIRDISCOVER:
+    handle_hash_dir_discover(static_cast<MHashDirDiscover*>(m));
+    break;
+  case MSG_MDS_HASHDIRDISCOVERACK:
+    handle_hash_dir_discover_ack(static_cast<MHashDirDiscoverAck*>(m));
+    break;
+  case MSG_MDS_HASHDIRPREP:
+    handle_hash_dir_prep(static_cast<MHashDirPrep*>(m));
+    break;
+  case MSG_MDS_HASHDIRPREPACK:
+    handle_hash_dir_prep_ack(static_cast<MHashDirPrepAck*>(m));
+    break;
+  case MSG_MDS_HASHDIR:
+    handle_hash_dir(static_cast<MHashDir*>(m));
+    break;
+  case MSG_MDS_HASHDIRACK:
+    handle_hash_dir_ack(static_cast<MHashDirAck*>(m));
+    break;
+  case MSG_MDS_HASHDIRNOTIFY:
+    handle_hash_dir_notify(static_cast<MHashDirNotify*>(m));
+    break;
+
+  // ---- unhashing ----
+  case MSG_MDS_UNHASHDIRPREP:
+    handle_unhash_dir_prep(static_cast<MUnhashDirPrep*>(m));
+    break;
+  case MSG_MDS_UNHASHDIRPREPACK:
+    handle_unhash_dir_prep_ack(static_cast<MUnhashDirPrepAck*>(m));
+    break;
+  case MSG_MDS_UNHASHDIR:
+    handle_unhash_dir(static_cast<MUnhashDir*>(m));
+    break;
+  case MSG_MDS_UNHASHDIRACK:
+    handle_unhash_dir_ack(static_cast<MUnhashDirAck*>(m));
+    break;
+  case MSG_MDS_UNHASHDIRNOTIFY:
+    handle_unhash_dir_notify(static_cast<MUnhashDirNotify*>(m));
+    break;
+  case MSG_MDS_UNHASHDIRNOTIFYACK:
+    handle_unhash_dir_notify_ack(static_cast<MUnhashDirNotifyAck*>(m));
+    break;
+
+  default:
+    assert(0);   // not a migrator message
+  }
+}
+
+
+// Context that calls Migrator::export_empty_import() again on the given dir.
+// export_empty_import() hands one of these to mdstore->fetch_dir().
+class C_MDC_EmptyImport : public Context {
+  Migrator *migrator;
+  CDir *target;
+public:
+  C_MDC_EmptyImport(Migrator *m, CDir *d)
+    : migrator(m), target(d) {}
+  void finish(int r) {
+    // r is not looked at
+    migrator->export_empty_import(target);
+  }
+};
+
+/*
+ * export_empty_import(dir)
+ * meant to hand an empty import back to the authority of its inode.
+ * NOTE: currently disabled -- the unconditional return right after the
+ * first dout means none of the checks or the export_dir() call below run.
+ *
+ * as written below the return, the steps would be: bail out if dir is no
+ * longer an import, is the root, or has entries; fetch the dir (retrying
+ * through C_MDC_EmptyImport) if it is not complete; otherwise export it to
+ * dir->inode->authority().
+ */
+void Migrator::export_empty_import(CDir *dir)
+{
+ dout(7) << "export_empty_import " << *dir << endl;
+
+ return; // hack fixme -- everything below is unreachable while this stays
+
+ if (!dir->is_import()) {
+ dout(7) << "not import (anymore?)" << endl;
+ return;
+ }
+ if (dir->inode->is_root()) {
+ dout(7) << "root" << endl;
+ return;
+ }
+
+ if (dir->get_size() > 0) {
+ dout(7) << "not actually empty" << endl;
+ return;
+ }
+
+ // is it really empty?
+ if (!dir->is_complete()) {
+ dout(7) << "not complete, fetching." << endl;
+ mds->mdstore->fetch_dir(dir,
+ new C_MDC_EmptyImport(this,dir)); // calls export_empty_import(dir) again when finished
+ return;
+ }
+
+ int dest = dir->inode->authority();
+
+ // comment this out to wreak havoc?
+ //if (mds->is_shutting_down()) dest = 0; // this is more efficient.
+
+ dout(7) << "really empty, exporting to " << dest << endl;
+ assert (dest != mds->get_nodeid());
+
+ dout(-7) << "exporting to mds" << dest
+ << " empty import " << *dir << endl;
+ export_dir( dir, dest );
+}
+
+
+// ==========================================================
+// IMPORT/EXPORT
+
+
+// Context given to CDir::freeze_tree() by export_dir(); when finished it
+// continues the export with Migrator::export_dir_frozen().
+class C_MDC_ExportFreeze : public Context {
+  Migrator *migrator;
+  CDir *exporting;   // dir i'm exporting
+  int target;        // mds it is going to
+
+public:
+  C_MDC_ExportFreeze(Migrator *m, CDir *e, int d)
+    : migrator(m), exporting(e), target(d) {}
+
+  virtual void finish(int r) {
+    // r is not looked at
+    migrator->export_dir_frozen(exporting, target);
+  }
+};
+
+
+
+/** export_dir(dir, dest)
+ * public method to initiate an export of dir to mds 'dest'.
+ *
+ * asserts that dest is not this mds, that dir is not hashed, and (via
+ * assert(0)) that dir is not the root.
+ * returns without doing anything if the directory is freezing or frozen,
+ * or if the path to it cannot be pinned.
+ *
+ * otherwise: records dest in export_gather[dir], sends MExportDirDiscover
+ * to dest, auth_pins dir, subtracts the exported popularity from the
+ * balancer, and calls freeze_tree() with a C_MDC_ExportFreeze context
+ * (which calls export_dir_frozen()).
+ */
+void Migrator::export_dir(CDir *dir,
+ int dest)
+{
+ dout(7) << "export_dir " << *dir << " to " << dest << endl;
+ assert(dest != mds->get_nodeid());
+ assert(!dir->is_hashed());
+
+ if (dir->inode->is_root()) {
+ dout(7) << "i won't export root" << endl;
+ assert(0);
+ return; // only reached when asserts are compiled out
+ }
+
+ if (dir->is_frozen() ||
+ dir->is_freezing()) {
+ dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << endl;
+ return;
+ }
+ if (dir->is_hashed()) { // same condition as the assert above; only matters when asserts are compiled out
+ dout(7) << "can't export hashed dir right now. implement me carefully later." << endl;
+ return;
+ }
+
+
+ // pin path?
+ vector<CDentry*> trace;
+ cache->make_trace(trace, dir->inode);
+ if (!cache->path_pin(trace, 0, 0)) {
+ dout(7) << "export_dir couldn't pin path, failing." << endl;
+ return;
+ }
+
+ // ok, let's go.
+
+ // send ExportDirDiscover (ask target)
+ export_gather[dir].insert(dest); // handle_export_dir_discover_ack() removes it again
+ mds->send_message_mds(new MExportDirDiscover(dir->inode), dest, MDS_PORT_MIGRATOR);
+ dir->auth_pin(); // pin dir, to hang up our freeze (unpinned in handle_export_dir_discover_ack)
+
+ // take away the popularity we're sending. FIXME: do this later?
+ mds->balancer->subtract_export(dir);
+
+
+ // freeze the subtree
+ dir->freeze_tree(new C_MDC_ExportFreeze(this, dir, dest));
+}
+
+
+/*
+ * handle_export_dir_discover_ack(m)
+ * called on receipt of MExportDirDiscoverAck.
+ * the importer now has the directory's _inode_ in memory, and pinned.
+ * once nobody is left in export_gather[dir], the auth_pin on dir is
+ * released so the freeze can complete.
+ */
+void Migrator::handle_export_dir_discover_ack(MExportDirDiscoverAck *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  // tick the sender off the gather set
+  int who = MSG_ADDR_NUM(m->get_source());
+  assert(export_gather[dir].count(who));
+  export_gather[dir].erase(who);
+
+  if (!export_gather[dir].empty()) {
+    dout(7) << "export_dir_discover_ack " << *dir << ", still waiting for " << export_gather[dir] << endl;
+  } else {
+    dout(7) << "export_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
+    dir->auth_unpin();   // unpin to allow freeze to complete
+  }
+
+  delete m;   // done
+}
+
+
+/*
+ * export_dir_frozen(dir, dest)
+ * continuation of export_dir() (reached through C_MDC_ExportFreeze).
+ * builds an MExportDirPrep for dest containing the base dir and, for each
+ * nested export below it, the inodes (and the dirs we are auth for) on the
+ * way from dir down to that export, then sends it.
+ *
+ * these need to be on the destination _before_ the final export so that
+ * dir_auth updates on any nested exports are properly absorbed.
+ */
+void Migrator::export_dir_frozen(CDir *dir,
+                                 int dest)
+{
+  // subtree is now frozen!
+  dout(7) << "export_dir_frozen on " << *dir << " to " << dest << endl;
+
+  show_imports();
+
+  MExportDirPrep *prep = new MExportDirPrep(dir->inode);
+
+  // inos already put in the message; a shared path prefix goes in only once
+  set<inodeno_t> seen;
+
+  // base dir first
+  prep->add_dir( new CDirDiscover(dir, dir->open_by_add(dest)) );
+
+  // then a trace to every nested export
+  set<CDir*> nested;
+  cache->find_nested_exports(dir, nested);
+  for (set<CDir*>::iterator n = nested.begin(); n != nested.end(); ++n) {
+    CDir *exp = *n;
+
+    dout(7) << " including nested export " << *exp << " in prep" << endl;
+
+    prep->add_export( exp->ino() );
+
+    // climb from the nested export up toward dir.  push_front leaves the
+    // collected inodes in trace (top-down) order.
+    list<CInode*> trace;
+    CDir *cur = exp;
+    while (cur != dir) {
+      if (!seen.insert(cur->ino()).second)
+        break;   // did already!
+
+      CDir *up = cur->get_parent_dir();
+
+      // inode
+      assert(cur->inode->is_auth());
+      trace.push_front(cur->inode);
+      dout(7) << " will add " << *cur->inode << endl;
+
+      // dir, if it is ours.  this covers everything except the nested
+      // exports themselves, since someone else is obviously auth for those.
+      if (cur->is_auth()) {
+        prep->add_dir( new CDirDiscover(cur, cur->open_by_add(dest)) );
+        dout(7) << " added " << *cur << endl;
+      }
+
+      cur = up;
+    }
+
+    // now put the inodes in the message, in trace order
+    for (list<CInode*>::iterator t = trace.begin(); t != trace.end(); ++t) {
+      CInode *in = *t;
+      dout(7) << " added " << *in << endl;
+      prep->add_inode( in->parent->dir->ino(),
+                       in->parent->name,
+                       in->replicate_to(dest) );
+    }
+  }
+
+  // send it!
+  mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR);
+}
+
+/*
+ * handle_export_dir_prep_ack(m)
+ * the destination has acked our MExportDirPrep; start the actual export of
+ * the dir named in the message to the mds that sent the ack.
+ */
+void Migrator::handle_export_dir_prep_ack(MExportDirPrepAck *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(7) << "export_dir_prep_ack " << *dir << ", starting export" << endl;
+
+  // the sender of the ack is the export target
+  int dest = MSG_ADDR_NUM(m->get_source());
+  export_dir_go(dir, dest);
+
+  delete m;   // done
+}
+
+/*
+ * export_dir_go(dir, dest)
+ * called from handle_export_dir_prep_ack().
+ * - updates imports/exports/nested_exports and the IMPORT/EXPORT state and
+ * pins on dir, adding the nested exports being handed off to the message
+ * - sets dir's dir_auth to dest (or CDIR_AUTH_PARENT if the inode's
+ * authority is already dest)
+ * - records every mds in dir->open_by (except this one) in
+ * export_notify_ack_waiting[dir], and sends MExportDirWarning to each of
+ * them other than dest
+ * - encodes the subtree with export_dir_walk() and sends MExportDir to dest
+ * - queues the collected waiters on dir under CDIR_WAIT_UNFREEZE
+ */
+void Migrator::export_dir_go(CDir *dir,
+ int dest)
+{
+ dout(7) << "export_dir_go " << *dir << " to " << dest << endl;
+
+ show_imports();
+
+
+ // build export message
+ MExportDir *req = new MExportDir(dir->inode); // include pop
+
+
+ // update imports/exports
+ CDir *containing_import = cache->get_auth_container(dir);
+
+ if (containing_import == dir) {
+ dout(7) << " i'm rexporting a previous import" << endl;
+ assert(dir->is_import());
+ cache->imports.erase(dir);
+ dir->state_clear(CDIR_STATE_IMPORT);
+ dir->put(CDIR_PIN_IMPORT); // unpin, no longer an import
+
+ // discard nested exports (that we're handing off)
+ for (set<CDir*>::iterator p = cache->nested_exports[dir].begin();
+ p != cache->nested_exports[dir].end(); ) {
+ CDir *nested = *p;
+ p++; // step past 'nested' before it is erased from the set below
+
+ // add to export message
+ req->add_export(nested);
+
+ // nested beneath our new export *in; remove!
+ dout(7) << " export " << *nested << " was nested beneath us; removing from export list(s)" << endl;
+ assert(cache->exports.count(nested) == 1);
+ cache->nested_exports[dir].erase(nested);
+ }
+
+ } else {
+ dout(7) << " i'm a subdir nested under import " << *containing_import << endl;
+ cache->exports.insert(dir);
+ cache->nested_exports[containing_import].insert(dir);
+
+ dir->state_set(CDIR_STATE_EXPORT);
+ dir->get(CDIR_PIN_EXPORT); // i must keep it pinned
+
+ // discard nested exports (that we're handing off)
+ for (set<CDir*>::iterator p = cache->nested_exports[containing_import].begin();
+ p != cache->nested_exports[containing_import].end(); ) {
+ CDir *nested = *p;
+ p++; // step past 'nested' before a possible erase below
+ if (nested == dir) continue; // ignore myself
+
+ // container of parent; otherwise we get ourselves.
+ CDir *containing_export = nested->get_parent_dir();
+ while (containing_export && !containing_export->is_export())
+ containing_export = containing_export->get_parent_dir();
+ if (!containing_export) continue; // no export dir above it at all
+
+ if (containing_export == dir) {
+ // nested beneath our new export *in; remove!
+ dout(7) << " export " << *nested << " was nested beneath us; removing from nested_exports" << endl;
+ cache->nested_exports[containing_import].erase(nested);
+ // exports.erase(nested); _walk does this
+
+ // add to msg
+ req->add_export(nested);
+ } else {
+ dout(12) << " export " << *nested << " is under other export " << *containing_export << ", which is unrelated" << endl;
+ assert(cache->get_auth_container(containing_export) != containing_import);
+ }
+ }
+ }
+
+ // note new authority (locally)
+ if (dir->inode->authority() == dest)
+ dir->set_dir_auth( CDIR_AUTH_PARENT );
+ else
+ dir->set_dir_auth( dest );
+
+ // make list of nodes i expect an export_dir_notify_ack from
+ // (everyone w/ this dir open, but me!)
+ assert(export_notify_ack_waiting[dir].empty());
+ for (set<int>::iterator it = dir->open_by.begin();
+ it != dir->open_by.end();
+ it++) {
+ if (*it == mds->get_nodeid()) continue;
+ export_notify_ack_waiting[dir].insert( *it );
+
+ // send warning to all but dest
+ if (*it != dest) {
+ dout(10) << " sending export_dir_warning to mds" << *it << endl;
+ mds->send_message_mds(new MExportDirWarning( dir->ino() ), *it, MDS_PORT_MIGRATOR);
+ }
+ }
+ assert(export_notify_ack_waiting[dir].count( dest )); // dest must have been in open_by
+
+ // fill export message with cache data
+ C_Contexts *fin = new C_Contexts;
+ int num_exported_inodes = export_dir_walk( req,
+ fin,
+ dir, // base
+ dir, // recur start point
+ dest );
+
+ // send the export data!
+ mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR);
+
+ // queue up the finisher
+ dir->add_waiter( CDIR_WAIT_UNFREEZE, fin );
+
+
+ // stats
+ mds->logger->inc("ex");
+ mds->logger->inc("iex", num_exported_inodes);
+
+ show_imports();
+}
+
+
+/** encode_export_inode
+ * update our local state for this inode to export.
+ * encode relevant state to be sent over the wire.
+ * used by: export_dir_walk, file_rename (if foreign)
+ *
+ * in - inode being exported; must be auth, and this mds must not be in
+ * its cached_by set (both asserted)
+ * enc_state - buffer the CInodeExport encoding is added to
+ * new_auth - not referenced anywhere in this body
+ *
+ * in order: bumps in->version, sends every client holding caps a
+ * FILECAP_STALE MClientFileCaps, encodes the inode, marks it clean, clears
+ * cached_by, moves hardlock/filelock out of their gather states, clears the
+ * auth flag, sets replica_nonce to CINODE_EXPORT_NONCE, and moves the inode
+ * to the bottom of the LRU.
+ */
+void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth)
+{
+ in->version++; // so local log entries are ignored, etc. (FIXME ??)
+
+ // tell (all) clients about migrating caps.. mark STALE
+ for (map<int, Capability>::iterator it = in->client_caps.begin();
+ it != in->client_caps.end();
+ it++) {
+ dout(7) << "encode_export_inode " << *in << " telling client" << it->first << " stale caps" << endl;
+ MClientFileCaps *m = new MClientFileCaps(in->inode,
+ it->second.get_last_seq(),
+ it->second.pending(),
+ it->second.wanted(),
+ MClientFileCaps::FILECAP_STALE);
+ mds->messenger->send_message(m, MSG_ADDR_CLIENT(it->first), mds->clientmap.get_inst(it->first),
+ 0, MDS_PORT_CACHE);
+ }
+
+ // relax locks?
+ if (!in->is_cached_by_anyone())
+ in->replicate_relax_locks();
+
+ // add inode
+ assert(in->cached_by.count(mds->get_nodeid()) == 0);
+ CInodeExport istate( in );
+ istate._encode( enc_state ); // encoded before the local state changes below
+
+ // we're exporting this inode; fix inode state
+ dout(7) << "encode_export_inode " << *in << endl;
+
+ if (in->is_dirty()) in->mark_clean();
+
+ // clear/unpin cached_by (we're no longer the authority)
+ in->cached_by_clear();
+
+ // twiddle lock states for auth -> replica transition
+ // hard
+ in->hardlock.clear_gather();
+ if (in->hardlock.get_state() == LOCK_GLOCKR)
+ in->hardlock.set_state(LOCK_LOCK);
+
+ // file : we lost all our caps, so move to stable state!
+ in->filelock.clear_gather();
+ if (in->filelock.get_state() == LOCK_GLOCKR ||
+ in->filelock.get_state() == LOCK_GLOCKM ||
+ in->filelock.get_state() == LOCK_GLOCKL ||
+ in->filelock.get_state() == LOCK_GLONERR ||
+ in->filelock.get_state() == LOCK_GLONERM ||
+ in->filelock.get_state() == LOCK_LONER)
+ in->filelock.set_state(LOCK_LOCK);
+ if (in->filelock.get_state() == LOCK_GMIXEDR)
+ in->filelock.set_state(LOCK_MIXED);
+ // this looks like a step backwards, but it's what we want!
+ if (in->filelock.get_state() == LOCK_GSYNCM)
+ in->filelock.set_state(LOCK_MIXED);
+ if (in->filelock.get_state() == LOCK_GSYNCL)
+ in->filelock.set_state(LOCK_LOCK);
+ if (in->filelock.get_state() == LOCK_GMIXEDL)
+ in->filelock.set_state(LOCK_LOCK);
+ //in->filelock.set_state(LOCK_MIXED);
+
+ // mark auth
+ assert(in->is_auth());
+ in->set_auth(false);
+ in->replica_nonce = CINODE_EXPORT_NONCE;
+
+ // *** other state too?
+
+ // move to end of LRU so we drop it out of cache quickly!
+ cache->lru.lru_bottouch(in);
+}
+
+/*
+ * export_dir_walk(req, fin, basedir, dir, newauth)
+ * encode dir and, if it is not hashed, each of its dentries into one
+ * buffer that is added to req, flipping local state from auth to proxy
+ * along the way; then recurse into the auth subdirs that were found.
+ *
+ * req - export message the encoded dir is added to
+ * fin - collects the waiters taken from the dir and its inodes
+ * basedir - root of the export; proxy inos/dirinos are recorded under it
+ * in export_proxy_inos / export_proxy_dirinos
+ * dir - dir to encode on this call (asserted to be auth)
+ * newauth - mds the subtree is going to
+ *
+ * returns the number of dentries visited in this dir (null and remote
+ * dentries are counted too) plus the totals from the recursive calls.
+ */
+int Migrator::export_dir_walk(MExportDir *req,
+ C_Contexts *fin,
+ CDir *basedir,
+ CDir *dir,
+ int newauth)
+{
+ int num_exported = 0;
+
+ dout(7) << "export_dir_walk " << *dir << " " << dir->nitems << " items" << endl;
+
+ // dir
+ bufferlist enc_dir;
+
+ CDirExport dstate(dir);
+ dstate._encode( enc_dir ); // encoded before the local state changes below
+
+ // release open_by
+ dir->open_by_clear();
+
+ // mark
+ assert(dir->is_auth());
+ dir->state_clear(CDIR_STATE_AUTH);
+ dir->replica_nonce = CDIR_NONCE_EXPORT;
+
+ // proxy
+ dir->state_set(CDIR_STATE_PROXY);
+ dir->get(CDIR_PIN_PROXY);
+ export_proxy_dirinos[basedir].push_back(dir->ino()); // handle_export_dir_notify_ack() drops these pins
+
+ list<CDir*> subdirs; // auth subdirs to recurse into once this dir is in req
+
+ if (dir->is_hashed()) {
+ // fix state
+ dir->state_clear( CDIR_STATE_AUTH ); // already cleared above; dentries are not encoded for a hashed dir
+
+ } else {
+
+ if (dir->is_dirty())
+ dir->mark_clean();
+
+ // discard most dir state
+ dir->state &= CDIR_MASK_STATE_EXPORT_KEPT; // i only retain a few things.
+
+ // suck up all waiters
+ list<Context*> waiting;
+ dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters
+ fin->take(waiting);
+
+ // inodes
+ // each dentry is written as: name, then "D" (dirty) or "C" (clean), then one of
+ // "N" (null), "L" + remote ino, or "I" + the inode encoded by encode_export_inode()
+ CDir_map_t::iterator it;
+ for (it = dir->begin(); it != dir->end(); it++) {
+ CDentry *dn = it->second;
+ CInode *in = dn->inode;
+
+ num_exported++;
+
+ // -- dentry
+ dout(7) << "export_dir_walk exporting " << *dn << endl;
+ _encode(it->first, enc_dir);
+
+ if (dn->is_dirty())
+ enc_dir.append("D", 1); // dirty
+ else
+ enc_dir.append("C", 1); // clean
+
+ // null dentry?
+ if (dn->is_null()) {
+ enc_dir.append("N", 1); // null dentry
+ assert(dn->is_sync());
+ continue;
+ }
+
+ if (dn->is_remote()) {
+ // remote link
+ enc_dir.append("L", 1); // remote link
+
+ inodeno_t ino = dn->get_remote_ino();
+ enc_dir.append((char*)&ino, sizeof(ino)); // raw bytes of the ino
+ continue;
+ }
+
+ // primary link
+ // -- inode
+ enc_dir.append("I", 1); // inode dentry
+
+ encode_export_inode(in, enc_dir, newauth); // encode, and (update state for) export
+
+ // directory?
+ if (in->is_dir() && in->dir) {
+ if (in->dir->is_auth()) {
+ // nested subdir
+ assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT);
+ subdirs.push_back(in->dir); // it's ours, recurse (later)
+
+ } else {
+ // nested export
+ assert(in->dir->get_dir_auth() >= 0);
+ dout(7) << " encountered nested export " << *in->dir << " dir_auth " << in->dir->get_dir_auth() << "; removing from exports" << endl;
+ assert(cache->exports.count(in->dir) == 1);
+ cache->exports.erase(in->dir); // discard nested export (nested_exports updated above)
+
+ in->dir->state_clear(CDIR_STATE_EXPORT);
+ in->dir->put(CDIR_PIN_EXPORT);
+
+ // simplify dir_auth?
+ if (in->dir->get_dir_auth() == newauth)
+ in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+ }
+ }
+
+ // add to proxy
+ export_proxy_inos[basedir].push_back(in->ino()); // handle_export_dir_notify_ack() drops these pins
+ in->state_set(CINODE_STATE_PROXY);
+ in->get(CINODE_PIN_PROXY);
+
+ // waiters
+ list<Context*> waiters;
+ in->take_waiting(CINODE_WAIT_ANY, waiters);
+ fin->take(waiters);
+ }
+ }
+
+ req->add_dir( enc_dir ); // this dir goes in before any of its subdirs
+
+ // subdirs
+ for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); it++)
+ num_exported += export_dir_walk(req, fin, basedir, *it, newauth);
+
+ return num_exported;
+}
+
+
+/*
+ * handle_export_dir_notify_ack(m)
+ * i should get an export_dir_notify_ack from every mds that had me open,
+ * including the new auth (an ack).
+ *
+ * removes the sender from export_notify_ack_waiting[dir].  when that set
+ * becomes empty the export is finished (export_dir_finish), the proxy pins
+ * recorded by export_dir_walk() are dropped, and null dentries are removed
+ * from the exported dirs.  m is deleted in all cases.
+ */
+void Migrator::handle_export_dir_notify_ack(MExportDirNotifyAck *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);                          // was dereferenced unchecked before
+  CDir *dir = diri->dir;
+  assert(dir);
+  assert(dir->is_frozen_tree_root());    // i'm exporting!
+
+  // remove from waiting list
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(export_notify_ack_waiting[dir].count(from));
+  export_notify_ack_waiting[dir].erase(from);
+
+  // done?
+  if (!export_notify_ack_waiting[dir].empty()) {
+    dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from
+            << ", still waiting for " << export_notify_ack_waiting[dir] << endl;
+
+  } else {
+    dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from
+            << ", last one!" << endl;
+
+    // ok, we're finished!
+    export_notify_ack_waiting.erase(dir);
+
+    // finish export (unfreeze, trigger finish context, etc.)
+    export_dir_finish(dir);
+
+    // unpin proxies
+    // inodes
+    for (list<inodeno_t>::iterator p = export_proxy_inos[dir].begin();
+         p != export_proxy_inos[dir].end();
+         p++) {
+      CInode *in = cache->get_inode(*p);
+      assert(in);                        // pinned since export_dir_walk, must still be cached
+      in->put(CINODE_PIN_PROXY);
+      assert(in->state_test(CINODE_STATE_PROXY));
+      in->state_clear(CINODE_STATE_PROXY);
+    }
+    export_proxy_inos.erase(dir);
+
+    // dirs
+    // (locals get their own names here: the old code redeclared 'dir' and
+    //  the loop's own 'it' inside the loop body.)
+    for (list<inodeno_t>::iterator p = export_proxy_dirinos[dir].begin();
+         p != export_proxy_dirinos[dir].end();
+         p++) {
+      CInode *proxyi = cache->get_inode(*p);
+      assert(proxyi && proxyi->dir);     // pinned since export_dir_walk
+      CDir *proxydir = proxyi->dir;
+      proxydir->put(CDIR_PIN_PROXY);
+      assert(proxydir->state_test(CDIR_STATE_PROXY));
+      proxydir->state_clear(CDIR_STATE_PROXY);
+
+      // hose neg dentries, too, since we're no longer auth
+      CDir_map_t::iterator d;
+      for (d = proxydir->begin(); d != proxydir->end(); ) {
+        CDentry *dn = d->second;
+        d++;                             // advance before remove_dentry() below
+        if (dn->is_null()) {
+          assert(dn->is_sync());
+          proxydir->remove_dentry(dn);
+        } else {
+          //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl;
+          if (dn->is_dirty())
+            dn->mark_clean();
+        }
+      }
+    }
+    export_proxy_dirinos.erase(dir);
+
+  }
+
+  delete m;
+}
+
+
+/*
+ * export_dir_finish(dir)
+ * once i get all the notify_acks i can finish: send MExportDirFinish to
+ * the new auth, unfreeze the subtree, unpin the path to it and update the
+ * export count.
+ */
+void Migrator::export_dir_finish(CDir *dir)
+{
+  // exported!
+  // FIXME log it
+
+  // send finish to new auth
+  mds->send_message_mds(new MExportDirFinish(dir->ino()),
+                        dir->authority(),
+                        MDS_PORT_MIGRATOR);
+
+  // unfreeze
+  dout(7) << "export_dir_finish " << *dir << ", unfreezing" << endl;
+  dir->unfreeze_tree();
+
+  // unpin path
+  dout(7) << "export_dir_finish unpinning path" << endl;
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  cache->path_unpin(path, 0);
+
+  // stats
+  mds->logger->set("nex", cache->exports.size());
+
+  show_imports();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+// IMPORTS
+
+// Completion context for the path_traverse() issued by
+// handle_export_dir_discover(): hands the inode of the last dentry in
+// 'trace' (or null when r < 0) to handle_export_dir_discover_2().
+class C_MDC_ExportDirDiscover : public Context {
+  Migrator *migrator;
+  MExportDirDiscover *msg;
+public:
+  vector<CDentry*> trace;   // passed to path_traverse() by the caller
+  C_MDC_ExportDirDiscover(Migrator *mig_, MExportDirDiscover *m_)
+    : migrator(mig_), msg(m_) {}
+  void finish(int r) {
+    CInode *in = (r >= 0) ? trace.back()->get_inode() : 0;
+    migrator->handle_export_dir_discover_2(msg, in, r);
+  }
+};
+
+/*
+ * handle_export_dir_discover(m)
+ * another mds wants to export a dir to us.  traverse (discovering as
+ * needed) to the path named in m; C_MDC_ExportDirDiscover then continues
+ * in handle_export_dir_discover_2().
+ */
+void Migrator::handle_export_dir_discover(MExportDirDiscover *m)
+{
+  assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid());
+
+  dout(7) << "handle_export_dir_discover on " << m->get_path() << endl;
+
+  // must discover it!
+  filepath target(m->get_path());
+  C_MDC_ExportDirDiscover *done = new C_MDC_ExportDirDiscover(this, m);
+  cache->path_traverse(target, done->trace, true,
+                       m, new C_MDS_RetryMessage(mds,m),   // on delay/retry
+                       MDS_TRAVERSE_DISCOVER,
+                       done);                               // on completion|error
+}
+
+/*
+ * handle_export_dir_discover_2(m, in, r)
+ * second half of handle_export_dir_discover(), run when the traverse ends.
+ *
+ * m  - the original discover request
+ * in - inode the traverse ended on, or null
+ * r  - traverse result; < 0 means failure
+ *
+ * on failure (r < 0, no inode, or not a directory) this asserts; with
+ * asserts compiled out it replies with a negative MExportDirDiscoverAck.
+ * if the inode is frozen, m is retried once CINODE_WAIT_AUTHPINNABLE fires.
+ * otherwise the inode gets CINODE_PIN_IMPORTING and an auth_pin, and a
+ * positive MExportDirDiscoverAck goes back to the sender.
+ */
+void Migrator::handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r)
+{
+  // yay!
+  if (in) {
+    dout(7) << "handle_export_dir_discover_2 has " << *in << endl;
+  }
+
+  // the !in test keeps us from dereferencing a null inode when r >= 0
+  if (r < 0 || !in || !in->is_dir()) {
+    dout(7) << "handle_export_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
+
+    assert(0);   // this shouldn't happen if the auth pins his path properly!!!!
+
+    mds->send_message_mds(new MExportDirDiscoverAck(m->get_ino(), false),
+                          m->get_source().num(), MDS_PORT_MIGRATOR);
+    delete m;
+    return;
+  }
+
+  assert(in->is_dir());
+
+  if (in->is_frozen()) {
+    dout(7) << "frozen, waiting." << endl;
+    in->add_waiter(CINODE_WAIT_AUTHPINNABLE,
+                   new C_MDS_RetryMessage(mds,m));
+    return;   // m stays alive for the retry
+  }
+
+  // pin inode in the cache (for now)
+  in->get(CINODE_PIN_IMPORTING);
+
+  // pin auth too, until the import completes.
+  in->auth_pin();
+
+  // reply
+  dout(7) << " sending export_dir_discover_ack on " << *in << endl;
+  mds->send_message_mds(new MExportDirDiscoverAck(in->ino()),
+                        m->get_source().num(), MDS_PORT_MIGRATOR);
+  delete m;
+}
+
+
+/*
+ * handle_export_dir_prep(m)
+ * importer side.  opens (or updates) the dir being imported and, the first
+ * time through (m->did_assim() false), moves the importing pin and auth_pin
+ * from the inode to the dir, assimilates the inode/dir traces carried in m,
+ * and starts opening any nested export dir that is not open yet.
+ *
+ * every call then checks the nested exports: open ones are pinned with
+ * CDIR_PIN_IMPORTINGEXPORT.  if all are open, MExportDirPrepAck is sent to
+ * the source and m is deleted; otherwise m is kept, since the
+ * C_MDS_RetryMessage contexts handed to open_remote_dir() still refer to it.
+ *
+ * NOTE(review): one retry context is created per missing nested dir, all on
+ * the same m -- confirm that a retry cannot run after m has been deleted.
+ */
+void Migrator::handle_export_dir_prep(MExportDirPrep *m)
+{
+ assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid());
+
+ CInode *diri = cache->get_inode(m->get_ino());
+ assert(diri);
+
+ list<Context*> finished; // waiters collected here are run at the very end
+
+ // assimilate root dir.
+ CDir *dir = diri->dir;
+ if (dir) {
+ dout(7) << "handle_export_dir_prep on " << *dir << " (had dir)" << endl;
+
+ if (!m->did_assim())
+ m->get_dir(diri->ino())->update_dir(dir);
+ } else {
+ assert(!m->did_assim());
+
+ // open dir i'm importing.
+ diri->set_dir( new CDir(diri, mds, false) );
+ dir = diri->dir;
+ m->get_dir(diri->ino())->update_dir(dir);
+
+ dout(7) << "handle_export_dir_prep on " << *dir << " (opening dir)" << endl;
+
+ diri->take_waiting(CINODE_WAIT_DIR, finished);
+ }
+ assert(dir->is_auth() == false);
+
+ show_imports();
+
+ // assimilate contents?
+ if (!m->did_assim()) {
+ dout(7) << "doing assim on " << *dir << endl;
+ m->mark_assim(); // only do this the first time!
+
+ // move pin to dir
+ diri->put(CINODE_PIN_IMPORTING); // taken in handle_export_dir_discover_2()
+ dir->get(CDIR_PIN_IMPORTING);
+
+ // auth pin too
+ dir->auth_pin();
+ diri->auth_unpin(); // taken in handle_export_dir_discover_2()
+
+ // assimilate traces to exports
+ for (list<CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ // inode
+ CInode *in = cache->get_inode( (*it)->get_ino() );
+ if (in) {
+ (*it)->update_inode(in);
+ dout(7) << " updated " << *in << endl;
+ } else {
+ in = new CInode(false);
+ (*it)->update_inode(in);
+
+ // link to the containing dir
+ CInode *condiri = cache->get_inode( m->get_containing_dirino(in->ino()) );
+ assert(condiri && condiri->dir);
+ cache->add_inode( in );
+ condiri->dir->add_dentry( m->get_dentry(in->ino()), in );
+
+ dout(7) << " added " << *in << endl;
+ }
+
+ assert( in->get_parent_dir()->ino() == m->get_containing_dirino(in->ino()) );
+
+ // dir
+ if (m->have_dir(in->ino())) {
+ if (in->dir) {
+ m->get_dir(in->ino())->update_dir(in->dir);
+ dout(7) << " updated " << *in->dir << endl;
+ } else {
+ in->set_dir( new CDir(in, mds, false) );
+ m->get_dir(in->ino())->update_dir(in->dir);
+ dout(7) << " added " << *in->dir << endl;
+ in->take_waiting(CINODE_WAIT_DIR, finished);
+ }
+ }
+ }
+
+ // open export dirs?
+ for (list<inodeno_t>::iterator it = m->get_exports().begin();
+ it != m->get_exports().end();
+ it++) {
+ dout(7) << " checking dir " << hex << *it << dec << endl;
+ CInode *in = cache->get_inode(*it);
+ assert(in);
+
+ if (!in->dir) {
+ dout(7) << " opening nested export on " << *in << endl;
+ cache->open_remote_dir(in,
+ new C_MDS_RetryMessage(mds, m)); // retries this same message
+
+ // pin it!
+ in->get(CINODE_PIN_OPENINGDIR);
+ in->state_set(CINODE_STATE_OPENINGDIR);
+ }
+ }
+ } else {
+ dout(7) << " not doing assim on " << *dir << endl;
+ }
+
+
+ // verify we have all exports
+ int waiting_for = 0; // nested export dirs that are still not open
+ for (list<inodeno_t>::iterator it = m->get_exports().begin();
+ it != m->get_exports().end();
+ it++) {
+ inodeno_t ino = *it;
+ CInode *in = cache->get_inode(ino);
+ if (!in) dout(0) << "** missing ino " << hex << ino << dec << endl;
+ assert(in);
+ if (in->dir) {
+ if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
+ dout(7) << " pinning nested export " << *in->dir << endl;
+ in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
+ in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
+
+ if (in->state_test(CINODE_STATE_OPENINGDIR)) { // drop the pin taken when the open was started
+ in->put(CINODE_PIN_OPENINGDIR);
+ in->state_clear(CINODE_STATE_OPENINGDIR);
+ }
+ } else {
+ dout(7) << " already pinned nested export " << *in << endl;
+ }
+ } else {
+ dout(7) << " waiting for nested export dir on " << *in << endl;
+ waiting_for++;
+ }
+ }
+ if (waiting_for) {
+ dout(7) << " waiting for " << waiting_for << " nested export dir opens" << endl;
+ } else {
+ // ok!
+ dout(7) << " all ready, sending export_dir_prep_ack on " << *dir << endl;
+ mds->send_message_mds(new MExportDirPrepAck(dir->ino()),
+ m->get_source().num(), MDS_PORT_MIGRATOR);
+
+ // done
+ delete m;
+ }
+
+ // finish waiters
+ finish_contexts(finished, 0);
+}
+
+
+
+
+/* this guy waits for the pre-import discovers on hashed directory dir inodes to finish.
+ * if it's the last one on the dir, it reprocesses the import.
+ */
+/*
+class C_MDS_ImportPrediscover : public Context {
+public:
+ MDS *mds;
+ MExportDir *m;
+ inodeno_t dir_ino;
+ string dentry;
+ C_MDS_ImportPrediscover(MDS *mds, MExportDir *m, inodeno_t dir_ino, const string& dentry) {
+ this->mds = mds;
+ this->m = m;
+ this->dir_ino = dir_ino;
+ this->dentry = dentry;
+ }
+ virtual void finish(int r) {
+ assert(r == 0); // should never fail!
+
+ m->remove_prediscover(dir_ino, dentry);
+
+ if (!m->any_prediscovers())
+ mds->mdcache->handle_export_dir(m);
+ }
+};
+*/
+
+
+/*
+ * handle_export_dir(m)
+ * importer side: take over the dir named in m from the mds that sent it.
+ * - sets dir_auth on dir (CDIR_AUTH_PARENT if we are auth for its inode,
+ * otherwise this mds)
+ * - updates imports / exports / nested_exports, including the nested
+ * exports listed in m
+ * - decodes m->get_ndirs() dir blocks from the message state with
+ * import_dir_block()
+ * - sends MExportDirNotifyAck to the old auth and MExportDirNotify to every
+ * other mds in dir->open_by
+ * - deletes m, then calls export_empty_import() if dir has no entries and
+ * we are not auth for its inode
+ */
+void Migrator::handle_export_dir(MExportDir *m)
+{
+ CInode *diri = cache->get_inode(m->get_ino());
+ assert(diri);
+ CDir *dir = diri->dir;
+ assert(dir);
+
+ int oldauth = MSG_ADDR_NUM(m->get_source());
+ dout(7) << "handle_export_dir, import " << *dir << " from " << oldauth << endl;
+ assert(dir->is_auth() == false);
+
+
+
+ show_imports();
+
+ // note new authority (locally)
+ if (dir->inode->is_auth())
+ dir->set_dir_auth( CDIR_AUTH_PARENT );
+ else
+ dir->set_dir_auth( mds->get_nodeid() );
+ dout(10) << " set dir_auth to " << dir->get_dir_auth() << endl;
+
+ // update imports/exports
+ CDir *containing_import; // import that the inherited nested exports get filed under
+ if (cache->exports.count(dir)) {
+ // reimporting
+ dout(7) << " i'm reimporting " << *dir << endl;
+ cache->exports.erase(dir);
+
+ dir->state_clear(CDIR_STATE_EXPORT);
+ dir->put(CDIR_PIN_EXPORT); // unpin, no longer an export
+
+ containing_import = cache->get_auth_container(dir);
+ dout(7) << " it is nested under import " << *containing_import << endl;
+ cache->nested_exports[containing_import].erase(dir);
+ } else {
+ // new import
+ cache->imports.insert(dir);
+ dir->state_set(CDIR_STATE_IMPORT);
+ dir->get(CDIR_PIN_IMPORT); // must keep it pinned
+
+ containing_import = dir; // imported exports nested under *in
+
+ dout(7) << " new import at " << *dir << endl;
+ }
+
+
+ // take out my temp pin
+ dir->put(CDIR_PIN_IMPORTING); // taken in handle_export_dir_prep()
+
+ // add any inherited exports
+ for (list<inodeno_t>::iterator it = m->get_exports().begin();
+ it != m->get_exports().end();
+ it++) {
+ CInode *exi = cache->get_inode(*it);
+ assert(exi && exi->dir);
+ CDir *ex = exi->dir;
+
+ dout(15) << " nested export " << *ex << endl;
+
+ // remove our pin
+ ex->put(CDIR_PIN_IMPORTINGEXPORT); // taken in handle_export_dir_prep()
+ ex->state_clear(CDIR_STATE_IMPORTINGEXPORT);
+
+
+ // add...
+ if (ex->is_import()) {
+ dout(7) << " importing my import " << *ex << endl;
+ cache->imports.erase(ex);
+ ex->state_clear(CDIR_STATE_IMPORT);
+
+ mds->logger->inc("imex");
+
+ // move nested exports under containing_import
+ for (set<CDir*>::iterator it = cache->nested_exports[ex].begin();
+ it != cache->nested_exports[ex].end();
+ it++) {
+ dout(7) << " moving nested export " << **it << " under " << *containing_import << endl;
+ cache->nested_exports[containing_import].insert(*it);
+ }
+ cache->nested_exports.erase(ex); // de-list under old import
+
+ ex->set_dir_auth( CDIR_AUTH_PARENT );
+ ex->put(CDIR_PIN_IMPORT); // imports are pinned, no longer import
+
+ } else {
+ dout(7) << " importing export " << *ex << endl;
+
+ // add it
+ ex->state_set(CDIR_STATE_EXPORT);
+ ex->get(CDIR_PIN_EXPORT); // all exports are pinned
+ cache->exports.insert(ex);
+ cache->nested_exports[containing_import].insert(ex);
+ mds->logger->inc("imex");
+ }
+
+ }
+
+
+ // add this crap to my cache
+ list<inodeno_t> imported_subdirs;
+ bufferlist dir_state;
+ dir_state.claim( m->get_state() );
+ int off = 0; // offset into dir_state, passed to every import_dir_block() call
+ int num_imported_inodes = 0;
+
+ for (int i = 0; i < m->get_ndirs(); i++) {
+ num_imported_inodes +=
+ import_dir_block(dir_state,
+ off,
+ oldauth,
+ dir, // import root
+ imported_subdirs);
+ }
+ dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl;
+ dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl;
+
+
+ // adjust popularity
+ mds->balancer->add_import(dir);
+
+ // send notify's etc.
+ dout(7) << "sending notifyack for " << *dir << " to old auth " << MSG_ADDR_NUM(m->get_source()) << endl;
+ mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()),
+ m->get_source().num(), MDS_PORT_MIGRATOR);
+
+ dout(7) << "sending notify to others" << endl;
+ for (set<int>::iterator it = dir->open_by.begin();
+ it != dir->open_by.end();
+ it++) {
+ assert( *it != mds->get_nodeid() );
+ if ( *it == MSG_ADDR_NUM(m->get_source()) ) continue; // not to old auth.
+
+ MExportDirNotify *notify = new MExportDirNotify(dir->ino(), MSG_ADDR_NUM(m->get_source()), mds->get_nodeid());
+ notify->copy_exports(m->get_exports());
+
+ if (g_conf.mds_verify_export_dirauth)
+ notify->copy_subdirs(imported_subdirs); // copy subdir list (DEBUG)
+
+ mds->send_message_mds(notify, *it, MDS_PORT_MIGRATOR);
+ }
+
+ // done
+ delete m; // m must not be touched below this line
+
+ show_imports();
+
+
+ // is it empty?
+ if (dir->get_size() == 0 &&
+ !dir->inode->is_auth()) {
+ // reexport!
+ export_empty_import(dir);
+ }
+
+
+ // some stats
+ mds->logger->inc("im");
+ mds->logger->inc("iim", num_imported_inodes);
+ mds->logger->set("nim", cache->imports.size());
+
+
+ // FIXME LOG IT
+
+ /*
+ stupid hashing crap, FIXME
+
+ // wait for replicas in hashed dirs?
+ if (import_hashed_replicate_waiting.count(m->get_ino())) {
+ // it'll happen later!, when i get my inodegetreplicaack's back
+ } else {
+ // finish now
+ //not anymoreimport_dir_finish(dir);
+ }
+ */
+
+}
+
+
+
+/*
+ * handle_export_dir_finish
+ *
+ * Handle an MExportDirFinish for the dir whose inode is m->get_ino().
+ * Refreshes the "nex"/"nim" logger counters, drops the auth pin held on the
+ * dir, and completes every context waiting on CDIR_WAIT_IMPORTED.
+ * The message is deleted before returning.
+ */
+void Migrator::handle_export_dir_finish(MExportDirFinish *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);            // must be in cache; checked before we touch diri->dir
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(7) << "handle_export_dir_finish on " << *dir << endl;
+  assert(dir->is_auth());
+
+  dout(5) << "done with import of " << *dir << endl;
+  show_imports();
+  mds->logger->set("nex", cache->exports.size());
+  mds->logger->set("nim", cache->imports.size());
+
+  // un auth pin (other exports can now proceed)
+  dir->auth_unpin();
+
+  // ok now finish contexts
+  dout(5) << "finishing any waiters on imported data" << endl;
+  dir->finish_waiting(CDIR_WAIT_IMPORTED);
+
+  delete m;
+}
+
+
+// Decode one CInodeExport from bl (starting at off) and install it under dn.
+//
+//  dn      - dentry the inode belongs to; the inode is linked here if dn is
+//            not already pointing at it
+//  bl      - buffer holding the encoded inode state
+//  off     - in/out read position in bl; set to the offset returned by the decoder
+//  oldauth - node the inode is being imported from
+//
+// Side effects visible here: may allocate and add a CInode to the cache,
+// updates cached_by and lock gather sets, sends FILECAP_REAP messages to
+// clients with merged caps, and journals an EInodeUpdate if the inode is dirty.
+void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth)
+{
+ CInodeExport istate;
+ off = istate._decode(bl, off);
+ dout(15) << "got a cinodeexport " << endl;
+
+ // reuse an already-cached inode if we have one; otherwise make a new one
+ bool added = false;
+ CInode *in = cache->get_inode(istate.get_ino());
+ if (!in) {
+ in = new CInode;
+ added = true;
+ } else {
+ in->set_auth(true);
+ }
+
+ // link before state
+ if (dn->inode != in) {
+ assert(!dn->inode);
+ dn->dir->link_inode(dn, in);
+ }
+
+ // state after link
+ // (update_inode fills merged_client_caps; it is used below to pick which clients to message)
+ set<int> merged_client_caps;
+ istate.update_inode(in, merged_client_caps);
+
+
+ // add inode?
+ if (added) {
+ cache->add_inode(in);
+ dout(10) << "added " << *in << endl;
+ } else {
+ dout(10) << " had " << *in << endl;
+ }
+
+
+ // cached_by: the old auth is recorded as caching the inode, and we drop ourselves
+ assert(!in->is_cached_by(oldauth));
+ in->cached_by_add( oldauth, CINODE_EXPORT_NONCE );
+ if (in->is_cached_by(mds->get_nodeid()))
+ in->cached_by_remove(mds->get_nodeid());
+
+ // twiddle locks
+ // hard
+ if (in->hardlock.get_state() == LOCK_GLOCKR) {
+ // take me and old auth out of the gather set; re-evaluate once it is empty
+ in->hardlock.gather_set.erase(mds->get_nodeid());
+ in->hardlock.gather_set.erase(oldauth);
+ if (in->hardlock.gather_set.empty())
+ cache->inode_hard_eval(in);
+ }
+
+ // caps: send a FILECAP_REAP to each client in merged_client_caps, naming oldauth
+ for (set<int>::iterator it = merged_client_caps.begin();
+ it != merged_client_caps.end();
+ it++) {
+ MClientFileCaps *caps = new MClientFileCaps(in->inode,
+ in->client_caps[*it].get_last_seq(),
+ in->client_caps[*it].pending(),
+ in->client_caps[*it].wanted(),
+ MClientFileCaps::FILECAP_REAP);
+ caps->set_mds( oldauth ); // reap from whom?
+ mds->messenger->send_message(caps,
+ MSG_ADDR_CLIENT(*it), mds->clientmap.get_inst(*it),
+ 0, MDS_PORT_CACHE);
+ }
+
+ // filelock
+ if (!in->filelock.is_stable()) {
+ // take me and old auth out of gather set
+ in->filelock.gather_set.erase(mds->get_nodeid());
+ in->filelock.gather_set.erase(oldauth);
+ if (in->filelock.gather_set.empty()) // necessary but not sufficient...
+ cache->inode_file_eval(in);
+ }
+
+ // other
+ if (in->is_dirty()) {
+ dout(10) << "logging dirty import " << *in << endl;
+ mds->mdlog->submit_entry(new EInodeUpdate(in));
+ }
+}
+
+
+// Decode one directory (a CDirExport header followed by its dentries) from bl
+// and merge it into the local cache.
+//
+//  bl               - buffer holding the encoded directory state
+//  off              - in/out read position in bl
+//  oldauth          - node the directory is being imported from
+//  import_root      - root dir of this import; waiters taken from 'dir' are
+//                     re-queued on it under CDIR_WAIT_IMPORTED
+//  imported_subdirs - out: ino of every decoded dir other than import_root is appended
+//
+// Returns the number of dentries decoded, or 0 if the dir is hashed (in which
+// case no dentries are read here).
+int Migrator::import_dir_block(bufferlist& bl,
+ int& off,
+ int oldauth,
+ CDir *import_root,
+ list<inodeno_t>& imported_subdirs)
+{
+ // set up dir
+ CDirExport dstate;
+ off = dstate._decode(bl, off);
+
+ // the dir's inode must already be in cache; open the dir if needed
+ CInode *diri = cache->get_inode(dstate.get_ino());
+ assert(diri);
+ CDir *dir = diri->get_or_open_dir(mds);
+ assert(dir);
+
+ dout(7) << " import_dir_block " << *dir << " have " << dir->nitems << " items, importing " << dstate.get_nden() << " dentries" << endl;
+
+ // add to list
+ if (dir != import_root)
+ imported_subdirs.push_back(dir->ino());
+
+ // assimilate state
+ dstate.update_dir( dir );
+ if (diri->is_auth())
+ dir->set_dir_auth( CDIR_AUTH_PARENT ); // update_dir may hose dir_auth
+
+ // mark (may already be marked from get_or_open_dir() above)
+ if (!dir->is_auth())
+ dir->state_set(CDIR_STATE_AUTH);
+
+ // open_by: record the old auth as having the dir open, and drop ourselves
+ assert(!dir->is_open_by(oldauth));
+ dir->open_by_add(oldauth);
+ if (dir->is_open_by(mds->get_nodeid()))
+ dir->open_by_remove(mds->get_nodeid());
+
+ if (dir->is_hashed()) {
+
+ // do nothing; dir is hashed
+ return 0;
+ } else {
+ // take all waiters on this dir
+ // NOTE: a pass of imported data is guaranteed to get all of my waiters because
+ // a replica's presence in my cache implies/forces its presence in authority's.
+ list<Context*> waiters;
+
+ dir->take_waiting(CDIR_WAIT_ANY, waiters);
+ for (list<Context*>::iterator it = waiters.begin();
+ it != waiters.end();
+ it++)
+ import_root->add_waiter(CDIR_WAIT_IMPORTED, *it);
+
+ dout(15) << "doing contents" << endl;
+
+ // contents
+ int num_imported = 0;
+ long nden = dstate.get_nden();
+
+ // each dentry on the wire: name, 1-byte dirty flag, 1-byte type code, then type-specific payload
+ for (; nden>0; nden--) {
+
+ num_imported++;
+
+ // dentry
+ string dname;
+ _decode(dname, bl, off);
+ dout(15) << "dname is " << dname << endl;
+
+ // 'D' means the dentry should be marked dirty (checked at the bottom of the loop)
+ char dirty;
+ bl.copy(off, 1, &dirty);
+ off++;
+
+ // type code: 'N' null, 'L' remote link, 'I' inode
+ char icode;
+ bl.copy(off, 1, &icode);
+ off++;
+
+ CDentry *dn = dir->lookup(dname);
+ if (!dn)
+ dn = dir->add_dentry(dname); // null
+
+ // mark dn dirty _after_ we link the inode (scroll down)
+
+ // NOTE(review): any other icode value is silently ignored here -- confirm that is intended
+ if (icode == 'N') {
+ // null dentry
+ assert(dn->is_null());
+
+ // fall thru
+ }
+ else if (icode == 'L') {
+ // remote link
+ inodeno_t ino;
+ bl.copy(off, sizeof(ino), (char*)&ino);
+ off += sizeof(ino);
+ dir->link_inode(dn, ino);
+ }
+ else if (icode == 'I') {
+ // inode
+ decode_import_inode(dn, bl, off, oldauth);
+ }
+
+ // mark dentry dirty? (only _after_ we link the inode!)
+ if (dirty == 'D') dn->mark_dirty();
+
+ }
+
+ // journal the dir if it ended up dirty
+ if (dir->is_dirty())
+ mds->mdlog->submit_entry(new EDirUpdate(dir));
+
+ return num_imported;
+ }
+}
+
+
+
+
+
+// authority bystander
+
+// Handle an MExportDirWarning for m->get_ino(): remember the warning, and if
+// the matching MExportDirNotify was already received and parked in
+// stray_export_notifies, process it now. Deletes m.
+void Migrator::handle_export_dir_warning(MExportDirWarning *m)
+{
+  // remember that we have been warned about this ino
+  stray_export_warnings.insert( m->get_ino() );
+
+  // is a notify for this ino already parked?
+  map<inodeno_t, MExportDirNotify*>::iterator parked = stray_export_notifies.find(m->get_ino());
+  if (parked == stray_export_notifies.end()) {
+    dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". waiting for notify." << endl;
+  } else {
+    // yes -- we're good.
+    dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". already got notify." << endl;
+
+    // replay the parked notify, then drop it from the stray list
+    handle_export_dir_notify(parked->second);
+    stray_export_notifies.erase(parked);
+  }
+
+  // done with the warning itself
+  delete m;
+}
+
+
+// Handle an MExportDirNotify telling us that the dir for m->get_ino() moved
+// from m->get_old_auth() to m->get_new_auth().
+//
+// If the matching warning has not been seen yet, the message is parked in
+// stray_export_notifies and kept (NOT deleted); handle_export_dir_warning
+// replays it later. Otherwise dir_auth is updated for the dir and for the
+// listed export bounds (when we have them open), an MExportDirNotifyAck is
+// sent to the old auth, the warning is cleared, and m is deleted.
+void Migrator::handle_export_dir_notify(MExportDirNotify *m)
+{
+ // dir stays null if we don't have the inode (or its dir) in cache
+ CDir *dir = 0;
+ CInode *in = cache->get_inode(m->get_ino());
+ if (in) dir = in->dir;
+
+ // did i see the warning yet?
+ if (!stray_export_warnings.count(m->get_ino())) {
+ // wait for it.
+ dout(7) << "export_dir_notify on " << m->get_ino() << ", waiting for warning." << endl;
+ stray_export_notifies.insert(pair<inodeno_t, MExportDirNotify*>( m->get_ino(), m ));
+ return;
+ }
+
+ // i did, we're all good.
+ dout(7) << "export_dir_notify on " << m->get_ino() << ", already saw warning." << endl;
+
+ // update dir_auth!
+ if (dir) {
+ dout(7) << "export_dir_notify on " << *dir << " new_auth " << m->get_new_auth() << " (old_auth " << m->get_old_auth() << ")" << endl;
+
+ // update bounds first
+ // (bounds we don't have in cache, or whose dir isn't open, are skipped)
+ for (list<inodeno_t>::iterator it = m->get_exports().begin();
+ it != m->get_exports().end();
+ it++) {
+ CInode *n = cache->get_inode(*it);
+ if (!n) continue;
+ CDir *ndir = n->dir;
+ if (!ndir) continue;
+
+ // pin down the bound's current authority explicitly if it would otherwise
+ // follow the parent to new_auth; collapse it back to PARENT if it now matches
+ int boundauth = ndir->authority();
+ dout(7) << "export_dir_notify bound " << *ndir << " was dir_auth " << ndir->get_dir_auth() << " (" << boundauth << ")" << endl;
+ if (ndir->get_dir_auth() == CDIR_AUTH_PARENT) {
+ if (boundauth != m->get_new_auth())
+ ndir->set_dir_auth( boundauth );
+ else assert(dir->authority() == m->get_new_auth()); // apparently we already knew!
+ } else {
+ if (boundauth == m->get_new_auth())
+ ndir->set_dir_auth( CDIR_AUTH_PARENT );
+ }
+ }
+
+ // update dir_auth
+ if (in->authority() == m->get_new_auth()) {
+ dout(7) << "handle_export_dir_notify on " << *in << ": inode auth is the same, setting dir_auth -1" << endl;
+ dir->set_dir_auth( CDIR_AUTH_PARENT );
+ assert(!in->is_auth());
+ assert(!dir->is_auth());
+ } else {
+ dir->set_dir_auth( m->get_new_auth() );
+ }
+ // as a bystander we must not end up authoritative for this dir
+ assert(dir->authority() != mds->get_nodeid());
+ assert(!dir->is_auth());
+
+ // DEBUG: verify subdirs
+ if (g_conf.mds_verify_export_dirauth) {
+
+ dout(7) << "handle_export_dir_notify on " << *dir << " checking " << m->num_subdirs() << " subdirs" << endl;
+ for (list<inodeno_t>::iterator it = m->subdirs_begin();
+ it != m->subdirs_end();
+ it++) {
+ CInode *diri = cache->get_inode(*it);
+ if (!diri) continue; // don't have it, don't care
+ if (!diri->dir) continue;
+ dout(10) << "handle_export_dir_notify checking subdir " << *diri->dir << " is auth " << diri->dir->get_dir_auth() << endl;
+ assert(diri->dir != dir); // base shouldn't be in subdir list
+ if (diri->dir->get_dir_auth() != CDIR_AUTH_PARENT) {
+ dout(7) << "*** weird value for dir_auth " << diri->dir->get_dir_auth() << " on " << *diri->dir << ", should have been -1 probably??? ******************" << endl;
+ assert(0); // bad news!
+ //dir->set_dir_auth( CDIR_AUTH_PARENT );
+ }
+ assert(diri->dir->authority() == m->get_new_auth());
+ }
+ }
+ }
+
+ // send notify ack to old auth
+ // (sent whether or not we had the dir open)
+ dout(7) << "handle_export_dir_notify sending ack to old_auth " << m->get_old_auth() << endl;
+ mds->send_message_mds(new MExportDirNotifyAck(m->get_ino()),
+ m->get_old_auth(), MDS_PORT_MIGRATOR);
+
+
+ // done
+ stray_export_warnings.erase( m->get_ino() );
+ delete m;
+}
+
+
+
+
+
+// =======================================================================
+// HASHING
+
+
+// Decode nden dentries from bl into the hashed directory 'dir'.
+//
+// Each entry is: name, a 1-byte type code ('N' null, 'L' remote link,
+// 'I' inode), then the type-specific payload. For an imported inode whose
+// dir is open locally, the dir's IMPORTINGEXPORT pin is dropped and the dir
+// is either folded back in (if we are auth for it) or turned into an export
+// nested under 'dir'. Every decoded dentry is marked dirty.
+//
+//  oldauth - node the content came from
+void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth)
+{
+  int off = 0;
+
+  while (nden > 0) {
+    // dentry name
+    string dname;
+    _decode(dname, bl, off);
+    dout(15) << "dname is " << dname << endl;
+
+    // type code
+    char icode;
+    bl.copy(off, 1, &icode);
+    off++;
+
+    // find the dentry, or create a null one
+    CDentry *dn = dir->lookup(dname);
+    if (!dn)
+      dn = dir->add_dentry(dname);
+
+    // the dentry is marked dirty only _after_ the inode is linked (bottom of loop)
+    switch (icode) {
+    case 'N':
+      // null dentry; nothing more to read
+      assert(dn->is_null());
+      break;
+
+    case 'L':
+      {
+        // remote link: payload is the raw ino
+        inodeno_t ino;
+        bl.copy(off, sizeof(ino), (char*)&ino);
+        off += sizeof(ino);
+        dir->link_inode(dn, ino);
+      }
+      break;
+
+    case 'I':
+      {
+        // primary inode
+        decode_import_inode(dn, bl, off, oldauth);
+
+        // fix up subdir export?
+        CDir *subdir = dn->inode->dir;
+        if (subdir) {
+          assert(subdir->state_test(CDIR_STATE_IMPORTINGEXPORT));
+          subdir->put(CDIR_PIN_IMPORTINGEXPORT);
+          subdir->state_clear(CDIR_STATE_IMPORTINGEXPORT);
+
+          if (!subdir->is_auth()) {
+            // not mine. make it an export.
+            dout(7) << "making subdir into export " << *subdir << endl;
+            subdir->get(CDIR_PIN_EXPORT);
+            subdir->state_set(CDIR_STATE_EXPORT);
+            cache->exports.insert(subdir);
+            cache->nested_exports[dir].insert(subdir);
+
+            if (subdir->get_dir_auth() == CDIR_AUTH_PARENT)
+              subdir->set_dir_auth( oldauth );   // no longer matches inode
+            assert(subdir->get_dir_auth() >= 0);
+          } else {
+            // mine. must have been an import.
+            assert(subdir->is_import());
+            dout(7) << "unimporting subdir now that inode is mine " << *subdir << endl;
+            subdir->set_dir_auth( CDIR_AUTH_PARENT );
+            cache->imports.erase(subdir);
+            subdir->put(CDIR_PIN_IMPORT);
+            subdir->state_clear(CDIR_STATE_IMPORT);
+
+            // move nested under hashdir
+            set<CDir*>::iterator ex = cache->nested_exports[subdir].begin();
+            while (ex != cache->nested_exports[subdir].end()) {
+              cache->nested_exports[dir].insert(*ex);
+              ex++;
+            }
+            cache->nested_exports.erase(subdir);
+
+            // now it matches the inode
+            subdir->set_dir_auth( CDIR_AUTH_PARENT );
+          }
+        }
+      }
+      break;
+
+    default:
+      // other codes carry no payload we know how to read
+      break;
+    }
+
+    // mark dentry dirty (only _after_ we link the inode!)
+    dn->mark_dirty();
+
+    nden--;
+  }
+}
+
+/*
+
+ notes on interaction of hashing and export/import:
+
+ - dir->is_auth() is completely independent of hashing. for a hashed dir,
+ - all nodes are partially authoritative
+ - all nodes dir->is_hashed() == true
+ - all nodes dir->inode->dir_is_hashed() == true
+ - one node dir->is_auth() == true, the rest == false
+ - dir_auth for all subdirs in a hashed dir will (likely?) be explicit.
+
+ - remember simple rule: dir auth follows inode, unless dir_auth is explicit.
+
+ - export_dir_walk and import_dir_block take care with dir_auth: (for import/export)
+ - on export, -1 is changed to mds->get_nodeid()
+ - on import, nothing special, actually.
+
+ - hashed dir files aren't included in export; subdirs are converted to imports
+ or exports as necessary.
+ - hashed dir subdirs are discovered on export. this is important
+ because dirs are needed to tie together auth hierarchy, for auth to know about
+ imports/exports, etc.
+
+ - dir state is maintained on auth.
+ - COMPLETE and HASHED are transferred to importers.
+ - DIRTY is set everywhere.
+
+ - hashed dir is like an import: hashed dir used for nested_exports map.
+ - nested_exports is updated appropriately on auth and replicas.
+ - a subtree terminates as a hashed dir, since the hashing explicitly
+ redelegates all inodes. thus export_dir_walk includes hashed dirs, but
+ not their inodes.
+*/
+
+// HASH on auth
+
+// Completion context: when finished, calls Migrator::hash_dir_frozen(dir).
+// The result code passed to finish() is ignored.
+class C_MDC_HashFreeze : public Context {
+public:
+  Migrator *mig;   // migrator to call back into
+  CDir *dir;       // dir passed to the callback
+  C_MDC_HashFreeze(Migrator *migrator, CDir *target) {
+    mig = migrator;
+    dir = target;
+  }
+  virtual void finish(int r) {
+    mig->hash_dir_frozen(dir);
+  }
+};
+
+// Completion context: when finished, calls Migrator::hash_dir_complete(dir).
+// The result code passed to finish() is ignored.
+class C_MDC_HashComplete : public Context {
+public:
+  Migrator *mig;   // migrator to call back into
+  CDir *dir;       // dir passed to the callback
+  C_MDC_HashComplete(Migrator *migrator, CDir *target) : mig(migrator), dir(target) {}
+  virtual void finish(int r) {
+    mig->hash_dir_complete(dir);
+  }
+};
+
+
+/** hash_dir(dir)
+ * start hashing a directory.
+ *
+ * Bails out (with a log line) if the dir is frozen/freezing or its path
+ * cannot be pinned. Otherwise: marks the dir HASHING, sends an
+ * MHashDirDiscover to every other mds and auth_pins the dir until they ack,
+ * starts freezing the dir, and fetches it if it is not yet complete.
+ */
+void Migrator::hash_dir(CDir *dir)
+{
+  dout(-7) << "hash_dir " << *dir << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+
+  if (dir->is_frozen() || dir->is_freezing()) {
+    dout(7) << " can't hash, freezing|frozen." << endl;
+    return;
+  }
+
+  // pin path?
+  vector<CDentry*> trace;
+  cache->make_trace(trace, dir->inode);
+  const bool pinned = cache->path_pin(trace, 0, 0);
+  if (!pinned) {
+    dout(7) << "hash_dir couldn't pin path, failing." << endl;
+    return;
+  }
+
+  // ok, go
+  dir->state_set(CDIR_STATE_HASHING);
+  dir->get(CDIR_PIN_HASHING);
+  assert(dir->hashed_subset.empty());
+
+  // discover on all mds (except me)
+  assert(hash_gather.count(dir) == 0);
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); peer++) {
+    if (peer != mds->get_nodeid()) {
+      hash_gather[dir].insert(peer);
+      mds->send_message_mds(new MHashDirDiscover(dir->inode), peer, MDS_PORT_MIGRATOR);
+    }
+  }
+  dir->auth_pin();   // pin until discovers are all acked.
+
+  // start freeze
+  dir->freeze_dir(new C_MDC_HashFreeze(this, dir));
+
+  // make complete
+  if (dir->is_complete()) {
+    hash_dir_complete(dir);
+  } else {
+    dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl;
+    mds->mdstore->fetch_dir(dir,
+                            new C_MDC_HashComplete(this, dir));
+  }
+}
+
+
+/*
+ * wait for everybody to discover and open the hashing dir
+ * then auth_unpin, to let the freeze happen
+ *
+ * each ack removes its sender from hash_gather[dir]; the last one
+ * erases the gather entry and drops the auth pin. deletes m.
+ */
+void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  // cross the sender off the gather list
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(hash_gather[dir].count(from));
+  hash_gather[dir].erase(from);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl;
+  } else {
+    hash_gather.erase(dir);
+    dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
+    dir->auth_unpin();   // unpin to allow freeze to complete
+  }
+
+  delete m;  // done
+}
+
+
+
+/*
+ * once the dir is completely in memory,
+ * mark all inodes linked in it dirty (to pin in cache)
+ *
+ * dentries with no linked inode are skipped. if the dir is already
+ * frozen, continues straight to hash_dir_go().
+ *
+ * NOTE(review): going directly to hash_dir_go() here bypasses the prep
+ * messages that hash_dir_frozen() sends -- confirm that is intended.
+ */
+void Migrator::hash_dir_complete(CDir *dir)
+{
+  dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+
+  // mark dirty to pin in cache
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CInode *in = it->second->inode;
+    if (!in) continue;   // no inode linked to this dentry; nothing to dirty
+    in->mark_dirty();
+  }
+
+  if (dir->is_frozen_dir())
+    hash_dir_go(dir);
+}
+
+
+/*
+ * once the dir is frozen,
+ * make sure it's complete
+ * send the prep messages!
+ *
+ * one MHashDirPrep is built per peer that will receive at least one open
+ * subdirectory; each such peer is added to hash_gather[dir] and we wait
+ * for its ack. if no peer needs a prep, continues to hash_dir_go().
+ * if the dir is not complete yet, returns without doing anything.
+ */
+void Migrator::hash_dir_frozen(CDir *dir)
+{
+  dout(7) << "hash_dir_frozen " << *dir << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+
+  if (!dir->is_complete()) {
+    dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl;
+    return;
+  }
+
+  // send prep messages w/ export directories to open
+  // (one slot per mds, all initially null; allocated on first use)
+  vector<MHashDirPrep*> msgs(mds->get_mds_map()->get_num_mds());
+
+  // check for subdirs
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+
+    if (!in) continue;          // no inode linked to this dentry
+    if (!in->is_dir()) continue;
+    if (!in->dir) continue;     // subdir not open here
+
+    // which node does this dentry hash to?  skip the ones that stay with me.
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode == mds->get_nodeid()) continue;
+
+    // msg?
+    if (msgs[dentryhashcode] == 0) {
+      msgs[dentryhashcode] = new MHashDirPrep(dir->ino());
+    }
+    msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode));
+  }
+
+  // send them!
+  assert(hash_gather[dir].empty());
+  for (unsigned i=0; i<msgs.size(); i++) {
+    if (msgs[i]) {
+      mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR);
+      hash_gather[dir].insert(i);
+    }
+  }
+
+  if (hash_gather[dir].empty()) {
+    // no subdirs! continue!
+    hash_gather.erase(dir);
+    hash_dir_go(dir);
+  } else {
+    // wait!
+  }
+}
+
+/*
+ * wait for peers to open all subdirs
+ *
+ * each prep ack removes its sender from hash_gather[dir]; the last
+ * one erases the gather entry and calls hash_dir_go(). deletes m.
+ */
+void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  // cross the sender off the gather list
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(hash_gather[dir].count(from) == 1);
+  hash_gather[dir].erase(from);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
+  } else {
+    hash_gather.erase(dir);
+    dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl;
+    hash_dir_go(dir);
+  }
+
+  delete m;
+}
+
+
+/*
+ * once the dir is frozen,
+ * make sure it's complete
+ * do the hashing!
+ *
+ * every dentry that hashes to another node is encoded into that node's
+ * MHashDir message (name, then 'N' / 'L'+ino / 'I'+inode state). exported
+ * inodes are marked PROXY until hash_dir_finish(). open subdirs of exported
+ * inodes become imports (if we are auth for them) or stop being exports.
+ * the dir is then marked HASHED and journaled, nested_exports is fixed up,
+ * and the messages are sent; hash_gather / hash_notify_gather track the
+ * acks and notifies we expect back.
+ *
+ * NOTE(review): the contexts collected into 'fin' are not queued or finished
+ * anywhere in this function -- confirm where they are meant to be completed.
+ */
+void Migrator::hash_dir_go(CDir *dir)
+{
+  dout(7) << "hash_dir_go " << *dir << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+
+  // get messages to other nodes ready
+  vector<MHashDir*> msgs(mds->get_mds_map()->get_num_mds());
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    if (i == mds->get_nodeid()) continue;
+    msgs[i] = new MHashDir(dir->ino());
+  }
+
+  // pick a hash seed.
+  dir->inode->inode.hash_seed = 1;//dir->ino();
+
+  // suck up all waiters
+  C_Contexts *fin = new C_Contexts;
+  list<Context*> waiting;
+  dir->take_waiting(CDIR_WAIT_ANY, waiting);    // all dir waiters
+  fin->take(waiting);
+
+  // get containing import.  might be me.
+  CDir *containing_import = cache->get_auth_container(dir);
+  assert(containing_import != dir || dir->is_import());
+
+  // divy up contents
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode == mds->get_nodeid()) {
+      continue;      // still mine!
+    }
+
+    bufferlist *bl = msgs[dentryhashcode]->get_state_ptr();
+    assert(bl);
+
+    // -- dentry
+    dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
+    _encode(it->first, *bl);
+
+    // count every dentry we encode (null, remote and primary alike):
+    // import_hashed_content() decodes exactly nden entries of any type, so
+    // counting only primary inodes would make it stop before the end.
+    msgs[dentryhashcode]->inc_nden();
+
+    // null dentry?
+    if (dn->is_null()) {
+      bl->append("N", 1);  // null dentry
+      assert(dn->is_sync());
+      continue;
+    }
+
+    if (dn->is_remote()) {
+      // remote link
+      bl->append("L", 1);  // remote link
+
+      inodeno_t ino = dn->get_remote_ino();
+      bl->append((char*)&ino, sizeof(ino));
+      continue;
+    }
+
+    // primary link
+    // -- inode
+    bl->append("I", 1);    // inode dentry
+
+    encode_export_inode(in, *bl, dentryhashcode);  // encode, and (update state for) export
+
+    if (dn->is_dirty())
+      dn->mark_clean();
+
+    // add to proxy
+    hash_proxy_inos[dir].push_back(in);
+    in->state_set(CINODE_STATE_PROXY);
+    in->get(CINODE_PIN_PROXY);
+
+    // fix up subdirs
+    if (in->dir) {
+      if (in->dir->is_auth()) {
+        // mine.  make it into an import.
+        dout(7) << "making subdir into import " << *in->dir << endl;
+        in->dir->set_dir_auth( mds->get_nodeid() );
+        cache->imports.insert(in->dir);
+        in->dir->get(CDIR_PIN_IMPORT);
+        in->dir->state_set(CDIR_STATE_IMPORT);
+
+        // fix nested bits
+        // (iterator is advanced before the erase so it stays valid)
+        for (set<CDir*>::iterator p = cache->nested_exports[containing_import].begin();
+             p != cache->nested_exports[containing_import].end(); ) {
+          CDir *ex = *p;
+          p++;
+          if (cache->get_auth_container(ex) == in->dir) {
+            dout(10) << "moving nested export " << *ex << endl;
+            cache->nested_exports[containing_import].erase(ex);
+            cache->nested_exports[in->dir].insert(ex);
+          }
+        }
+      }
+      else {
+        // not mine.
+        dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl;
+        assert(in->dir->is_export());
+        in->dir->put(CDIR_PIN_EXPORT);
+        in->dir->state_clear(CDIR_STATE_EXPORT);
+        cache->exports.erase(in->dir);
+        cache->nested_exports[containing_import].erase(in->dir);
+        if (in->dir->authority() == dentryhashcode)
+          in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+        else
+          in->dir->set_dir_auth( in->dir->authority() );
+      }
+    }
+
+    // waiters
+    list<Context*> waiters;
+    in->take_waiting(CINODE_WAIT_ANY, waiters);
+    fin->take(waiters);
+  }
+
+  // dir state
+  dir->state_set(CDIR_STATE_HASHED);
+  dir->get(CDIR_PIN_HASHED);
+  cache->hashdirs.insert(dir);
+  dir->mark_dirty();
+  mds->mdlog->submit_entry(new EDirUpdate(dir));
+
+  // inode state
+  if (dir->inode->is_auth()) {
+    dir->inode->mark_dirty();
+    mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+  }
+
+  // fix up nested_exports?
+  if (containing_import != dir) {
+    dout(7) << "moving nested exports under hashed dir" << endl;
+    for (set<CDir*>::iterator p = cache->nested_exports[containing_import].begin();
+         p != cache->nested_exports[containing_import].end(); ) {
+      CDir *ex = *p;
+      p++;
+      if (cache->get_auth_container(ex) == dir) {
+        dout(7) << " moving nested export under hashed dir: " << *ex << endl;
+        cache->nested_exports[containing_import].erase(ex);
+        cache->nested_exports[dir].insert(ex);
+      } else {
+        dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl;
+      }
+    }
+  }
+
+  // send hash messages
+  assert(hash_gather[dir].empty());
+  assert(hash_notify_gather[dir].empty());
+  assert(dir->hashed_subset.empty());
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    // all nodes hashed locally..
+    dir->hashed_subset.insert(i);
+
+    if (i == mds->get_nodeid()) continue;
+
+    // init hash_gather and hash_notify_gather sets
+    hash_gather[dir].insert(i);
+
+    // for peer i we expect a notify relayed by every other peer j
+    assert(hash_notify_gather[dir][i].empty());
+    for (int j=0; j<mds->get_mds_map()->get_num_mds(); j++) {
+      if (j == mds->get_nodeid()) continue;
+      if (j == i) continue;
+      hash_notify_gather[dir][i].insert(j);
+    }
+
+    mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR);
+  }
+
+  // wait for all the acks.
+}
+
+
+// Handle an MHashDirAck from a peer: remove the sender from hash_gather[dir].
+// When the last ack arrives and no notifies are outstanding either, finish
+// the hash via hash_dir_finish(). Deletes m.
+void Migrator::handle_hash_dir_ack(MHashDirAck *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  assert(dir->is_hashed());
+  assert(dir->is_hashing());
+
+  // cross the sender off the ack list
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(hash_gather[dir].count(from) == 1);
+  hash_gather[dir].erase(from);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
+  } else {
+    dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl;
+
+    // acks are all in; are the notifies?
+    if (!hash_notify_gather[dir].empty()) {
+      dout(7) << "waiting on notifies " << endl;
+    } else {
+      dout(7) << "got notifies too, all done" << endl;
+      hash_dir_finish(dir);
+    }
+  }
+
+  delete m;
+}
+
+
+// Wrap up hashing of 'dir' on the auth: clear the HASHING state and pin,
+// release the PROXY state/pin on every inode recorded in hash_proxy_inos[dir],
+// unpin the path to the dir, and unfreeze it.
+void Migrator::hash_dir_finish(CDir *dir)
+{
+  dout(7) << "hash_dir_finish finishing " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_hashing());
+
+  // dir state
+  hash_gather.erase(dir);
+  dir->state_clear(CDIR_STATE_HASHING);
+  dir->put(CDIR_PIN_HASHING);
+  dir->hashed_subset.clear();
+
+  // unproxy inodes
+  //  this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds.
+  list<CInode*>::iterator p = hash_proxy_inos[dir].begin();
+  while (p != hash_proxy_inos[dir].end()) {
+    CInode *proxied = *p;
+    assert(proxied->state_test(CINODE_STATE_PROXY));
+    proxied->state_clear(CINODE_STATE_PROXY);
+    proxied->put(CINODE_PIN_PROXY);
+    p++;
+  }
+  hash_proxy_inos.erase(dir);
+
+  // unpin path
+  vector<CDentry*> trace;
+  cache->make_trace(trace, dir->inode);
+  cache->path_unpin(trace, 0);
+
+  // unfreeze
+  dir->unfreeze_dir();
+
+  show_imports();
+  assert(hash_gather.count(dir) == 0);
+
+  // stats
+  //mds->logger->inc("nh", 1);
+}
+
+
+
+
+// HASH on auth and non-auth
+
+// Handle an MHashDirNotify saying that node m->get_from() has hashed its part
+// of the dir for m->get_ino().
+//
+// On the dir's auth: the notify is one of the relayed copies being gathered
+// in hash_notify_gather[dir][from]; when every notify (and every ack) is in,
+// hash_dir_finish() runs. The message is deleted.
+//
+// On a non-auth node: 'from' is added to the dir's hashed_subset, dir_auth of
+// open subdirs that hash to 'from' is adjusted, 'from' is removed from
+// hash_gather[dir] (clearing the HASHING state on the last one), and the
+// message itself is forwarded to the auth (so it is NOT deleted here).
+void Migrator::handle_hash_dir_notify(MHashDirNotify *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+ assert(dir->is_hashing());
+
+ dout(5) << "handle_hash_dir_notify " << *dir << endl;
+ // 'from' is the node the notify is about; 'source' is who delivered this copy to us
+ int from = m->get_from();
+
+ int source = MSG_ADDR_NUM(m->get_source());
+ if (dir->is_auth()) {
+ // gather notifies
+ assert(dir->is_hashed());
+
+ assert( hash_notify_gather[dir][from].count(source) );
+ hash_notify_gather[dir][from].erase(source);
+
+ if (hash_notify_gather[dir][from].empty()) {
+ dout(7) << "last notify from " << from << endl;
+ hash_notify_gather[dir].erase(from);
+
+ if (hash_notify_gather[dir].empty()) {
+ dout(7) << "last notify!" << endl;
+ hash_notify_gather.erase(dir);
+
+ // notifies are all in; are the acks?
+ if (hash_gather[dir].empty()) {
+ dout(7) << "got acks too, all done" << endl;
+ hash_dir_finish(dir);
+ } else {
+ dout(7) << "still waiting on acks from " << hash_gather[dir] << endl;
+ }
+ } else {
+ dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl;
+ }
+ } else {
+ dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl;
+ }
+
+ // delete msg
+ delete m;
+ } else {
+ // update dir hashed_subset
+ assert(dir->hashed_subset.count(from) == 0);
+ dir->hashed_subset.insert(from);
+
+ // update open subdirs
+ for (CDir_map_t::iterator it = dir->begin();
+ it != dir->end();
+ it++) {
+ CDentry *dn = it->second;
+ // note: this 'in' shadows the dir inode 'in' declared at the top of the function
+ CInode *in = dn->get_inode();
+ if (!in) continue;
+ if (!in->dir) continue;
+
+ // only dentries that hash to 'from' are touched here
+ int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+ if (dentryhashcode != from) continue; // we'll import these in a minute
+
+ // make dir_auth explicit if the subdir's authority differs from the node
+ // its dentry hashes to; otherwise let it follow the parent
+ if (in->dir->authority() != dentryhashcode)
+ in->dir->set_dir_auth( in->dir->authority() );
+ else
+ in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+ }
+
+ // remove from notify gather set
+ assert(hash_gather[dir].count(from));
+ hash_gather[dir].erase(from);
+
+ // last notify?
+ if (hash_gather[dir].empty()) {
+ dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl;
+ hash_gather.erase(dir);
+
+ dir->state_clear(CDIR_STATE_HASHING);
+ dir->put(CDIR_PIN_HASHING);
+ dir->hashed_subset.clear();
+ } else {
+ dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
+ }
+
+ // fw notify to auth
+ // (the same message object is re-sent, which is why it is not deleted in this branch)
+ mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR);
+ }
+}
+
+
+
+
+// HASH on non-auth
+
+/*
+ * discover step:
+ *  each peer needs to open up the directory and pin it before we start
+ *
+ * completion context for the path traversal started by
+ * handle_hash_dir_discover(). 'trace' is filled in by the traversal; on
+ * success the target inode is the last dentry's inode, or the cache root
+ * when the trace is empty. the result is handed to
+ * handle_hash_dir_discover_2().
+ */
+class C_MDC_HashDirDiscover : public Context {
+  Migrator *mig;
+  MHashDirDiscover *m;
+public:
+  vector<CDentry*> trace;
+  C_MDC_HashDirDiscover(Migrator *migrator, MHashDirDiscover *msg) : mig(migrator), m(msg) {}
+  void finish(int r) {
+    CInode *in = 0;   // stays null on error (r < 0)
+    if (r >= 0)
+      in = trace.empty() ? mig->cache->get_root() : trace.back()->get_inode();
+    mig->handle_hash_dir_discover_2(m, in, r);
+  }
+};
+
+// Handle an MHashDirDiscover from another mds: traverse (discovering as
+// needed) to the path named in the message. The traversal fills the
+// completion context's trace and then calls handle_hash_dir_discover_2();
+// on delay the message is retried.
+void Migrator::handle_hash_dir_discover(MHashDirDiscover *m)
+{
+  assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid());
+
+  dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl;
+
+  // must discover it!
+  C_MDC_HashDirDiscover *done = new C_MDC_HashDirDiscover(this, m);
+  filepath target(m->get_path());
+  C_MDS_RetryMessage *retry = new C_MDS_RetryMessage(mds, m);
+  cache->path_traverse(target, done->trace, true,
+                       m, retry,                  // on delay/retry
+                       MDS_TRAVERSE_DISCOVER,
+                       done);                     // on completion|error
+}
+
+// Second half of the discover step, run once the path traversal finished.
+//
+//  m  - the original discover message (deleted once the ack is sent)
+//  in - inode the traversal ended on, or null on failure
+//  r  - traversal result; negative on error
+//
+// If the dir is not open yet, it is opened and the message retried.
+// Otherwise the dir is marked HASHING and pinned, the hash seed is set,
+// hash_gather[dir] is primed with every mds other than us and the dir's
+// authority, and an MHashDirDiscoverAck is sent back to the sender.
+void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r)
+{
+  // yay!
+  if (in) {
+    dout(7) << "handle_hash_dir_discover_2 has " << *in << endl;
+  }
+
+  if (r < 0 || !in->is_dir()) {
+    dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
+    assert(0);    // this shouldn't happen if the auth pins his path properly!!!!
+  }
+  assert(in->is_dir());
+
+  // is dir open?
+  CDir *dir = in->dir;
+  if (!dir) {
+    dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl;
+    cache->open_remote_dir(in,
+                           new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  // pin dir, set hashing flag
+  dir->state_set(CDIR_STATE_HASHING);
+  dir->get(CDIR_PIN_HASHING);
+  assert(dir->hashed_subset.empty());
+
+  // inode state
+  CInode *diri = dir->inode;
+  diri->inode.hash_seed = 1;// dir->ino();
+  if (diri->is_auth()) {
+    diri->mark_dirty();
+    mds->mdlog->submit_entry(new EInodeUpdate(diri));
+  }
+
+  // get gather set ready for notifies: everyone but me and the dir's authority
+  assert(hash_gather[dir].empty());
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); peer++) {
+    if (peer == mds->get_nodeid() || peer == dir->authority()) continue;
+    hash_gather[dir].insert(peer);
+  }
+
+  // reply
+  dout(7) << " sending hash_dir_discover_ack on " << *dir << endl;
+  mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()),
+                        m->get_source().num(), MDS_PORT_MIGRATOR);
+  delete m;
+}
+
+/*
+ * prep step:
+ * peers need to open up all subdirs of the hashed dir
+ *
+ * this handler can run more than once for the same message: when a subdir
+ * still has to be opened, the message is queued for retry and we return
+ * without acking. the did_assim()/mark_assim() flag on the message makes
+ * sure the inode assimilation below only happens on the first pass.
+ * once every listed inode has its dir open (and pinned IMPORTINGEXPORT),
+ * an MHashDirPrepAck is sent to the sender and the message is deleted.
+ */
+
+void Migrator::handle_hash_dir_prep(MHashDirPrep *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+
+ dout(7) << "handle_hash_dir_prep " << *dir << endl;
+
+ if (!m->did_assim()) {
+ m->mark_assim(); // only do this the first time!
+
+ // assimilate dentry+inodes for exports
+ for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ // note: this 'in' shadows the dir inode 'in' declared at the top of the function
+ CInode *in = cache->get_inode( it->second->get_ino() );
+ if (in) {
+ it->second->update_inode(in);
+ dout(5) << " updated " << *in << endl;
+ } else {
+ in = new CInode(false);
+ it->second->update_inode(in);
+ cache->add_inode(in);
+
+ // link
+ dir->add_dentry( it->first, in );
+ dout(5) << " added " << *in << endl;
+ }
+
+ // open!
+ // NOTE(review): one retry context is queued per unopened dir, all pointing at the
+ // same message, while the ack path below deletes m -- confirm a later retry cannot
+ // fire after m has been deleted.
+ if (!in->dir) {
+ dout(5) << " opening nested export on " << *in << endl;
+ cache->open_remote_dir(in,
+ new C_MDS_RetryMessage(mds, m));
+ }
+ }
+ }
+
+ // verify!
+ // (runs on every pass: pin the dirs that are open, count the ones that are not)
+ int waiting_for = 0;
+ for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ CInode *in = cache->get_inode( it->second->get_ino() );
+ assert(in);
+
+ if (in->dir) {
+ // the state flag guards against taking the pin twice across retries
+ if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
+ dout(5) << " pinning nested export " << *in->dir << endl;
+ in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
+ in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
+ } else {
+ dout(5) << " already pinned nested export " << *in << endl;
+ }
+ } else {
+ dout(5) << " waiting for nested export dir on " << *in << endl;
+ waiting_for++;
+ }
+ }
+
+ // not ready yet: keep the message (a retry will bring us back here)
+ if (waiting_for) {
+ dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
+ return;
+ }
+
+ // ack!
+ mds->send_message_mds(new MHashDirPrepAck(dir->ino()),
+ m->get_source().num(), MDS_PORT_MIGRATOR);
+
+ // done.
+ delete m;
+}
+
+
+/*
+ * hash step:
+ */
+
+void Migrator::handle_hash_dir(MHashDir *m)
+{
+  // Peer-side "hash" handler: import our share of the dir's content from the
+  // sender, mark the dir hashed, commit it, notify the other nodes and ack.
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+  assert(!dir->is_auth());
+  assert(!dir->is_hashed());
+  assert(dir->is_hashing());
+
+  dout(5) << "handle_hash_dir " << *dir << endl;
+  int oldauth = MSG_ADDR_NUM(m->get_source());
+
+  // content
+  import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth);
+
+  // dir state: hashed, pinned, and we hold one subset of it
+  dir->state_set(CDIR_STATE_HASHED);
+  dir->get(CDIR_PIN_HASHED);
+  cache->hashdirs.insert(dir);
+  dir->hashed_subset.insert(mds->get_nodeid());
+
+  // dir is complete
+  dir->mark_complete();
+  dir->mark_dirty();
+  mds->mdlog->submit_entry(new EDirUpdate(dir));
+
+  // commit
+  mds->mdstore->commit_dir(dir, 0);
+
+  // send notifies to everyone except ourselves and the sender
+  dout(7) << "sending notifies" << endl;
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); peer++) {
+    if (peer == mds->get_nodeid() ||
+        peer == MSG_ADDR_NUM(m->get_source()))
+      continue;
+    mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()),
+                          peer, MDS_PORT_MIGRATOR);
+  }
+
+  // ack
+  dout(7) << "acking" << endl;
+  mds->send_message_mds(new MHashDirAck(dir->ino()),
+                        m->get_source().num(), MDS_PORT_MIGRATOR);
+
+  // done.
+  delete m;
+
+  show_imports();
+}
+
+
+
+
+
+// UNHASH on auth
+
+class C_MDC_UnhashFreeze : public Context {
+public:
+  Migrator *mig;   // migrator to call back into
+  CDir *dir;       // dir handed back to the migrator on completion
+
+  C_MDC_UnhashFreeze(Migrator *migrator, CDir *target)
+    : mig(migrator), dir(target)
+  { }
+
+  // Completion hook: the result code is ignored, we just resume the unhash
+  // at unhash_dir_frozen().
+  virtual void finish(int /*r*/) {
+    mig->unhash_dir_frozen(dir);
+  }
+};
+
+class C_MDC_UnhashComplete : public Context {
+public:
+  Migrator *mig;   // migrator to call back into
+  CDir *dir;       // dir handed back to the migrator on completion
+
+  C_MDC_UnhashComplete(Migrator *migrator, CDir *target)
+    : mig(migrator), dir(target)
+  { }
+
+  // Completion hook: the result code is ignored, we just resume the unhash
+  // at unhash_dir_complete().
+  virtual void finish(int /*r*/) {
+    mig->unhash_dir_complete(dir);
+  }
+};
+
+
+void Migrator::unhash_dir(CDir *dir)
+{
+  // Entry point (auth side) for unhashing a hashed dir: pin the path, flag
+  // the dir as unhashing, then kick off both the freeze and the fetch.  The
+  // two complete independently; see unhash_dir_frozen / unhash_dir_complete.
+  dout(-7) << "unhash_dir " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(!dir->is_unhashing());
+  assert(dir->is_auth());
+  assert(hash_gather.count(dir)==0);
+
+  // pin path?
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  bool pinned = cache->path_pin(path, 0, 0);
+  if (!pinned) {
+    dout(7) << "unhash_dir couldn't pin path, failing." << endl;
+    return;
+  }
+
+  // twiddle state
+  dir->state_set(CDIR_STATE_UNHASHING);
+
+  // first, freeze the dir.
+  dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir));
+
+  // make complete
+  if (dir->is_complete()) {
+    unhash_dir_complete(dir);
+  } else {
+    dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
+    mds->mdstore->fetch_dir(dir,
+                            new C_MDC_UnhashComplete(this, dir));
+  }
+}
+
+void Migrator::unhash_dir_frozen(CDir *dir)
+{
+  // Freeze callback on the auth: move on to the prep step only if the dir
+  // is already complete; otherwise unhash_dir_complete() will pick it up.
+  dout(7) << "unhash_dir_frozen " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+
+  if (dir->is_complete()) {
+    unhash_dir_prep(dir);
+  } else {
+    dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl;
+  }
+}
+
+
+/*
+ * ask peers to freeze and complete hashed dir
+ */
+void Migrator::unhash_dir_prep(CDir *dir)
+{
+  dout(7) << "unhash_dir_prep " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+  assert(dir->is_complete());
+
+  // Both the freeze path and the complete path can land here; a non-empty
+  // gather set means the prep messages already went out.
+  if (!hash_gather[dir].empty())
+    return;                    // already been here..freeze must have been instantaneous
+
+  // send unhash prep to all peers, remembering whom we expect an ack from
+  assert(hash_gather[dir].empty());
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); peer++) {
+    if (peer != mds->get_nodeid()) {
+      hash_gather[dir].insert(peer);
+      mds->send_message_mds(new MUnhashDirPrep(dir->ino()),
+                            peer, MDS_PORT_MIGRATOR);
+    }
+  }
+}
+
+/*
+ * wait for peers to freeze and complete hashed dirs
+ */
+void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m)
+{
+  // Auth-side handler for a peer's prep ack: assimilate the subdir inodes
+  // the peer sent back, wait until all of their dirs are open and pinned,
+  // then tick the peer off the gather set.
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  int from = MSG_ADDR_NUM(m->get_source());
+  dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl;
+
+  // Assimilate only on the first pass over this message; presumably the
+  // C_MDS_RetryMessage waiters below hand the same message back to us later.
+  if (!m->did_assim()) {
+    m->mark_assim();
+
+    // assimilate dentry+inodes for exports
+    map<string,CInodeDiscover*>::iterator p = m->get_inodes().begin();
+    while (p != m->get_inodes().end()) {
+      CInodeDiscover *dis = p->second;
+      CInode *sub = cache->get_inode( dis->get_ino() );
+      if (!sub) {
+        // not in cache yet: build it from the discover record and link it in
+        sub = new CInode(false);
+        dis->update_inode(sub);
+        cache->add_inode(sub);
+        dir->add_dentry( p->first, sub );
+        dout(5) << " added " << *sub << endl;
+      } else {
+        dis->update_inode(sub);
+        dout(5) << " updated " << *sub << endl;
+      }
+
+      // open!
+      if (!sub->dir) {
+        dout(5) << " opening nested export on " << *sub << endl;
+        cache->open_remote_dir(sub,
+                               new C_MDS_RetryMessage(mds, m));
+      }
+      ++p;
+    }
+  }
+
+  // verify: count the dirs that are still not open, pin the ones that are
+  int waiting_for = 0;
+  for (map<string,CInodeDiscover*>::iterator p = m->get_inodes().begin();
+       p != m->get_inodes().end();
+       ++p) {
+    CInode *sub = cache->get_inode( p->second->get_ino() );
+    assert(sub);
+
+    CDir *nested = sub->dir;
+    if (!nested) {
+      dout(5) << " waiting for nested export dir on " << *sub << endl;
+      waiting_for++;
+      continue;
+    }
+    if (nested->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
+      dout(5) << " already pinned nested export " << *sub << endl;
+      continue;
+    }
+    dout(5) << " pinning nested export " << *nested << endl;
+    nested->get(CDIR_PIN_IMPORTINGEXPORT);
+    nested->state_set(CDIR_STATE_IMPORTINGEXPORT);
+  }
+
+  if (waiting_for) {
+    // keep the message around until every nested dir is open
+    dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
+    return;
+  }
+
+  // ok, done with this PrepAck
+  assert(hash_gather[dir].count(from) == 1);
+  hash_gather[dir].erase(from);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
+  } else {
+    // drop the (now empty) gather entry before the next phase refills it
+    hash_gather.erase(dir);
+    dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl;
+    unhash_dir_go(dir);
+  }
+
+  delete m;
+}
+
+
+/*
+ * auth:
+ * send out MUnhashDir's to peers
+ */
+void Migrator::unhash_dir_go(CDir *dir)
+{
+  dout(7) << "unhash_dir_go " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+  assert(dir->is_complete());
+
+  // send the actual unhash request to every peer, and start a fresh gather
+  // set for the MUnhashDirAck replies
+  assert(hash_gather[dir].empty());
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); peer++) {
+    if (peer != mds->get_nodeid()) {
+      hash_gather[dir].insert(peer);
+      mds->send_message_mds(new MUnhashDir(dir->ino()),
+                            peer, MDS_PORT_MIGRATOR);
+    }
+  }
+}
+
+/*
+ * auth:
+ * assimilate unhashing content
+ */
+void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m)
+{
+  // Auth-side handler: import the content a peer sent back.  Once every
+  // peer has answered, clear the hashed state, commit, and notify everyone.
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(7) << "handle_unhash_dir_ack " << *dir << endl;
+  assert(dir->is_hashed());
+
+  // assimilate content (the message is not touched again after this)
+  int from = MSG_ADDR_NUM(m->get_source());
+  import_hashed_content(dir, m->get_state(), m->get_nden(), from);
+  delete m;
+
+  // done?
+  assert(hash_gather[dir].count(from));
+  hash_gather[dir].erase(from);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl;
+    return;
+  }
+
+  // done!
+
+  // fix up nested_exports: hand them to the containing import, if that is
+  // not this dir itself
+  CDir *containing_import = cache->get_auth_container(dir);
+  if (containing_import != dir) {
+    set<CDir*>::iterator p = cache->nested_exports[dir].begin();
+    while (p != cache->nested_exports[dir].end()) {
+      dout(7) << "moving nested export out from under hashed dir : " << **p << endl;
+      cache->nested_exports[containing_import].insert(*p);
+      ++p;
+    }
+    cache->nested_exports.erase(dir);
+  }
+
+  // dir state
+  //dir->state_clear(CDIR_STATE_UNHASHING); //later
+  dir->state_clear(CDIR_STATE_HASHED);
+  dir->put(CDIR_PIN_HASHED);
+  cache->hashdirs.erase(dir);
+
+  // commit!
+  assert(dir->is_complete());
+  //dir->mark_complete();
+  dir->mark_dirty();
+  mds->mdstore->commit_dir(dir, 0);
+
+  // inode state
+  dir->inode->inode.hash_seed = 0;
+  if (dir->inode->is_auth()) {
+    dir->inode->mark_dirty();
+    mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+  }
+
+  // notify every peer, and gather their notify acks
+  assert(hash_gather[dir].empty());
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); peer++) {
+    if (peer != mds->get_nodeid()) {
+      hash_gather[dir].insert(peer);
+      mds->send_message_mds(new MUnhashDirNotify(dir->ino()),
+                            peer, MDS_PORT_MIGRATOR);
+    }
+  }
+}
+
+
+/*
+ * sent by peer to flush mds links. unfreeze when all gathered.
+ */
+void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m)
+{
+  // Auth-side handler: one peer has finished flushing its links.  When the
+  // last peer has answered, finish the unhash (unpin, unfreeze).
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  // log under this handler's own name (it used to print the name of
+  // handle_unhash_dir_ack, which made traces ambiguous)
+  dout(7) << "handle_unhash_dir_notify_ack " << *dir << endl;
+  assert(!dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(dir->is_frozen_dir());
+
+  // done?
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(hash_gather[dir].count(from));
+  hash_gather[dir].erase(from);
+  delete m;
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl;
+  } else {
+    unhash_dir_finish(dir);
+  }
+}
+
+
+/*
+ * all mds links are flushed. unfreeze dir!
+ */
+void Migrator::unhash_dir_finish(CDir *dir)
+{
+  // Last auth-side step: forget the gather set, release the path pin taken
+  // in unhash_dir(), leave the unhashing state and unfreeze.
+  dout(7) << "unhash_dir_finish " << *dir << endl;
+  hash_gather.erase(dir);
+
+  // unpin path
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  cache->path_unpin(path, 0);
+
+  // state
+  dir->state_clear(CDIR_STATE_UNHASHING);
+
+  // unfreeze
+  dir->unfreeze_dir();
+}
+
+
+
+// UNHASH on all
+
+/*
+ * hashed dir is complete.
+ * mark all migrating inodes dirty (to pin in cache)
+ * if frozen too, then go to next step (depending on auth)
+ */
+void Migrator::unhash_dir_complete(CDir *dir)
+{
+  // Runs on auth and non-auth once the hashed dir is complete: dirty the
+  // inodes we are auth for, then continue with the next step if the dir is
+  // frozen as well.
+  dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->is_complete());
+
+  // mark dirty to pin in cache
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       ++it) {
+    CInode *in = it->second->inode;
+    // The dir can hold null and remote dentries (handle_unhash_dir handles
+    // both); presumably those carry no inode pointer, so skip them instead
+    // of dereferencing NULL.
+    if (!in) continue;
+    if (in->is_auth()) {
+      in->mark_dirty();
+      mds->mdlog->submit_entry(new EInodeUpdate(in));
+    }
+  }
+
+  if (!dir->is_frozen_dir()) {
+    dout(7) << "dir complete but !frozen, waiting " << *dir << endl;
+  } else {
+    if (dir->is_auth())
+      unhash_dir_prep(dir);            // auth
+    else
+      unhash_dir_prep_finish(dir);     // nonauth
+  }
+}
+
+
+// UNHASH on non-auth
+
+class C_MDC_UnhashPrepFreeze : public Context {
+public:
+  Migrator *mig;   // migrator to call back into
+  CDir *dir;       // dir handed back to the migrator on completion
+
+  C_MDC_UnhashPrepFreeze(Migrator *migrator, CDir *target)
+    : mig(migrator), dir(target)
+  { }
+
+  // Completion hook: the result code is ignored, we just resume the unhash
+  // at unhash_dir_prep_frozen().
+  virtual void finish(int /*r*/) {
+    mig->unhash_dir_prep_frozen(dir);
+  }
+};
+
+
+/*
+ * peers need to freeze their dir and make them complete
+ */
+void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m)
+{
+  // Peer-side prep: start freezing our copy of the hashed dir and make sure
+  // it is complete.  The ack is sent later, from unhash_dir_prep_finish().
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(7) << "handle_unhash_dir_prep " << *dir << endl;
+  assert(dir->is_hashed());
+
+  // freeze
+  dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir));
+
+  // make complete
+  if (dir->is_complete()) {
+    unhash_dir_complete(dir);
+  } else {
+    dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
+    mds->mdstore->fetch_dir(dir,
+                            new C_MDC_UnhashComplete(this, dir));
+  }
+
+  delete m;
+}
+
+/*
+ * peer has hashed dir frozen.
+ * complete too?
+ */
+void Migrator::unhash_dir_prep_frozen(CDir *dir)
+{
+  // Freeze callback on a peer: ack right away if the dir is also complete,
+  // otherwise leave it to unhash_dir_complete().
+  dout(7) << "unhash_dir_prep_frozen " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->is_frozen_dir());
+  assert(!dir->is_auth());
+
+  if (dir->is_complete()) {
+    unhash_dir_prep_finish(dir);
+  } else {
+    dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl;
+  }
+}
+
+/*
+ * peer has hashed dir complete and frozen. ack.
+ */
+void Migrator::unhash_dir_prep_finish(CDir *dir)
+{
+  // Peer side: the dir is frozen and complete.  Reply to the auth exactly
+  // once with replicas of the open subdir inodes that hash to this node.
+  dout(7) << "unhash_dir_prep_finish " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(!dir->is_auth());
+  assert(dir->is_frozen());
+  assert(dir->is_complete());
+
+  // twiddle state; the UNHASHING flag doubles as our "ack already sent"
+  // marker because both the freeze and the complete path can get here
+  if (dir->is_unhashing())
+    return;                    // already replied.
+  dir->state_set(CDIR_STATE_UNHASHING);
+
+  // send subdirs back to auth
+  MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino());
+  int auth = dir->authority();
+
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       ++it) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+
+    // The dir can hold null and remote dentries (handle_unhash_dir handles
+    // both); presumably those carry no inode pointer, so skip them instead
+    // of dereferencing NULL.
+    if (!in) continue;
+    if (!in->is_dir()) continue;
+    if (!in->dir) continue;
+
+    // only report dentries that hash to this node
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode != mds->get_nodeid()) continue;
+
+    // msg?
+    ack->add_inode(it->first, in->replicate_to(auth));
+  }
+
+  // ack
+  mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR);
+}
+
+
+
+/*
+ * peer needs to send hashed dir content back to auth.
+ * unhash dir.
+ */
+void Migrator::handle_unhash_dir(MUnhashDir *m)
+{
+  // Peer side of the actual unhash: encode every dentry that hashes to this
+  // node into one buffer, turn the exported inodes into proxies, drop the
+  // hashed state and ship the buffer back to the auth in an MUnhashDirAck.
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(7) << "handle_unhash_dir " << *dir << endl;//" .. hash_seed is " << dir->inode->inode.hash_seed << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(!dir->is_auth());
+
+  // get message ready
+  bufferlist bl;
+  int nden = 0;
+
+  // suck up all waiters
+  // NOTE(review): 'fin' collects the dir and inode waiters but nothing in
+  // this function ever completes or frees it, so the collected waiters look
+  // lost -- confirm what the intended hand-off is before changing it.
+  C_Contexts *fin = new C_Contexts;
+  list<Context*> waiting;
+  dir->take_waiting(CDIR_WAIT_ANY, waiting);    // all dir waiters
+  fin->take(waiting);
+
+  // divy up contents
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       ++it) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode != mds->get_nodeid()) {
+      // not mine!
+      // twiddle dir_auth?
+      // 'in' is checked first: null and remote dentries are only filtered
+      // out further down, and presumably have no inode attached.
+      if (in && in->dir) {
+        if (in->dir->authority() != dir->authority())
+          in->dir->set_dir_auth( in->dir->authority() );
+        else
+          in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+      }
+      continue;
+    }
+
+    // -- dentry
+    dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
+    _encode(it->first, bl);
+
+    // null dentry?
+    if (dn->is_null()) {
+      bl.append("N", 1);  // null dentry
+      assert(dn->is_sync());
+      continue;
+    }
+
+    if (dn->is_remote()) {
+      // remote link
+      bl.append("L", 1);  // remote link
+
+      inodeno_t ino = dn->get_remote_ino();
+      bl.append((char*)&ino, sizeof(ino));
+      continue;
+    }
+
+    // primary link
+    // -- inode
+    bl.append("I", 1);    // inode dentry
+
+    encode_export_inode(in, bl, dentryhashcode);  // encode, and (update state for) export
+    nden++;
+
+    if (dn->is_dirty())
+      dn->mark_clean();
+
+    // proxy: keep the inode pinned until handle_unhash_dir_notify() has
+    // heard from every node
+    in->state_set(CINODE_STATE_PROXY);
+    in->get(CINODE_PIN_PROXY);
+    hash_proxy_inos[dir].push_back(in);
+
+    if (in->dir) {
+      if (in->dir->is_auth()) {
+        // mine.  make it into an import.
+        dout(7) << "making subdir into import " << *in->dir << endl;
+        in->dir->set_dir_auth( mds->get_nodeid() );
+        cache->imports.insert(in->dir);
+        in->dir->get(CDIR_PIN_IMPORT);
+        in->dir->state_set(CDIR_STATE_IMPORT);
+      }
+      else {
+        // not mine.
+        dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl;
+        assert(in->dir->is_export());
+        in->dir->put(CDIR_PIN_EXPORT);
+        in->dir->state_clear(CDIR_STATE_EXPORT);
+        cache->exports.erase(in->dir);
+        cache->nested_exports[dir].erase(in->dir);
+      }
+    }
+
+    // waiters
+    list<Context*> waiters;
+    in->take_waiting(CINODE_WAIT_ANY, waiters);
+    fin->take(waiters);
+  }
+
+  // we should have no nested exports; we're not auth for the dir!
+  assert(cache->nested_exports[dir].empty());
+  cache->nested_exports.erase(dir);
+
+  // dir state
+  //dir->state_clear(CDIR_STATE_UNHASHING);  // later
+  dir->state_clear(CDIR_STATE_HASHED);
+  dir->put(CDIR_PIN_HASHED);
+  cache->hashdirs.erase(dir);
+  dir->mark_clean();
+
+  // inode state
+  dir->inode->inode.hash_seed = 0;
+  if (dir->inode->is_auth()) {
+    dir->inode->mark_dirty();
+    mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+  }
+
+  // init gather set: we expect a notify from every node but ourselves
+  hash_gather[dir] = mds->get_mds_map()->get_mds();
+  hash_gather[dir].erase(mds->get_nodeid());
+
+  // send unhash message
+  mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden),
+                        dir->authority(), MDS_PORT_MIGRATOR);
+
+  // done with the request; every other handler in this file frees its
+  // message, and this one used to leak it.
+  delete m;
+}
+
+
+/*
+ * first notify comes from auth.
+ * send notifies to all other peers, with peer = self
+ * if we get notify from peer=other, remove from our gather list.
+ * when we've gotten notifies from everyone,
+ * unpin proxies,
+ * send notify_ack to auth.
+ * this ensures that all mds links are flushed of cache_expire type messages.
+ */
+void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m)
+{
+  // Peer-side handler for unhash notifies.  The first one comes from the
+  // auth and makes us notify all other peers; once a notify has arrived
+  // from every node we unpin the proxy inodes, unfreeze, and ack the auth.
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  // log under this handler's own name (it used to say
+  // "handle_unhash_dir_finish", which matches no function in this file)
+  dout(7) << "handle_unhash_dir_notify " << *dir << endl;
+  assert(!dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(!dir->is_auth());
+
+  int from = MSG_ADDR_NUM(m->get_source());
+  assert(hash_gather[dir].count(from) == 1);
+  hash_gather[dir].erase(from);
+  delete m;
+
+  // did we send our shout out?
+  if (from == dir->authority()) {
+    // send notify to everyone else in weird chatter storm
+    for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+      if (i == from) continue;
+      if (i == mds->get_nodeid()) continue;
+      mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR);
+    }
+  }
+
+  // are we done?
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
+    return;
+  }
+  hash_gather.erase(dir);
+
+  // all done!
+  dout(7) << "all mds links flushed, unpinning unhash proxies" << endl;
+
+  // unpin proxies
+  for (list<CInode*>::iterator it = hash_proxy_inos[dir].begin();
+       it != hash_proxy_inos[dir].end();
+       ++it) {
+    CInode *in = *it;
+    assert(in->state_test(CINODE_STATE_PROXY));
+    in->state_clear(CINODE_STATE_PROXY);
+    in->put(CINODE_PIN_PROXY);
+  }
+  // Forget the list: leaving it behind would make a later unhash of the same
+  // dir walk these (already unpinned) inodes again and trip the assert above.
+  hash_proxy_inos.erase(dir);
+
+  // state: this is the "later" that handle_unhash_dir() defers to.  Without
+  // it the flag stays set and unhash_dir_prep_finish() would treat a future
+  // unhash of this dir as "already replied".
+  dir->state_clear(CDIR_STATE_UNHASHING);
+
+  // unfreeze
+  dir->unfreeze_dir();
+
+  // ack
+  dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl;
+  mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR);
+
+}
+
+
+
+
+void Migrator::show_imports()
+{
+ // Thin forwarder: the balancer owns the actual show_imports() implementation.
+ mds->balancer->show_imports();
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_MIGRATOR_H
+#define __MDS_MIGRATOR_H
+
+#include "include/types.h"
+
+#include <map>
+#include <list>
+#include <set>
+using std::map;
+using std::list;
+using std::set;
+
+
+// Forward declarations for everything the Migrator interface only names by
+// pointer.  MDCache and Message are included so this header does not depend
+// on the includer having pulled them in first.
+class MDS;
+class MDCache;
+class CDir;
+class CInode;
+class CDentry;
+class Message;
+
+class MExportDirDiscover;
+class MExportDirDiscoverAck;
+class MExportDirPrep;
+class MExportDirPrepAck;
+class MExportDirWarning;
+class MExportDir;
+class MExportDirNotify;
+class MExportDirNotifyAck;
+class MExportDirFinish;
+
+class MHashDirDiscover;
+class MHashDirDiscoverAck;
+class MHashDirPrep;
+class MHashDirPrepAck;
+class MHashDir;
+class MHashDirAck;
+class MHashDirNotify;
+
+class MUnhashDirPrep;
+class MUnhashDirPrepAck;
+class MUnhashDir;
+class MUnhashDirAck;
+class MUnhashDirNotify;
+class MUnhashDirNotifyAck;
+
+/*
+ * Migrator: the MDS-side state machines for moving directory authority
+ * between nodes -- export/import of a dir, and hashing/unhashing of a dir
+ * across all nodes.  Incoming messages enter through dispatch().
+ */
+class Migrator {
+private:
+  MDS *mds;
+  MDCache *cache;
+
+  // export fun
+  map<CDir*, set<int> > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from
+  map<CDir*, list<inodeno_t> > export_proxy_inos;
+  map<CDir*, list<inodeno_t> > export_proxy_dirinos;
+
+  set<inodeno_t> stray_export_warnings; // notifies i haven't seen
+  map<inodeno_t, MExportDirNotify*> stray_export_notifies;
+
+  // hashing madness
+  // (multimap is spelled std:: because this header only has using-
+  //  declarations for map, list and set)
+  std::multimap<CDir*, int> unhash_waiting;  // nodes i am waiting for UnhashDirAck's from
+  std::multimap<inodeno_t, inodeno_t> import_hashed_replicate_waiting;  // nodes i am waiting to discover to complete my import of a hashed dir
+  // maps frozen_dir_ino's to waiting-for-discover ino's.
+  std::multimap<inodeno_t, inodeno_t> import_hashed_frozen_waiting;    // dirs i froze (for the above)
+
+public:
+  // -- cons --
+  Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {}
+
+  void dispatch(Message*);
+
+  // -- import/export --
+  // exporter
+ public:
+  void export_dir(CDir *dir,
+                  int mds);
+  void export_empty_import(CDir *dir);
+
+  void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth);
+  void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth);
+
+ protected:
+  map< CDir*, set<int> > export_gather;
+  void handle_export_dir_discover_ack(MExportDirDiscoverAck *m);
+  void export_dir_frozen(CDir *dir, int dest);
+  void handle_export_dir_prep_ack(MExportDirPrepAck *m);
+  void export_dir_go(CDir *dir,
+                     int dest);
+  int export_dir_walk(MExportDir *req,
+                      class C_Contexts *fin,
+                      CDir *basedir,
+                      CDir *dir,
+                      int newauth);
+  void export_dir_finish(CDir *dir);
+  void handle_export_dir_notify_ack(MExportDirNotifyAck *m);
+
+
+  friend class C_MDC_ExportFreeze;
+
+  // importer
+  void handle_export_dir_discover(MExportDirDiscover *m);
+  void handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r);
+  void handle_export_dir_prep(MExportDirPrep *m);
+  void handle_export_dir(MExportDir *m);
+  void import_dir_finish(CDir *dir);
+  void handle_export_dir_finish(MExportDirFinish *m);
+  int import_dir_block(bufferlist& bl,
+                       int& off,
+                       int oldauth,
+                       CDir *import_root,
+                       list<inodeno_t>& imported_subdirs);
+  void got_hashed_replica(CDir *import,
+                          inodeno_t dir_ino,
+                          inodeno_t replica_ino);
+
+
+  friend class C_MDC_ExportDirDiscover;
+
+  // bystander
+  void handle_export_dir_warning(MExportDirWarning *m);
+  void handle_export_dir_notify(MExportDirNotify *m);
+
+  void show_imports();
+
+  // -- hashed directories --
+
+  // HASH
+ public:
+  void hash_dir(CDir *dir);  // on auth
+ protected:
+  map< CDir*, set<int> > hash_gather;                      // per dir: nodes we still expect a reply from
+  map< CDir*, map< int, set<int> > > hash_notify_gather;
+  map< CDir*, list<CInode*> > hash_proxy_inos;             // per dir: inodes pinned as proxies
+
+  // hash on auth
+  void handle_hash_dir_discover_ack(MHashDirDiscoverAck *m);
+  void hash_dir_complete(CDir *dir);
+  void hash_dir_frozen(CDir *dir);
+  void handle_hash_dir_prep_ack(MHashDirPrepAck *m);
+  void hash_dir_go(CDir *dir);
+  void handle_hash_dir_ack(MHashDirAck *m);
+  void hash_dir_finish(CDir *dir);
+  friend class C_MDC_HashFreeze;
+  friend class C_MDC_HashComplete;
+
+  // auth and non-auth
+  void handle_hash_dir_notify(MHashDirNotify *m);
+
+  // hash on non-auth
+  void handle_hash_dir_discover(MHashDirDiscover *m);
+  void handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r);
+  void handle_hash_dir_prep(MHashDirPrep *m);
+  void handle_hash_dir(MHashDir *m);
+  friend class C_MDC_HashDirDiscover;
+
+  // UNHASH
+ public:
+  void unhash_dir(CDir *dir);    // on auth
+ protected:
+  map< CDir*, list<MUnhashDirAck*> > unhash_content;
+  void import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth);
+
+  // unhash on auth
+  void unhash_dir_frozen(CDir *dir);
+  void unhash_dir_prep(CDir *dir);
+  void handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m);
+  void unhash_dir_go(CDir *dir);
+  void handle_unhash_dir_ack(MUnhashDirAck *m);
+  void handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m);
+  void unhash_dir_finish(CDir *dir);
+  friend class C_MDC_UnhashFreeze;
+  friend class C_MDC_UnhashComplete;
+
+  // unhash on all
+  void unhash_dir_complete(CDir *dir);
+
+  // unhash on non-auth
+  void handle_unhash_dir_prep(MUnhashDirPrep *m);
+  void unhash_dir_prep_frozen(CDir *dir);
+  void unhash_dir_prep_finish(CDir *dir);
+  void handle_unhash_dir(MUnhashDir *m);
+  void handle_unhash_dir_notify(MUnhashDirNotify *m);
+  friend class C_MDC_UnhashPrepFreeze;
+
+
+};
+
+
+#endif
#define __MHASHDIR_H
#include "msg/Message.h"
-#include "include/bufferlist.h"
class MHashDir : public Message {
inodeno_t ino;
bool want_ack;
bool want_commit;
-
- size_t _data_len;
-
} MOSDOp_st;
class MOSDOp : public Message {
bufferlist& get_data() {
return data;
}
- size_t get_data_len() { return st._data_len; }
+ size_t get_data_len() { return data.length(); }
// keep a pcid (procedure call id) to match up request+reply
payload.copy(off, sizeof(st), (char*)&st);
off += sizeof(st);
::_decode(attrset, payload, off);
- if (st._data_len)
- payload.splice(off, st._data_len, &data);
+ ::_decode(data, payload, off);
}
virtual void encode_payload() {
- st._data_len = data.length();
- payload.push_back( new buffer((char*)&st, sizeof(st)) );
+ payload.append((char*)&st, sizeof(st));
::_encode(attrset, payload);
- payload.claim_append( data );
+ ::_encode(data, payload);
}
virtual char *get_type_name() { return "oop"; }
eversion_t pg_complete_thru;
epoch_t map_epoch;
-
- size_t _data_len;//, _oc_len;
} MOSDOpReply_st;
// data payload
void set_data(bufferlist &d) {
data.claim(d);
- //st._data_len = data.length();
}
bufferlist& get_data() {
return data;
payload.splice(0, sizeof(st));
int off = 0;
::_decode(attrset, payload, off);
- if (st._data_len) payload.splice(off, st._data_len, &data);
+ ::_decode(data, payload, off);
}
virtual void encode_payload() {
- st._data_len = data.length();
- payload.push_back( new buffer((char*)&st, sizeof(st)) );
+ payload.append((char*)&st, sizeof(st));
::_encode(attrset, payload);
- payload.claim_append( data );
+ ::_encode(data, payload);
}
virtual char *get_type_name() { return "oopr"; }
#define __MUNHASHDIRACK_H
#include "msg/Message.h"
-#include "include/bufferlist.h"
class MUnhashDirAck : public Message {
inodeno_t ino;
read_num++;
status_msg_count = 0;
old_views = views; // TODO deep copy
- for (int i=0; i<processes.size(); i++) {
+ for (unsigned i=0; i<processes.size(); i++) {
mon->messenger->send_message(new MMonElectionCollect(read_num),
MSG_ADDR_MON(processes[i]));
}
}
status_msg_count++;
- if (status_msg_count >= processes.size() - f) { // Responses from quorum collected
- for (int i=0; i<processes.size(); i++) {
+ if (status_msg_count >= (int)processes.size() - f) { // Responses from quorum collected
+ for (unsigned i=0; i<processes.size(); i++) {
int r = processes[i];
// Check if r has refreshed its epoch number
if (!( views[r].state > old_views[r].state )) {
#define MSG_MDS_SHUTDOWNFINISH 901
+#include <stdlib.h>
+#include <cassert>
+
+#include <iostream>
+#include <list>
+using std::list;
+
+#include <ext/hash_map>
+#include <ext/rope>
+
+using __gnu_cxx::crope;
+
+#include "include/buffer.h"
+
+#include "tcp.h"
+
-#include "include/bufferlist.h"
// use fixed offsets and static entity -> logical addr mapping!
//typedef struct msg_addr msg_addr_t;
-inline ostream& operator<<(ostream& out, const msg_addr_t& addr) {
+inline std::ostream& operator<<(std::ostream& out, const msg_addr_t& addr) {
//if (addr.is_namer()) return out << "namer";
return out << addr.type_str() << addr.num();
}
+
namespace __gnu_cxx {
template<> struct hash< msg_addr_t >
{
-#include <stdlib.h>
-#include <cassert>
-
-#include <iostream>
-using namespace std;
-
-#include <ext/rope>
-using namespace __gnu_cxx;
-
-
-#include "tcp.h"
-
class entity_inst_t {
public:
virtual void decode_payload() {
// use a crope for convenience, small messages, etc. FIXME someday.
crope ser;
- payload._rope(ser);
+ for (list<bufferptr>::const_iterator it = payload.buffers().begin();
+ it != payload.buffers().end();
+ it++)
+ ser.append((*it).c_str(), (*it).length());
int off = 0;
decode_payload(ser, off);
encode_payload(r);
// copy payload
- payload.push_back( new buffer(r.c_str(), r.length()) );
+ payload.push_back( buffer::copy(r.c_str(), r.length()) );
}
virtual void print(ostream& out) {
if (size == 0) continue;
- bufferptr bp = new buffer(size);
+ bufferptr bp(size);
if (!tcp_read( sd, bp.c_str(), size )) return 0;
#ifdef TCP_KEEP_CHUNKS
// send chunk-wise
int i = 0;
- for (list<bufferptr>::iterator it = blist.buffers().begin();
+ for (list<bufferptr>::const_iterator it = blist.buffers().begin();
it != blist.buffers().end();
it++) {
dout(10) << "pipe(" << peer_inst << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl;
}
dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl;
- for (list<bufferptr>::iterator it = blist.buffers().begin();
+ for (list<bufferptr>::const_iterator it = blist.buffers().begin();
it != blist.buffers().end();
it++) {
if ((*it).length() == 0) continue; // blank buffer.
- r = tcp_write( sd, (*it).c_str(), (*it).length() );
+ r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() );
if (r < 0) {
derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl;
return -1;
typedef struct sockaddr_in tcpaddr_t;
+using std::ostream;
+
inline ostream& operator<<(ostream& out, const tcpaddr_t &a)
{
unsigned char addr[4];
} else {
int l = g_conf.num_mon * 1000; // nice'n big.
- bufferptr bp = new buffer(l);
+ bufferptr bp(l);
bl.append(bp);
}
__uint64_t Ager::age_fill(float pc, utime_t until) {
int max = 1024*1024;
- char *buf = new char[max];
+ bufferptr bp(max);
+ bp.zero();
bufferlist bl;
- bl.push_back(new buffer(buf, max));
+ bl.push_back(bp);
__uint64_t wrote = 0;
while (1) {
if (g_clock.now() > until) break;
}
oid.bno++;
}
- delete[] buf;
return wrote*4; // KB
}
int r = ::stat("ebofs.freelist", &st);
assert(r == 0);
- bufferptr bp = new buffer(st.st_size);
+ bufferptr bp(st.st_size);
bufferlist bl;
bl.push_back(bp);
int fd = ::open("ebofs.freelist", O_RDONLY);
int setattr(const char *name, const void *value, size_t size) {
string n = name;
- bufferptr bp(new buffer((char*)value,size));
+ bufferptr bp = buffer::copy((char*)value, size);
attrs[n] = bp;
return 0;
}
#undef dout
#define dout(l) if (l<=g_conf.debug) cout << "osd" << whoami << ".fakestore "
-#include "include/bufferlist.h"
+#include "include/buffer.h"
#include <map>
#include <ext/hash_map>
}
if (actual == offset) {
- bufferptr bptr = new buffer(len); // prealloc space for entire read
+ bufferptr bptr(len); // prealloc space for entire read
got = ::read(fd, bptr.c_str(), len);
bptr.set_length(got); // properly size the buffer
if (got > 0) bl.push_back( bptr ); // put it in the target bufferlist
assert(actual == offset);
// write buffers
- for (list<bufferptr>::iterator it = bl.buffers().begin();
+ for (list<bufferptr>::const_iterator it = bl.buffers().begin();
it != bl.buffers().end();
it++) {
- int r = ::write(fd, (*it).c_str(), (*it).length());
+ int r = ::write(fd, (char*)(*it).c_str(), (*it).length());
if (r > 0)
did += r;
else {
}
void ObjectStore::age_fill(float pc, utime_t until) {
- static char buf[1024*1024];
+ bufferptr bp(1024*1024);
+ bp.zero();
bufferlist bl;
- bl.push_back(new buffer(buf, 1024*1024));
+ bl.push_back(bp);
while (1) {
if (g_clock.now() > until) break;
#include "include/types.h"
#include "include/Context.h"
-#include "include/bufferlist.h"
+#include "include/buffer.h"
#include "include/Distribution.h"
bufferlist bl;
bl.append( (char*)&logentry, sizeof(logentry) );
if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME.
- bufferptr bp = new buffer(4096 - sizeof(logentry));
+ bufferptr bp(4096 - sizeof(logentry));
bl.push_back(bp);
}
t.write( object_t(1,info.pgid), ondisklog.top, bl.length(), bl );
#include "include/types.h"
-#include "include/bufferlist.h"
+#include "include/buffer.h"
#include "OSDMap.h"
#include "ObjectStore.h"
// zero end of bx
dout(21) << " adding some zeros to the end " << ox_off + bit->second-ox_len << endl;
- bufferptr z = new buffer(ox_off + bit->second - ox_len);
- memset(z.c_str(), 0, z.length());
+ bufferptr z(ox_off + bit->second - ox_len);
+ z.zero();
by_off[bit->first]->append( z );
} else {
// we got none of this bx. zero whole thing.
assert(ox_off >= ox_len);
dout(21) << " adding all zeros for this bit " << bit->second << endl;
- bufferptr z = new buffer(bit->second);
- assert(z.length() == bit->second);
- memset(z.c_str(), 0, z.length());
+ bufferptr z(bit->second);
+ z.zero();
by_off[bit->first]->append( z );
}
ox_off += bit->second;
#define __OBJECTER_H
#include "include/types.h"
-#include "include/bufferlist.h"
+#include "include/buffer.h"
#include "osd/OSDMap.h"
#include "messages/MOSDOp.h"
--- /dev/null
+
+#include <list>
+#include <iostream>
+using namespace std;
+
+
+#include "include/newbuffer.h"
+//#include "include/bufferlist.h"
+
+#include "common/Thread.h"
+
+
+ // Worker thread for the buffer stress test below.  Each worker keeps its
+ // own bufferlist copy-constructed from the caller's list and repeatedly
+ // builds substrings and concatenations from it.
+ // NOTE(review): presumably the copies share the underlying buffers, so
+ // this exercises reference counting across threads -- confirm against
+ // the buffer implementation.
+ class Th : public Thread {
+ public:
+   bufferlist bl;
+   Th(bufferlist& o) : bl(o) { }
+
+   // Thread body: 10000 rounds of substr_of/append/clear on local lists.
+   // Always returns 0; the result is not used by main().
+   void *entry() {
+     //cout << "start" << endl;
+     // The offset/length arithmetic below needs at least two bytes;
+     // with fewer, the modulus would be zero (or wrap around).
+     const unsigned total = bl.length();
+     if (total < 2)
+       return 0;
+
+     // thrash it a bit.
+     for (int n=0; n<10000; n++) {
+       bufferlist bl2;
+       // off is in [0, total-2], len in [1, total-off-1], so the
+       // requested range always ends before the end of bl.
+       unsigned off = rand() % (total - 1);
+       unsigned len = 1 + rand() % (total - off - 1);
+       bl2.substr_of(bl, off, len);
+       bufferlist bl3;
+       bl3.append(bl);
+       bl3.append(bl2);
+       //cout << bl3 << endl;
+       bl2.clear();
+       bl3.clear();
+     }
+     //cout << "end" << endl;
+     // entry() is declared void*; falling off the end without a value
+     // is undefined behaviour, so return explicitly.
+     return 0;
+   }
+ };
+
+// Smoke/stress test for bufferptr and bufferlist: prints the results of a
+// few basic operations, then runs 40 Th workers over copies of one list
+// while the original list is cleared.
+int main()
+{
+
+  // 7 bytes each: the six characters plus the terminating NUL, so the
+  // c_str() output below is a valid C string.
+  bufferptr p1 = buffer::copy("123456",7);
+  //bufferptr p1 = new buffer("123456",7);
+  bufferptr p2 = p1;
+
+  cout << "p1 is '" << p1.c_str() << "'" << " " << p1 << endl;
+  cout << "p2 is '" << p2.c_str() << "'" << " " << p2 << endl;
+
+  bufferptr p3 = buffer::copy("abcdef",7);
+  //bufferptr p3 = new buffer("abcdef",7);
+
+  cout << "p3 is " << p3.c_str() << " " << p3 << endl;
+
+  bufferlist bl;
+  bl.push_back(p2);
+  bl.push_back(p1);
+  bl.push_back(p3);
+
+  cout << "bl is " << bl << endl;
+
+  // NOTE(review): presumably moves 4 bytes starting at offset 10 out of
+  // bl and into took -- confirm against bufferlist::splice.
+  bufferlist took;
+  bl.splice(10,4,&took);
+
+  cout << "took out " << took << ", leftover is " << bl << endl;
+  //cout << "len is " << bl.length() << endl;
+
+  bufferlist bl2;
+  bl2.substr_of(bl, 3, 5);
+  cout << "bl2 is " << bl2 << endl;
+
+
+  cout << "bl before " << bl << endl;
+
+  // Start the workers.  The pointer must not reuse the loop counter's
+  // name: redeclaring a for-init name in the loop body is ill-formed.
+  list<Th*> ls;
+  for (int i=0; i<40; i++) {
+    Th *th = new Th(bl);
+    cout << "create" << endl;
+    th->create();
+    ls.push_back(th);
+  }
+
+  // Drop our copy while the workers are still using theirs.
+  bl.clear();
+
+  // Wait for every worker and free it.
+  while (!ls.empty()) {
+    cout << "join" << endl;
+    ls.front()->join();
+    delete ls.front();
+    ls.pop_front();
+  }
+
+  cout << "bl after " << bl << endl;
+
+  return 0;
+}