== todo
+papers to read
+- gribble et al 2000, scalable distributed hash table
+- sagiv, B-link trees
+- johnson and colbrook's DE and DB-trees (maybe fewer locks?)
+
+
+
- messenger lookup() and failure() upcalls
- how to get usage feedback to monitor?
- no more rank! make it a uniquish nonce?
+osd
+- pull out "object" service
+- btree service
+- interject snapshot abstraction. in between?
+ - how to generalize pg log entries?
+ - or put snapshotting in apply_operation?
+
+osdc
+- distributed btree thing: Blinker!
+
+mds
+- rewrite mdstore to use osd btree service
+
+journaler
+- should we pad with zeros to avoid splitting individual entries?
+ - make it a g_conf flag?
+ - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes)
+- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes
+
+
monitor
?- monitor user lib that handles resending, redirection of mon requests.
mdsmon
osd/rados
+- pg_num instead of pg_bits
- flag missing log entries on crash recovery --> WRNOOP? or WRLOST?
- consider implications of nvram writeahead logs
- fix heartbeat wrt new replication
- mark residual pgs obsolete ???
- rdlocks
- optimize remove wrt recovery pushes
-- pg_bit changes
+- pg_bit/pg_num changes
- report crashed pgs?
messenger
general
- timer needs cancel sets, schedulers need to cancel outstanding events on shutdown
- - well, just figure out general timer cancellation strategy that avoids races
+- well, just figure out general timer cancellation strategy that avoids races
+ - use updated Timer as a model?
remaining hard problems
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __BLINKER_H
+#define __BLINKER_H
+
+/**
+ * Blinker - declarations for key/value table operations (lookup, insert,
+ * remove, clear, list, prefetch) addressed by an inode_t and a bufferptr key.
+ *
+ * Only the interface is declared here; no method bodies are visible in this
+ * header, so the notes below describe the declared contract, not behavior.
+ *
+ * NOTE(review): the operations are presumably issued through `objecter` and
+ * the Context arguments presumably act as completion callbacks -- confirm
+ * against the implementation once it exists.
+ */
+class Blinker {
+
+public:
+
+  /**
+   * Base descriptor for a single table operation, tagged with an op code.
+   *
+   * Members are public: the derived descriptors below must be able to call
+   * Op(int) and name the op-code constants, which is impossible if they are
+   * left at the class-default private access.
+   */
+  class Op {
+  public:
+    int op;  // one of the op codes below
+
+    static const int LOOKUP = 1;
+    static const int INSERT = 2;
+    static const int REMOVE = 3;
+    static const int CLEAR = 4;
+
+    Op(int o) : op(o) {}
+
+    // Derived descriptors own buffers; a virtual destructor keeps destruction
+    // through an Op* well-defined.
+    virtual ~Op() {}
+  };
+
+  /** Descriptor tagged Op::LOOKUP; carries the key. */
+  class OpLookup : public Op {
+  public:
+    bufferptr key;
+    OpLookup(bufferptr& k) : Op(Op::LOOKUP), key(k) {}
+  };
+
+  /** Descriptor tagged Op::INSERT; carries the key and the value. */
+  class OpInsert : public Op {
+  public:  // was missing: without it the constructor and fields are private
+    bufferptr key;
+    bufferlist val;
+    OpInsert(bufferptr& k, bufferlist& v) : Op(Op::INSERT), key(k), val(v) {}
+  };
+
+  /** Descriptor tagged Op::REMOVE; carries the key. */
+  class OpRemove : public Op {
+  public:
+    bufferptr key;
+    OpRemove(bufferptr& k) : Op(Op::REMOVE), key(k) {}
+  };
+
+  /** Descriptor tagged Op::CLEAR; carries no payload. */
+  class OpClear : public Op {
+  public:
+    OpClear() : Op(Op::CLEAR) {}
+  };
+
+
+
+private:
+  // NOTE(review): ownership of this pointer is not established in this header.
+  Objecter *objecter;
+
+  // in-flight operations.
+
+
+  // cache information about tree structure.
+
+
+
+public:
+  // public interface
+  //
+  // NOTE(review): `inode` presumably identifies the table being operated on,
+  // and onfinish/onack/onsafe are presumably completion callbacks (ack vs.
+  // safe/committed) -- confirm once the definitions are written.
+
+  // simple accessors
+  // pval: output location for the value associated with key.
+  void lookup(inode_t& inode, bufferptr& key, bufferlist *pval, Context *onfinish);
+
+  // simple modifiers
+  void insert(inode_t& inode, bufferptr& key, bufferlist& val, Context *onack, Context *onsafe);
+  void remove(inode_t& inode, bufferptr& key, Context *onack, Context *onsafe);
+  void clear(inode_t& inode, Context *onack, Context *onsafe);
+
+  // these are dangerous: the table may be large.
+  // pkeys / pvals: output lists for the keys (and values) of the table.
+  void listkeys(inode_t& inode, list<bufferptr>* pkeys, Context *onfinish);
+  void listvals(inode_t& inode, list<bufferptr>* pkeys, list<bufferlist>* pvals, Context *onfinish);
+
+  // fetch *at least* key, but also anything else that is convenient.
+  // include lexical bounds for which this is a complete result.
+  // (if *start and *end are empty, it's the entire table)
+  void prefetch(inode_t& inode, bufferptr& key,
+                list<bufferptr>* pkeys, list<bufferlist>* pvals,
+                bufferptr *start, bufferptr *end,
+                Context *onfinish);
+
+
+};
+
+#endif