#ifndef _FS_CEPH_CEPH_FS_H
#define _FS_CEPH_CEPH_FS_H
-
-#define CEPH_MON_PORT 6789 /* default monitor port */
-
-/*
- * client-side processes will try to bind to ports in this
- * range, simply for the benefit of tools like nmap or wireshark
- * that would like to identify the protocol.
- */
-#define CEPH_PORT_FIRST 6789
-#define CEPH_PORT_START 6800 /* non-monitors start here */
-#define CEPH_PORT_LAST 6900
+#include "msgr.h"
+#include "rados.h"
/*
* Max file size is a policy choice; in reality we are limited
*/
#define CEPH_FILE_MAX_SIZE (1ULL << 40) /* 1 TB */
-/*
- * tcp connection banner. include a protocol version. and adjust
- * whenever the wire protocol changes. try to keep this string length
- * constant.
- */
-#define CEPH_BANNER "ceph 013\n"
-#define CEPH_BANNER_MAX_LEN 30
-
/*
* subprotocol versions. when specific messages types or high-level
* protocols change, bump the affected components. we keep rev
* over the wire or that are stored on disk.
*/
-/*
- * some basics
- */
-typedef __le64 ceph_version_t;
-typedef __le64 ceph_tid_t; /* transaction id */
-typedef __le32 ceph_epoch_t;
-
-/*
- * fs id
- */
-typedef struct { unsigned char fsid[16]; } ceph_fsid_t;
-
-static inline int ceph_fsid_compare(const ceph_fsid_t *a,
- const ceph_fsid_t *b)
-{
- return memcmp(a, b, sizeof(*a));
-}
-
-/*
- * ino, object, etc.
- */
#define CEPH_INO_ROOT 1
-typedef __le64 ceph_snapid_t;
-#define CEPH_MAXSNAP ((__u64)(-3))
-#define CEPH_SNAPDIR ((__u64)(-1))
-#define CEPH_NOSNAP ((__u64)(-2))
-
-struct ceph_object {
- union {
- __u8 raw[20]; /* fits a sha1 hash */
- struct {
- __le64 ino; /* inode "file" identifier */
- __le32 bno; /* "block" (object) in that "file" */
- __le64 snap; /* snapshot id. usually NOSNAP. */
- } __attribute__ ((packed));
- };
-} __attribute__ ((packed));
-
-struct ceph_timespec {
- __le32 tv_sec;
- __le32 tv_nsec;
-} __attribute__ ((packed));
-
-
-/*
- * Rollover-safe type and comparator for 32-bit sequence numbers.
- * Comparator returns -1, 0, or 1.
- */
-typedef __u32 ceph_seq_t;
-
-static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
-{
- return ((__s32)a - (__s32)b);
-}
-
/*
* "Frags" are a way to describe a subset of a 32-bit number space,
return 0;
}
-/*
- * object layout - how objects are mapped into PGs
- */
-#define CEPH_OBJECT_LAYOUT_HASH 1
-#define CEPH_OBJECT_LAYOUT_LINEAR 2
-#define CEPH_OBJECT_LAYOUT_HASHINO 3
-
-/*
- * pg layout -- how PGs are mapped onto (sets of) OSDs
- */
-#define CEPH_PG_LAYOUT_CRUSH 0
-#define CEPH_PG_LAYOUT_HASH 1
-#define CEPH_PG_LAYOUT_LINEAR 2
-#define CEPH_PG_LAYOUT_HYBRID 3
-
/*
* ceph_file_layout - describe data layout for a file/inode
*/
#define ceph_file_layout_period(l) (le32_to_cpu((l).fl_object_size) * \
le32_to_cpu((l).fl_stripe_count))
-/*
- * placement group.
- * we encode this into one __le64.
- */
-#define CEPH_PG_TYPE_REP 1
-#define CEPH_PG_TYPE_RAID4 2
-union ceph_pg {
- __u64 pg64;
- struct {
- __s16 preferred; /* preferred primary osd */
- __u16 ps; /* placement seed */
- __u32 pool; /* implies crush ruleset */
- } pg;
-} __attribute__ ((packed));
-
-#define ceph_pg_is_rep(pg) ((pg).pg.type == CEPH_PG_TYPE_REP)
-#define ceph_pg_is_raid4(pg) ((pg).pg.type == CEPH_PG_TYPE_RAID4)
-
-/*
- * pg_pool is a set of pgs storing a pool of objects
- *
- * pg_num -- base number of pseudorandomly placed pgs
- *
- * pgp_num -- effective number when calculating pg placement. this
- * is used for pg_num increases. new pgs result in data being "split"
- * into new pgs. for this to proceed smoothly, new pgs are intiially
- * colocated with their parents; that is, pgp_num doesn't increase
- * until the new pgs have successfully split. only _then_ are the new
- * pgs placed independently.
- *
- * lpg_num -- localized pg count (per device). replicas are randomly
- * selected.
- *
- * lpgp_num -- as above.
- */
-struct ceph_pg_pool {
- __u8 type;
- __u8 size;
- __u8 crush_ruleset;
- __le32 pg_num, pgp_num;
- __le32 lpg_num, lpgp_num;
- __le32 last_change; /* most recent epoch changed */
-} __attribute__ ((packed));
-
-/*
- * stable_mod func is used to control number of placement groups.
- * similar to straight-up modulo, but produces a stable mapping as b
- * increases over time. b is the number of bins, and bmask is the
- * containing power of 2 minus 1.
- *
- * b <= bmask and bmask=(2**n)-1
- * e.g., b=12 -> bmask=15, b=123 -> bmask=127
- */
-static inline int ceph_stable_mod(int x, int b, int bmask)
-{
- if ((x & bmask) < b)
- return x & bmask;
- else
- return x & (bmask >> 1);
-}
-
-/*
- * object layout - how a given object should be stored.
- */
-struct ceph_object_layout {
- __le64 ol_pgid; /* raw pg, with _full_ ps precision. */
- __le32 ol_stripe_unit;
-} __attribute__ ((packed));
-
-/*
- * compound epoch+version, used by storage layer to serialize mutations
- */
-struct ceph_eversion {
- ceph_epoch_t epoch;
- __le64 version;
-} __attribute__ ((packed));
-
-/*
- * osd map bits
- */
-
-/* status bits */
-#define CEPH_OSD_EXISTS 1
-#define CEPH_OSD_UP 2
-
-/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
-#define CEPH_OSD_IN 0x10000
-#define CEPH_OSD_OUT 0
/*
* message layer
*/
-/*
- * entity_name
- */
-struct ceph_entity_name {
- __le32 type;
- __le32 num;
-} __attribute__ ((packed));
-
-#define CEPH_ENTITY_TYPE_MON 1
-#define CEPH_ENTITY_TYPE_MDS 2
-#define CEPH_ENTITY_TYPE_OSD 3
-#define CEPH_ENTITY_TYPE_CLIENT 4
-#define CEPH_ENTITY_TYPE_ADMIN 5
-
-/* used by message exchange protocol */
-#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
-#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
-#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
- incoming connection */
-#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
- with higher cseq */
-#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
- with higher gseq */
-#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
-#define CEPH_MSGR_TAG_MSG 10 /* message */
-#define CEPH_MSGR_TAG_ACK 11 /* message ack */
-
-
-/*
- * entity_addr -- network address
- */
-struct ceph_entity_addr {
- __le32 erank; /* entity's rank in process */
- __le32 nonce; /* unique id for process (e.g. pid) */
- struct sockaddr_in ipaddr;
-} __attribute__ ((packed));
-
-static inline bool ceph_entity_addr_is_local(const struct ceph_entity_addr *a,
- const struct ceph_entity_addr *b)
-{
- return a->nonce == b->nonce &&
- a->ipaddr.sin_addr.s_addr == b->ipaddr.sin_addr.s_addr;
-}
-
-static inline bool ceph_entity_addr_equal(const struct ceph_entity_addr *a,
- const struct ceph_entity_addr *b)
-{
- return memcmp(a, b, sizeof(*a)) == 0;
-}
-
-struct ceph_entity_inst {
- struct ceph_entity_name name;
- struct ceph_entity_addr addr;
-} __attribute__ ((packed));
-
-
-/*
- * connection negotiation
- */
-struct ceph_msg_connect {
- __le32 host_type; /* CEPH_ENTITY_TYPE_* */
- __le32 global_seq;
- __le32 connect_seq;
- __u8 flags;
-} __attribute__ ((packed));
-
-struct ceph_msg_connect_reply {
- __u8 tag;
- __le32 global_seq;
- __le32 connect_seq;
- __u8 flags;
-} __attribute__ ((packed));
-
-#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
-
-
-/*
- * message header
- */
-struct ceph_msg_header {
- __le64 seq; /* message seq# for this session */
- __le16 type; /* message type */
- __le16 priority; /* priority. higher value == higher priority */
-
- __le32 front_len; /* bytes in main payload */
- __le32 data_len; /* bytes of data payload */
- __le16 data_off; /* sender: include full offset;
- receiver: mask against ~PAGE_MASK */
-
- __u8 mon_protocol, monc_protocol; /* protocol versions, */
- __u8 osd_protocol, osdc_protocol; /* internal and public */
- __u8 mds_protocol, mdsc_protocol;
-
- struct ceph_entity_inst src, orig_src, dst;
- __le32 crc; /* header crc32c */
-} __attribute__ ((packed));
-
-#define CEPH_MSG_PRIO_LOW 64
-#define CEPH_MSG_PRIO_DEFAULT 127
-#define CEPH_MSG_PRIO_HIGH 196
-#define CEPH_MSG_PRIO_HIGHEST 255
-
-/*
- * follows data payload
- */
-struct ceph_msg_footer {
- __le32 flags;
- __le32 front_crc;
- __le32 data_crc;
-} __attribute__ ((packed));
-
-#define CEPH_MSG_FOOTER_ABORTED (1<<0) /* drop this message */
-#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
-
-
/*
* message types
*/
#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
struct ceph_mds_request_head {
- ceph_tid_t tid, oldest_client_tid;
- ceph_epoch_t mdsmap_epoch; /* on client */
+ __le64 tid, oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
__le32 flags;
__u8 num_retry, num_fwd;
__le16 num_releases;
/* client reply */
struct ceph_mds_reply_head {
- ceph_tid_t tid;
+ __le64 tid;
__le32 op;
__le32 result;
__le32 mdsmap_epoch;
} __attribute__ ((packed));
/* followed by my snap list, then prior parent snap list */
-/*
- * osd map flag bits
- */
-#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
-#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
-#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
-#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
-#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
-
-/*
- * osd ops
- */
-#define CEPH_OSD_OP_MODE 0xf000
-#define CEPH_OSD_OP_MODE_RD 0x1000
-#define CEPH_OSD_OP_MODE_WR 0x2000
-#define CEPH_OSD_OP_MODE_SUB 0x4000
-
-#define CEPH_OSD_OP_TYPE 0x0f00
-#define CEPH_OSD_OP_TYPE_LOCK 0x0100
-#define CEPH_OSD_OP_TYPE_DATA 0x0200
-#define CEPH_OSD_OP_TYPE_ATTR 0x0300
-
-enum {
- /** data **/
- /* read */
- CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
- CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
-
- /* fancy read */
- CEPH_OSD_OP_GREP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3,
- CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
-
- /* write */
- CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
- CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
- CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
- CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
- CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
-
- /* fancy write */
- CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
- CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
- CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
- CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
-
- /** attrs **/
- /* read */
- CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
- CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
-
- /* write */
- CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
- CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
- CEPH_OSD_OP_RESETXATTRS= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 3,
- CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
-
- /** subop **/
- CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
- CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
- CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
- CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
- CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
-
- /** lock **/
- CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
- CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
- CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
- CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
- CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
- CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
-};
-
-static inline int ceph_osd_op_type_lock(int op)
-{
- return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
-}
-static inline int ceph_osd_op_type_data(int op)
-{
- return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
-}
-static inline int ceph_osd_op_type_attr(int op)
-{
- return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
-}
-
-static inline int ceph_osd_op_mode_subop(int op)
-{
- return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
-}
-static inline int ceph_osd_op_mode_read(int op)
-{
- return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
-}
-static inline int ceph_osd_op_mode_modify(int op)
-{
- return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
-}
-
-static inline const char *ceph_osd_op_name(int op)
-{
- switch (op) {
- case CEPH_OSD_OP_READ: return "read";
- case CEPH_OSD_OP_STAT: return "stat";
-
- case CEPH_OSD_OP_GREP: return "grep";
- case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-
- case CEPH_OSD_OP_WRITE: return "write";
- case CEPH_OSD_OP_DELETE: return "delete";
- case CEPH_OSD_OP_TRUNCATE: return "truncate";
- case CEPH_OSD_OP_ZERO: return "zero";
- case CEPH_OSD_OP_WRITEFULL: return "writefull";
-
- case CEPH_OSD_OP_APPEND: return "append";
- case CEPH_OSD_OP_STARTSYNC: return "startsync";
- case CEPH_OSD_OP_SETTRUNC: return "settrunc";
- case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-
- case CEPH_OSD_OP_GETXATTR: return "getxattr";
- case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
- case CEPH_OSD_OP_SETXATTR: return "setxattr";
- case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
- case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
- case CEPH_OSD_OP_RMXATTR: return "rmxattr";
-
- case CEPH_OSD_OP_PULL: return "pull";
- case CEPH_OSD_OP_PUSH: return "push";
- case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
- case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
- case CEPH_OSD_OP_SCRUB: return "scrub";
-
- case CEPH_OSD_OP_WRLOCK: return "wrlock";
- case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
- case CEPH_OSD_OP_RDLOCK: return "rdlock";
- case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
- case CEPH_OSD_OP_UPLOCK: return "uplock";
- case CEPH_OSD_OP_DNLOCK: return "dnlock";
-
- default: return "???";
- }
-}
-
-
-/*
- * osd op flags
- */
-enum {
- CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
- CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
- CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
- CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
- CEPH_OSD_FLAG_INCLOCK_FAIL = 16, /* fail on inclock collision */
- CEPH_OSD_FLAG_MODIFY = 32, /* op is/was a mutation */
- CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
- CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
- CEPH_OSD_FLAG_BALANCE_READS = 256,
-};
-
-#define EOLDSNAPC ERESTART /* ORDERSNAP flag set and writer has old snap context*/
-#define EBLACKLISTED ESHUTDOWN /* blacklisted */
-
-struct ceph_osd_op {
- __le16 op;
- union {
- struct {
- __le64 offset, length;
- };
- struct {
- __le32 name_len;
- __le32 value_len;
- };
- struct {
- __le64 truncate_size;
- __le32 truncate_seq;
- };
- };
-} __attribute__ ((packed));
-
-struct ceph_osd_request_head {
- ceph_tid_t tid;
- __le32 client_inc;
- struct ceph_object oid;
- struct ceph_object_layout layout;
- ceph_epoch_t osdmap_epoch;
-
- __le32 flags;
- __le32 inc_lock;
-
- struct ceph_timespec mtime;
- struct ceph_eversion reassert_version;
-
- /* writer's snap context */
- __le64 snap_seq;
- __le32 num_snaps;
-
- /* read or mutation */
- __le16 num_ops;
- __u16 object_type;
- struct ceph_osd_op ops[]; /* followed by snaps */
-} __attribute__ ((packed));
-
-struct ceph_osd_reply_head {
- ceph_tid_t tid;
- __le32 client_inc;
- __le32 flags;
- struct ceph_object oid;
- struct ceph_object_layout layout;
- ceph_epoch_t osdmap_epoch;
- struct ceph_eversion reassert_version;
-
- __le32 result;
-
- __le32 num_ops;
- struct ceph_osd_op ops[0];
-} __attribute__ ((packed));
#endif
--- /dev/null
+#ifndef __MSGR_H
+#define __MSGR_H
+
+
+#define CEPH_MON_PORT 6789 /* default monitor port */
+
+/*
+ * client-side processes will try to bind to ports in this
+ * range, simply for the benefit of tools like nmap or wireshark
+ * that would like to identify the protocol.
+ */
+#define CEPH_PORT_FIRST 6789
+#define CEPH_PORT_START 6800 /* non-monitors start here */
+#define CEPH_PORT_LAST 6900
+
+/*
+ * tcp connection banner. include a protocol version. and adjust
+ * whenever the wire protocol changes. try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph 013\n"
+#define CEPH_BANNER_MAX_LEN 30
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * The comparator returns the wrapped 32-bit distance from b to a:
+ * negative when a is before b, zero when they are equal, positive
+ * when a is after b.  It is NOT clamped to -1/0/1.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+	/*
+	 * Subtract in unsigned arithmetic (well-defined wraparound) and
+	 * only then reinterpret as signed; subtracting two signed values
+	 * could overflow, which is undefined behavior in ISO C.
+	 */
+	return (__s32)(a - b);
+}
+
+
+/*
+ * entity_name
+ *
+ * Names an entity by a (type, num) pair.  Both fields are 32-bit
+ * little-endian and the struct is packed.
+ * NOTE(review): type presumably holds one of the CEPH_ENTITY_TYPE_*
+ * values defined below -- confirm against callers.
+ */
+struct ceph_entity_name {
+ __le32 type;
+ __le32 num;
+} __attribute__ ((packed));
+
+#define CEPH_ENTITY_TYPE_MON 1
+#define CEPH_ENTITY_TYPE_MDS 2
+#define CEPH_ENTITY_TYPE_OSD 3
+#define CEPH_ENTITY_TYPE_CLIENT 4
+#define CEPH_ENTITY_TYPE_ADMIN 5
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
+ incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
+ with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
+ with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
+#define CEPH_MSGR_TAG_MSG 10 /* message */
+#define CEPH_MSGR_TAG_ACK 11 /* message ack */
+
+
+/*
+ * entity_addr -- network address
+ *
+ * Packed struct: two 32-bit little-endian ids followed by an IPv4
+ * socket address (struct sockaddr_in).
+ */
+struct ceph_entity_addr {
+ __le32 erank; /* entity's rank in process */
+ __le32 nonce; /* unique id for process (e.g. pid) */
+ struct sockaddr_in ipaddr;
+} __attribute__ ((packed));
+
+/*
+ * True when a and b carry the same nonce and the same IPv4 address.
+ * erank and the port are not part of the comparison.
+ */
+static inline bool ceph_entity_addr_is_local(const struct ceph_entity_addr *a,
+					     const struct ceph_entity_addr *b)
+{
+	const bool same_nonce = a->nonce == b->nonce;
+	const bool same_ip =
+		a->ipaddr.sin_addr.s_addr == b->ipaddr.sin_addr.s_addr;
+
+	return same_nonce && same_ip;
+}
+
+/* Byte-wise equality of two entity addresses (memcmp over the whole struct). */
+static inline bool ceph_entity_addr_equal(const struct ceph_entity_addr *a,
+					  const struct ceph_entity_addr *b)
+{
+	const int diff = memcmp(a, b, sizeof(struct ceph_entity_addr));
+
+	return !diff;
+}
+
+struct ceph_entity_inst { /* an entity's name paired with its address; packed */
+ struct ceph_entity_name name; /* (type, num) identity */
+ struct ceph_entity_addr addr; /* erank, nonce and IPv4 sockaddr */
+} __attribute__ ((packed));
+
+
+/*
+ * connection negotiation
+ *
+ * ceph_msg_connect and ceph_msg_connect_reply are packed and share the
+ * global_seq, connect_seq and flags fields; the reply additionally
+ * starts with a one-byte tag.
+ * NOTE(review): tag presumably carries a CEPH_MSGR_TAG_* value and
+ * flags presumably carries CEPH_MSG_CONNECT_* bits -- confirm against
+ * the messenger implementation.
+ */
+struct ceph_msg_connect {
+ __le32 host_type; /* CEPH_ENTITY_TYPE_* */
+ __le32 global_seq;
+ __le32 connect_seq;
+ __u8 flags;
+} __attribute__ ((packed));
+
+struct ceph_msg_connect_reply {
+ __u8 tag;
+ __le32 global_seq;
+ __le32 connect_seq;
+ __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ *
+ * Packed, fixed-layout header.  Multi-byte scalar fields are
+ * little-endian; the six protocol version fields are single bytes.
+ * Three entity instances (src, orig_src, dst) precede the trailing
+ * header crc.
+ */
+struct ceph_msg_header {
+ __le64 seq; /* message seq# for this session */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+
+ __le32 front_len; /* bytes in main payload */
+ __le32 data_len; /* bytes of data payload */
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ __u8 mon_protocol, monc_protocol; /* protocol versions, */
+ __u8 osd_protocol, osdc_protocol; /* internal and public */
+ __u8 mds_protocol, mdsc_protocol;
+
+ struct ceph_entity_inst src, orig_src, dst;
+ __le32 crc; /* header crc32c */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW 64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH 196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ *
+ * Packed trailer of three 32-bit little-endian words.
+ * NOTE(review): flags presumably holds the CEPH_MSG_FOOTER_* bits
+ * defined below; front_crc/data_crc presumably cover the front and
+ * data payloads respectively -- confirm against the messenger code.
+ */
+struct ceph_msg_footer {
+ __le32 flags;
+ __le32 front_crc;
+ __le32 data_crc;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_ABORTED (1<<0) /* drop this message */
+#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
+
+
+#endif
--- /dev/null
+#ifndef __RADOS_H
+#define __RADOS_H
+
+#include "msgr.h"
+
+
+/*
+ * fs id
+ */
+typedef struct { unsigned char fsid[16]; } ceph_fsid_t;
+
+/* Order two fsids by raw byte comparison; the result is memcmp()'s. */
+static inline int ceph_fsid_compare(const ceph_fsid_t *a,
+				    const ceph_fsid_t *b)
+{
+	const int order = memcmp(a, b, sizeof(ceph_fsid_t));
+
+	return order;
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_MAXSNAP ((__u64)(-3))
+#define CEPH_SNAPDIR ((__u64)(-1))
+#define CEPH_NOSNAP ((__u64)(-2))
+
+struct ceph_object { /* 20-byte packed object id, viewable two ways */
+ union {
+ __u8 raw[20]; /* fits a sha1 hash */
+ struct { /* 8 + 4 + 8 = 20 bytes, overlaying raw[] */
+ __le64 ino; /* inode "file" identifier */
+ __le32 bno; /* "block" (object) in that "file" */
+ __le64 snap; /* snapshot id. usually NOSNAP. */
+ } __attribute__ ((packed));
+ };
+} __attribute__ ((packed));
+
+struct ceph_timespec { /* packed pair of 32-bit little-endian fields */
+ __le32 tv_sec; /* presumably seconds, as in struct timespec -- confirm */
+ __le32 tv_nsec; /* presumably nanoseconds -- confirm */
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH 1
+#define CEPH_OBJECT_LAYOUT_LINEAR 2
+#define CEPH_OBJECT_LAYOUT_HASHINO 3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH 0
+#define CEPH_PG_LAYOUT_HASH 1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ *
+ * NOTE(review): ceph_pg_is_rep() and ceph_pg_is_raid4() below read
+ * (pg).pg.type, but the pg struct declared here has no 'type' member
+ * (only preferred, ps and pool), so any use of those macros cannot
+ * compile as written -- confirm where the pg type is meant to live.
+ */
+#define CEPH_PG_TYPE_REP 1
+#define CEPH_PG_TYPE_RAID4 2
+union ceph_pg {
+ __u64 pg64; /* whole id as one 64-bit word */
+ struct { /* 2 + 2 + 4 bytes, overlaying pg64 */
+ __s16 preferred; /* preferred primary osd */
+ __u16 ps; /* placement seed */
+ __u32 pool; /* implies crush ruleset */
+ } pg;
+} __attribute__ ((packed));
+
+#define ceph_pg_is_rep(pg) ((pg).pg.type == CEPH_PG_TYPE_REP)
+#define ceph_pg_is_raid4(pg) ((pg).pg.type == CEPH_PG_TYPE_RAID4)
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ * pg_num -- base number of pseudorandomly placed pgs
+ *
+ * pgp_num -- effective number when calculating pg placement. this
+ * is used for pg_num increases. new pgs result in data being "split"
+ * into new pgs. for this to proceed smoothly, new pgs are initially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split. only _then_ are the new
+ * pgs placed independently.
+ *
+ * lpg_num -- localized pg count (per device). replicas are randomly
+ * selected.
+ *
+ * lpgp_num -- as above.
+ */
+struct ceph_pg_pool {
+ __u8 type;
+ __u8 size;
+ __u8 crush_ruleset;
+ __le32 pg_num, pgp_num;
+ __le32 lpg_num, lpgp_num;
+ __le32 last_change; /* most recent epoch changed */
+} __attribute__ ((packed));
+
+/*
+ * ceph_stable_mod - modulo-like bin selection that stays stable as the
+ * number of bins grows.
+ *
+ * x is the value to place, b the number of bins, and bmask the
+ * containing power of 2 minus 1, i.e. b <= bmask and bmask = (2**n)-1
+ * (e.g., b=12 -> bmask=15, b=123 -> bmask=127).
+ *
+ * If x masked with bmask names an existing bin it is returned as is;
+ * otherwise x is masked with the next smaller mask (bmask >> 1).
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+	const int masked = x & bmask;
+
+	return (masked < b) ? masked : (x & (bmask >> 1));
+}
+
+/*
+ * object layout - how a given object should be stored.
+ *
+ * Packed: one 64-bit and one 32-bit little-endian field (12 bytes).
+ */
+struct ceph_object_layout {
+ __le64 ol_pgid; /* raw pg, with _full_ ps precision. */
+ __le32 ol_stripe_unit;
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ *
+ * Packed: 32-bit little-endian epoch followed by a 64-bit
+ * little-endian version (12 bytes).
+ */
+struct ceph_eversion {
+ __le32 epoch;
+ __le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS 1
+#define CEPH_OSD_UP 2
+
+/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN 0x10000
+#define CEPH_OSD_OUT 0
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
+
+/*
+ * osd ops
+ */
+#define CEPH_OSD_OP_MODE 0xf000
+#define CEPH_OSD_OP_MODE_RD 0x1000
+#define CEPH_OSD_OP_MODE_WR 0x2000
+#define CEPH_OSD_OP_MODE_SUB 0x4000
+
+#define CEPH_OSD_OP_TYPE 0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK 0x0100
+#define CEPH_OSD_OP_TYPE_DATA 0x0200
+#define CEPH_OSD_OP_TYPE_ATTR 0x0300
+
+enum { /* each opcode = mode bits | type bits | small per-group index */
+ /** data **/
+ /* read */
+ CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
+ CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
+
+ /* fancy read */
+ CEPH_OSD_OP_GREP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3,
+ CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
+
+ /* write */
+ CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
+ CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
+ CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
+ CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
+ CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
+
+ /* fancy write */
+ CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
+ CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
+ CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
+ CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
+
+ /** attrs **/
+ /* read */
+ CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
+ CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+
+ /* write */
+ CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
+ CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
+ CEPH_OSD_OP_RESETXATTRS= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 3,
+ CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
+
+ /** subop **/ /* subops carry no CEPH_OSD_OP_TYPE_* bits */
+ CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
+ CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
+ CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
+ CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
+ CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
+
+ /** lock **/ /* NOTE(review): every lock op, RDLOCK/RDUNLOCK included,
+ is tagged MODE_WR, so ceph_osd_op_mode_modify() is true
+ for them -- confirm this is intended */
+ CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
+ CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
+ CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
+ CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
+ CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
+ CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
+};
+
+/* Nonzero when the type bits of op select the LOCK group. */
+static inline int ceph_osd_op_type_lock(int op)
+{
+	const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+	return type_bits == CEPH_OSD_OP_TYPE_LOCK;
+}
+/* Nonzero when the type bits of op select the DATA group. */
+static inline int ceph_osd_op_type_data(int op)
+{
+	const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+	return type_bits == CEPH_OSD_OP_TYPE_DATA;
+}
+/* Nonzero when the type bits of op select the ATTR group. */
+static inline int ceph_osd_op_type_attr(int op)
+{
+	const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+	return type_bits == CEPH_OSD_OP_TYPE_ATTR;
+}
+
+/* Nonzero when the mode bits of op are exactly MODE_SUB. */
+static inline int ceph_osd_op_mode_subop(int op)
+{
+	const int mode_bits = op & CEPH_OSD_OP_MODE;
+
+	return mode_bits == CEPH_OSD_OP_MODE_SUB;
+}
+/* Nonzero when the mode bits of op are exactly MODE_RD. */
+static inline int ceph_osd_op_mode_read(int op)
+{
+	const int mode_bits = op & CEPH_OSD_OP_MODE;
+
+	return mode_bits == CEPH_OSD_OP_MODE_RD;
+}
+/* Nonzero when the mode bits of op are exactly MODE_WR. */
+static inline int ceph_osd_op_mode_modify(int op)
+{
+	const int mode_bits = op & CEPH_OSD_OP_MODE;
+
+	return mode_bits == CEPH_OSD_OP_MODE_WR;
+}
+
+/*
+ * Map an OSD opcode to a short printable name.  Opcodes not listed
+ * here yield "???".  The returned string is a literal; callers must
+ * not modify or free it.
+ */
+static inline const char *ceph_osd_op_name(int op)
+{
+	const char *name = "???";
+
+	switch (op) {
+	/* data: read */
+	case CEPH_OSD_OP_READ:		name = "read"; break;
+	case CEPH_OSD_OP_STAT:		name = "stat"; break;
+	case CEPH_OSD_OP_GREP:		name = "grep"; break;
+	case CEPH_OSD_OP_MASKTRUNC:	name = "masktrunc"; break;
+
+	/* data: write */
+	case CEPH_OSD_OP_WRITE:		name = "write"; break;
+	case CEPH_OSD_OP_WRITEFULL:	name = "writefull"; break;
+	case CEPH_OSD_OP_TRUNCATE:	name = "truncate"; break;
+	case CEPH_OSD_OP_ZERO:		name = "zero"; break;
+	case CEPH_OSD_OP_DELETE:	name = "delete"; break;
+	case CEPH_OSD_OP_APPEND:	name = "append"; break;
+	case CEPH_OSD_OP_STARTSYNC:	name = "startsync"; break;
+	case CEPH_OSD_OP_SETTRUNC:	name = "settrunc"; break;
+	case CEPH_OSD_OP_TRIMTRUNC:	name = "trimtrunc"; break;
+
+	/* attrs */
+	case CEPH_OSD_OP_GETXATTR:	name = "getxattr"; break;
+	case CEPH_OSD_OP_GETXATTRS:	name = "getxattrs"; break;
+	case CEPH_OSD_OP_SETXATTR:	name = "setxattr"; break;
+	case CEPH_OSD_OP_SETXATTRS:	name = "setxattrs"; break;
+	case CEPH_OSD_OP_RESETXATTRS:	name = "resetxattrs"; break;
+	case CEPH_OSD_OP_RMXATTR:	name = "rmxattr"; break;
+
+	/* subops */
+	case CEPH_OSD_OP_PULL:		name = "pull"; break;
+	case CEPH_OSD_OP_PUSH:		name = "push"; break;
+	case CEPH_OSD_OP_BALANCEREADS:	name = "balance-reads"; break;
+	case CEPH_OSD_OP_UNBALANCEREADS: name = "unbalance-reads"; break;
+	case CEPH_OSD_OP_SCRUB:		name = "scrub"; break;
+
+	/* locks */
+	case CEPH_OSD_OP_WRLOCK:	name = "wrlock"; break;
+	case CEPH_OSD_OP_WRUNLOCK:	name = "wrunlock"; break;
+	case CEPH_OSD_OP_RDLOCK:	name = "rdlock"; break;
+	case CEPH_OSD_OP_RDUNLOCK:	name = "rdunlock"; break;
+	case CEPH_OSD_OP_UPLOCK:	name = "uplock"; break;
+	case CEPH_OSD_OP_DNLOCK:	name = "dnlock"; break;
+
+	default:
+		break;
+	}
+
+	return name;
+}
+
+
+/*
+ * osd op flags
+ *
+ * Each value is a distinct single bit, so flags can be OR-ed together.
+ */
+enum {
+ CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
+ CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
+ CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
+ CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
+ CEPH_OSD_FLAG_INCLOCK_FAIL = 16, /* fail on inclock collision */
+ CEPH_OSD_FLAG_MODIFY = 32, /* op is/was a mutation */
+ CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAPC if snapc is out of order */
+ CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
+ CEPH_OSD_FLAG_BALANCE_READS = 256,
+};
+
+#define EOLDSNAPC ERESTART /* ORDERSNAP flag set and writer has old snap context*/
+#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+
+struct ceph_osd_op { /* packed: 16-bit opcode plus a union of per-op arguments */
+ __le16 op; /* presumably a CEPH_OSD_OP_* value -- confirm */
+ union { /* three overlapping argument layouts; largest is 16 bytes */
+ struct {
+ __le64 offset, length;
+ };
+ struct {
+ __le32 name_len;
+ __le32 value_len;
+ };
+ struct {
+ __le64 truncate_size;
+ __le32 truncate_seq;
+ };
+ };
+} __attribute__ ((packed));
+
+struct ceph_osd_request_head { /* packed fixed part of an OSD request */
+ __le64 tid;
+ __le32 client_inc;
+ struct ceph_object oid;
+ struct ceph_object_layout layout;
+ __le32 osdmap_epoch;
+
+ __le32 flags; /* presumably CEPH_OSD_FLAG_* bits -- confirm */
+ __le32 inc_lock;
+
+ struct ceph_timespec mtime;
+ struct ceph_eversion reassert_version;
+
+ /* writer's snap context */
+ __le64 snap_seq;
+ __le32 num_snaps;
+
+ /* read or mutation */
+ __le16 num_ops;
+ __u16 object_type;
+ struct ceph_osd_op ops[]; /* followed by snaps */
+} __attribute__ ((packed));
+
+/*
+ * Fixed part of an OSD reply.  Packed; multi-byte scalars are
+ * little-endian.  ops is a trailing variable-length array, declared
+ * as a C99 flexible array member to match ceph_osd_request_head
+ * (the struct's size and field offsets are unchanged).
+ * NOTE(review): num_ops presumably gives the element count of ops --
+ * confirm against the encoder/decoder.
+ */
+struct ceph_osd_reply_head {
+	__le64 tid;
+	__le32 client_inc;
+	__le32 flags;
+	struct ceph_object oid;
+	struct ceph_object_layout layout;
+	__le32 osdmap_epoch;
+	struct ceph_eversion reassert_version;
+
+	__le32 result;
+
+	__le32 num_ops;
+	struct ceph_osd_op ops[];
+} __attribute__ ((packed));
+
+
+#endif