From e34f6d1a6a36b948ea19ab46bf37e647f8452849 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 1 May 2009 06:43:53 -0700 Subject: [PATCH] ceph: break up ceph_fs.h header into msgr.h, rados.h --- src/Makefile.am | 2 + src/include/ceph_fs.h | 514 +----------------- src/include/msgr.h | 152 ++++++ src/include/rados.h | 366 +++++++++++++ src/kernel/import_patch_set_into_linux_git.sh | 4 +- src/kernel/msgr.h | 1 + src/kernel/rados.h | 1 + 7 files changed, 530 insertions(+), 510 deletions(-) create mode 100644 src/include/msgr.h create mode 100644 src/include/rados.h create mode 120000 src/kernel/msgr.h create mode 120000 src/kernel/rados.h diff --git a/src/Makefile.am b/src/Makefile.am index aeaa3b51ca2be..283986e99be0b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -366,11 +366,13 @@ noinst_HEADERS = \ include/interval_set.h\ include/inttypes.h\ include/lru.h\ + include/msgr.h\ include/nstring.h\ include/object.h\ include/page.h\ include/pobject.h\ include/rangeset.h\ + include/rados.h\ include/statlite.h\ include/triple.h\ include/tstring.h\ diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index d161e0eeb5353..1c890b8d1e366 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -8,17 +8,8 @@ #ifndef _FS_CEPH_CEPH_FS_H #define _FS_CEPH_CEPH_FS_H - -#define CEPH_MON_PORT 6789 /* default monitor port */ - -/* - * client-side processes will try to bind to ports in this - * range, simply for the benefit of tools like nmap or wireshark - * that would like to identify the protocol. - */ -#define CEPH_PORT_FIRST 6789 -#define CEPH_PORT_START 6800 /* non-monitors start here */ -#define CEPH_PORT_LAST 6900 +#include "msgr.h" +#include "rados.h" /* * Max file size is a policy choice; in reality we are limited @@ -26,14 +17,6 @@ */ #define CEPH_FILE_MAX_SIZE (1ULL << 40) /* 1 TB */ -/* - * tcp connection banner. include a protocol version. and adjust - * whenever the wire protocol changes. try to keep this string length - * constant. 
- */ -#define CEPH_BANNER "ceph 013\n" -#define CEPH_BANNER_MAX_LEN 30 - /* * subprotocol versions. when specific messages types or high-level * protocols change, bump the affected components. we keep rev @@ -54,63 +37,9 @@ * over the wire or that are stored on disk. */ -/* - * some basics - */ -typedef __le64 ceph_version_t; -typedef __le64 ceph_tid_t; /* transaction id */ -typedef __le32 ceph_epoch_t; - -/* - * fs id - */ -typedef struct { unsigned char fsid[16]; } ceph_fsid_t; - -static inline int ceph_fsid_compare(const ceph_fsid_t *a, - const ceph_fsid_t *b) -{ - return memcmp(a, b, sizeof(*a)); -} - -/* - * ino, object, etc. - */ #define CEPH_INO_ROOT 1 -typedef __le64 ceph_snapid_t; -#define CEPH_MAXSNAP ((__u64)(-3)) -#define CEPH_SNAPDIR ((__u64)(-1)) -#define CEPH_NOSNAP ((__u64)(-2)) - -struct ceph_object { - union { - __u8 raw[20]; /* fits a sha1 hash */ - struct { - __le64 ino; /* inode "file" identifier */ - __le32 bno; /* "block" (object) in that "file" */ - __le64 snap; /* snapshot id. usually NOSNAP. */ - } __attribute__ ((packed)); - }; -} __attribute__ ((packed)); - -struct ceph_timespec { - __le32 tv_sec; - __le32 tv_nsec; -} __attribute__ ((packed)); - - -/* - * Rollover-safe type and comparator for 32-bit sequence numbers. - * Comparator returns -1, 0, or 1. 
- */ -typedef __u32 ceph_seq_t; - -static inline __s32 ceph_seq_cmp(__u32 a, __u32 b) -{ - return ((__s32)a - (__s32)b); -} - /* * "Frags" are a way to describe a subset of a 32-bit number space, @@ -232,21 +161,6 @@ static inline int frag_compare(__u32 a, __u32 b) return 0; } -/* - * object layout - how objects are mapped into PGs - */ -#define CEPH_OBJECT_LAYOUT_HASH 1 -#define CEPH_OBJECT_LAYOUT_LINEAR 2 -#define CEPH_OBJECT_LAYOUT_HASHINO 3 - -/* - * pg layout -- how PGs are mapped onto (sets of) OSDs - */ -#define CEPH_PG_LAYOUT_CRUSH 0 -#define CEPH_PG_LAYOUT_HASH 1 -#define CEPH_PG_LAYOUT_LINEAR 2 -#define CEPH_PG_LAYOUT_HYBRID 3 - /* * ceph_file_layout - describe data layout for a file/inode */ @@ -284,94 +198,6 @@ struct ceph_file_layout { #define ceph_file_layout_period(l) (le32_to_cpu((l).fl_object_size) * \ le32_to_cpu((l).fl_stripe_count)) -/* - * placement group. - * we encode this into one __le64. - */ -#define CEPH_PG_TYPE_REP 1 -#define CEPH_PG_TYPE_RAID4 2 -union ceph_pg { - __u64 pg64; - struct { - __s16 preferred; /* preferred primary osd */ - __u16 ps; /* placement seed */ - __u32 pool; /* implies crush ruleset */ - } pg; -} __attribute__ ((packed)); - -#define ceph_pg_is_rep(pg) ((pg).pg.type == CEPH_PG_TYPE_REP) -#define ceph_pg_is_raid4(pg) ((pg).pg.type == CEPH_PG_TYPE_RAID4) - -/* - * pg_pool is a set of pgs storing a pool of objects - * - * pg_num -- base number of pseudorandomly placed pgs - * - * pgp_num -- effective number when calculating pg placement. this - * is used for pg_num increases. new pgs result in data being "split" - * into new pgs. for this to proceed smoothly, new pgs are intiially - * colocated with their parents; that is, pgp_num doesn't increase - * until the new pgs have successfully split. only _then_ are the new - * pgs placed independently. - * - * lpg_num -- localized pg count (per device). replicas are randomly - * selected. - * - * lpgp_num -- as above. 
- */ -struct ceph_pg_pool { - __u8 type; - __u8 size; - __u8 crush_ruleset; - __le32 pg_num, pgp_num; - __le32 lpg_num, lpgp_num; - __le32 last_change; /* most recent epoch changed */ -} __attribute__ ((packed)); - -/* - * stable_mod func is used to control number of placement groups. - * similar to straight-up modulo, but produces a stable mapping as b - * increases over time. b is the number of bins, and bmask is the - * containing power of 2 minus 1. - * - * b <= bmask and bmask=(2**n)-1 - * e.g., b=12 -> bmask=15, b=123 -> bmask=127 - */ -static inline int ceph_stable_mod(int x, int b, int bmask) -{ - if ((x & bmask) < b) - return x & bmask; - else - return x & (bmask >> 1); -} - -/* - * object layout - how a given object should be stored. - */ -struct ceph_object_layout { - __le64 ol_pgid; /* raw pg, with _full_ ps precision. */ - __le32 ol_stripe_unit; -} __attribute__ ((packed)); - -/* - * compound epoch+version, used by storage layer to serialize mutations - */ -struct ceph_eversion { - ceph_epoch_t epoch; - __le64 version; -} __attribute__ ((packed)); - -/* - * osd map bits - */ - -/* status bits */ -#define CEPH_OSD_EXISTS 1 -#define CEPH_OSD_UP 2 - -/* osd weights. 
fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ -#define CEPH_OSD_IN 0x10000 -#define CEPH_OSD_OUT 0 /* @@ -417,121 +243,6 @@ ceph_full_name_hash(const char *name, unsigned int len) * message layer */ -/* - * entity_name - */ -struct ceph_entity_name { - __le32 type; - __le32 num; -} __attribute__ ((packed)); - -#define CEPH_ENTITY_TYPE_MON 1 -#define CEPH_ENTITY_TYPE_MDS 2 -#define CEPH_ENTITY_TYPE_OSD 3 -#define CEPH_ENTITY_TYPE_CLIENT 4 -#define CEPH_ENTITY_TYPE_ADMIN 5 - -/* used by message exchange protocol */ -#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */ -#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */ -#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing - incoming connection */ -#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again - with higher cseq */ -#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again - with higher gseq */ -#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */ -#define CEPH_MSGR_TAG_MSG 10 /* message */ -#define CEPH_MSGR_TAG_ACK 11 /* message ack */ - - -/* - * entity_addr -- network address - */ -struct ceph_entity_addr { - __le32 erank; /* entity's rank in process */ - __le32 nonce; /* unique id for process (e.g. 
pid) */ - struct sockaddr_in ipaddr; -} __attribute__ ((packed)); - -static inline bool ceph_entity_addr_is_local(const struct ceph_entity_addr *a, - const struct ceph_entity_addr *b) -{ - return a->nonce == b->nonce && - a->ipaddr.sin_addr.s_addr == b->ipaddr.sin_addr.s_addr; -} - -static inline bool ceph_entity_addr_equal(const struct ceph_entity_addr *a, - const struct ceph_entity_addr *b) -{ - return memcmp(a, b, sizeof(*a)) == 0; -} - -struct ceph_entity_inst { - struct ceph_entity_name name; - struct ceph_entity_addr addr; -} __attribute__ ((packed)); - - -/* - * connection negotiation - */ -struct ceph_msg_connect { - __le32 host_type; /* CEPH_ENTITY_TYPE_* */ - __le32 global_seq; - __le32 connect_seq; - __u8 flags; -} __attribute__ ((packed)); - -struct ceph_msg_connect_reply { - __u8 tag; - __le32 global_seq; - __le32 connect_seq; - __u8 flags; -} __attribute__ ((packed)); - -#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */ - - -/* - * message header - */ -struct ceph_msg_header { - __le64 seq; /* message seq# for this session */ - __le16 type; /* message type */ - __le16 priority; /* priority. 
higher value == higher priority */ - - __le32 front_len; /* bytes in main payload */ - __le32 data_len; /* bytes of data payload */ - __le16 data_off; /* sender: include full offset; - receiver: mask against ~PAGE_MASK */ - - __u8 mon_protocol, monc_protocol; /* protocol versions, */ - __u8 osd_protocol, osdc_protocol; /* internal and public */ - __u8 mds_protocol, mdsc_protocol; - - struct ceph_entity_inst src, orig_src, dst; - __le32 crc; /* header crc32c */ -} __attribute__ ((packed)); - -#define CEPH_MSG_PRIO_LOW 64 -#define CEPH_MSG_PRIO_DEFAULT 127 -#define CEPH_MSG_PRIO_HIGH 196 -#define CEPH_MSG_PRIO_HIGHEST 255 - -/* - * follows data payload - */ -struct ceph_msg_footer { - __le32 flags; - __le32 front_crc; - __le32 data_crc; -} __attribute__ ((packed)); - -#define CEPH_MSG_FOOTER_ABORTED (1<<0) /* drop this message */ -#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ - - /* * message types */ @@ -814,8 +525,8 @@ union ceph_mds_request_args { #define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ struct ceph_mds_request_head { - ceph_tid_t tid, oldest_client_tid; - ceph_epoch_t mdsmap_epoch; /* on client */ + __le64 tid, oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ __le32 flags; __u8 num_retry, num_fwd; __le16 num_releases; @@ -835,7 +546,7 @@ struct ceph_mds_request_release { /* client reply */ struct ceph_mds_reply_head { - ceph_tid_t tid; + __le64 tid; __le32 op; __le32 result; __le32 mdsmap_epoch; @@ -1196,220 +907,5 @@ struct ceph_mds_snap_realm { } __attribute__ ((packed)); /* followed by my snap list, then prior parent snap list */ -/* - * osd map flag bits - */ -#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ -#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ -#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ -#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ -#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ - -/* - * osd ops - */ -#define CEPH_OSD_OP_MODE 0xf000 
-#define CEPH_OSD_OP_MODE_RD 0x1000 -#define CEPH_OSD_OP_MODE_WR 0x2000 -#define CEPH_OSD_OP_MODE_SUB 0x4000 - -#define CEPH_OSD_OP_TYPE 0x0f00 -#define CEPH_OSD_OP_TYPE_LOCK 0x0100 -#define CEPH_OSD_OP_TYPE_DATA 0x0200 -#define CEPH_OSD_OP_TYPE_ATTR 0x0300 - -enum { - /** data **/ - /* read */ - CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, - CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, - - /* fancy read */ - CEPH_OSD_OP_GREP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3, - CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, - - /* write */ - CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, - CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, - CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3, - CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4, - CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5, - - /* fancy write */ - CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6, - CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7, - CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8, - CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9, - - /** attrs **/ - /* read */ - CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, - CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, - - /* write */ - CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, - CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2, - CEPH_OSD_OP_RESETXATTRS= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 3, - CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, - - /** subop **/ - CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, - CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, - CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, - CEPH_OSD_OP_UNBALANCEREADS = 
CEPH_OSD_OP_MODE_SUB | 4, - CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, - - /** lock **/ - CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, - CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2, - CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3, - CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4, - CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5, - CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, -}; - -static inline int ceph_osd_op_type_lock(int op) -{ - return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK; -} -static inline int ceph_osd_op_type_data(int op) -{ - return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; -} -static inline int ceph_osd_op_type_attr(int op) -{ - return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; -} - -static inline int ceph_osd_op_mode_subop(int op) -{ - return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; -} -static inline int ceph_osd_op_mode_read(int op) -{ - return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; -} -static inline int ceph_osd_op_mode_modify(int op) -{ - return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; -} - -static inline const char *ceph_osd_op_name(int op) -{ - switch (op) { - case CEPH_OSD_OP_READ: return "read"; - case CEPH_OSD_OP_STAT: return "stat"; - - case CEPH_OSD_OP_GREP: return "grep"; - case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; - - case CEPH_OSD_OP_WRITE: return "write"; - case CEPH_OSD_OP_DELETE: return "delete"; - case CEPH_OSD_OP_TRUNCATE: return "truncate"; - case CEPH_OSD_OP_ZERO: return "zero"; - case CEPH_OSD_OP_WRITEFULL: return "writefull"; - - case CEPH_OSD_OP_APPEND: return "append"; - case CEPH_OSD_OP_STARTSYNC: return "startsync"; - case CEPH_OSD_OP_SETTRUNC: return "settrunc"; - case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc"; - - case CEPH_OSD_OP_GETXATTR: return "getxattr"; - case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; - case 
CEPH_OSD_OP_SETXATTR: return "setxattr"; - case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; - case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; - case CEPH_OSD_OP_RMXATTR: return "rmxattr"; - - case CEPH_OSD_OP_PULL: return "pull"; - case CEPH_OSD_OP_PUSH: return "push"; - case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; - case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; - case CEPH_OSD_OP_SCRUB: return "scrub"; - - case CEPH_OSD_OP_WRLOCK: return "wrlock"; - case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; - case CEPH_OSD_OP_RDLOCK: return "rdlock"; - case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; - case CEPH_OSD_OP_UPLOCK: return "uplock"; - case CEPH_OSD_OP_DNLOCK: return "dnlock"; - - default: return "???"; - } -} - - -/* - * osd op flags - */ -enum { - CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ - CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ - CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ - CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ - CEPH_OSD_FLAG_INCLOCK_FAIL = 16, /* fail on inclock collision */ - CEPH_OSD_FLAG_MODIFY = 32, /* op is/was a mutation */ - CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ - CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ - CEPH_OSD_FLAG_BALANCE_READS = 256, -}; - -#define EOLDSNAPC ERESTART /* ORDERSNAP flag set and writer has old snap context*/ -#define EBLACKLISTED ESHUTDOWN /* blacklisted */ - -struct ceph_osd_op { - __le16 op; - union { - struct { - __le64 offset, length; - }; - struct { - __le32 name_len; - __le32 value_len; - }; - struct { - __le64 truncate_size; - __le32 truncate_seq; - }; - }; -} __attribute__ ((packed)); - -struct ceph_osd_request_head { - ceph_tid_t tid; - __le32 client_inc; - struct ceph_object oid; - struct ceph_object_layout layout; - ceph_epoch_t osdmap_epoch; - - __le32 flags; - __le32 inc_lock; - - struct ceph_timespec mtime; - struct ceph_eversion reassert_version; - - /* writer's snap context */ - __le64 
snap_seq; - __le32 num_snaps; - - /* read or mutation */ - __le16 num_ops; - __u16 object_type; - struct ceph_osd_op ops[]; /* followed by snaps */ -} __attribute__ ((packed)); - -struct ceph_osd_reply_head { - ceph_tid_t tid; - __le32 client_inc; - __le32 flags; - struct ceph_object oid; - struct ceph_object_layout layout; - ceph_epoch_t osdmap_epoch; - struct ceph_eversion reassert_version; - - __le32 result; - - __le32 num_ops; - struct ceph_osd_op ops[0]; -} __attribute__ ((packed)); #endif diff --git a/src/include/msgr.h b/src/include/msgr.h new file mode 100644 index 0000000000000..a0a7d11142bbe --- /dev/null +++ b/src/include/msgr.h @@ -0,0 +1,152 @@ +#ifndef __MSGR_H +#define __MSGR_H + + +#define CEPH_MON_PORT 6789 /* default monitor port */ + +/* + * client-side processes will try to bind to ports in this + * range, simply for the benefit of tools like nmap or wireshark + * that would like to identify the protocol. + */ +#define CEPH_PORT_FIRST 6789 +#define CEPH_PORT_START 6800 /* non-monitors start here */ +#define CEPH_PORT_LAST 6900 + +/* + * tcp connection banner. include a protocol version. and adjust + * whenever the wire protocol changes. try to keep this string length + * constant. + */ +#define CEPH_BANNER "ceph 013\n" +#define CEPH_BANNER_MAX_LEN 30 + + +/* + * Rollover-safe type and comparator for 32-bit sequence numbers. + * Comparator returns a negative value, 0, or a positive value. 
+ */ +typedef __u32 ceph_seq_t; + +static inline __s32 ceph_seq_cmp(__u32 a, __u32 b) +{ + return ((__s32)a - (__s32)b); +} + + +/* + * entity_name + */ +struct ceph_entity_name { + __le32 type; + __le32 num; +} __attribute__ ((packed)); + +#define CEPH_ENTITY_TYPE_MON 1 +#define CEPH_ENTITY_TYPE_MDS 2 +#define CEPH_ENTITY_TYPE_OSD 3 +#define CEPH_ENTITY_TYPE_CLIENT 4 +#define CEPH_ENTITY_TYPE_ADMIN 5 + +/* used by message exchange protocol */ +#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */ +#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */ +#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing + incoming connection */ +#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again + with higher cseq */ +#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again + with higher gseq */ +#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */ +#define CEPH_MSGR_TAG_MSG 10 /* message */ +#define CEPH_MSGR_TAG_ACK 11 /* message ack */ + + +/* + * entity_addr -- network address + */ +struct ceph_entity_addr { + __le32 erank; /* entity's rank in process */ + __le32 nonce; /* unique id for process (e.g. 
pid) */ + struct sockaddr_in ipaddr; +} __attribute__ ((packed)); + +static inline bool ceph_entity_addr_is_local(const struct ceph_entity_addr *a, + const struct ceph_entity_addr *b) +{ + return a->nonce == b->nonce && + a->ipaddr.sin_addr.s_addr == b->ipaddr.sin_addr.s_addr; +} + +static inline bool ceph_entity_addr_equal(const struct ceph_entity_addr *a, + const struct ceph_entity_addr *b) +{ + return memcmp(a, b, sizeof(*a)) == 0; +} + +struct ceph_entity_inst { + struct ceph_entity_name name; + struct ceph_entity_addr addr; +} __attribute__ ((packed)); + + +/* + * connection negotiation + */ +struct ceph_msg_connect { + __le32 host_type; /* CEPH_ENTITY_TYPE_* */ + __le32 global_seq; + __le32 connect_seq; + __u8 flags; +} __attribute__ ((packed)); + +struct ceph_msg_connect_reply { + __u8 tag; + __le32 global_seq; + __le32 connect_seq; + __u8 flags; +} __attribute__ ((packed)); + +#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */ + + +/* + * message header + */ +struct ceph_msg_header { + __le64 seq; /* message seq# for this session */ + __le16 type; /* message type */ + __le16 priority; /* priority. 
higher value == higher priority */ + + __le32 front_len; /* bytes in main payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + __u8 mon_protocol, monc_protocol; /* protocol versions, */ + __u8 osd_protocol, osdc_protocol; /* internal and public */ + __u8 mds_protocol, mdsc_protocol; + + struct ceph_entity_inst src, orig_src, dst; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + +#define CEPH_MSG_PRIO_LOW 64 +#define CEPH_MSG_PRIO_DEFAULT 127 +#define CEPH_MSG_PRIO_HIGH 196 +#define CEPH_MSG_PRIO_HIGHEST 255 + +/* + * follows data payload + */ +struct ceph_msg_footer { + __le32 flags; + __le32 front_crc; + __le32 data_crc; +} __attribute__ ((packed)); + +#define CEPH_MSG_FOOTER_ABORTED (1<<0) /* drop this message */ +#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ + + +#endif diff --git a/src/include/rados.h b/src/include/rados.h new file mode 100644 index 0000000000000..7718dcef77f11 --- /dev/null +++ b/src/include/rados.h @@ -0,0 +1,366 @@ +#ifndef __RADOS_H +#define __RADOS_H + +#include "msgr.h" + + +/* + * fs id + */ +typedef struct { unsigned char fsid[16]; } ceph_fsid_t; + +static inline int ceph_fsid_compare(const ceph_fsid_t *a, + const ceph_fsid_t *b) +{ + return memcmp(a, b, sizeof(*a)); +} + +/* + * ino, object, etc. + */ +typedef __le64 ceph_snapid_t; +#define CEPH_MAXSNAP ((__u64)(-3)) +#define CEPH_SNAPDIR ((__u64)(-1)) +#define CEPH_NOSNAP ((__u64)(-2)) + +struct ceph_object { + union { + __u8 raw[20]; /* fits a sha1 hash */ + struct { + __le64 ino; /* inode "file" identifier */ + __le32 bno; /* "block" (object) in that "file" */ + __le64 snap; /* snapshot id. usually NOSNAP. 
*/ + } __attribute__ ((packed)); + }; +} __attribute__ ((packed)); + +struct ceph_timespec { + __le32 tv_sec; + __le32 tv_nsec; +} __attribute__ ((packed)); + + +/* + * object layout - how objects are mapped into PGs + */ +#define CEPH_OBJECT_LAYOUT_HASH 1 +#define CEPH_OBJECT_LAYOUT_LINEAR 2 +#define CEPH_OBJECT_LAYOUT_HASHINO 3 + +/* + * pg layout -- how PGs are mapped onto (sets of) OSDs + */ +#define CEPH_PG_LAYOUT_CRUSH 0 +#define CEPH_PG_LAYOUT_HASH 1 +#define CEPH_PG_LAYOUT_LINEAR 2 +#define CEPH_PG_LAYOUT_HYBRID 3 + + +/* + * placement group. + * we encode this into one __le64. + */ +#define CEPH_PG_TYPE_REP 1 +#define CEPH_PG_TYPE_RAID4 2 +union ceph_pg { + __u64 pg64; + struct { + __s16 preferred; /* preferred primary osd */ + __u16 ps; /* placement seed */ + __u32 pool; /* implies crush ruleset */ + } pg; +} __attribute__ ((packed)); + +#define ceph_pg_is_rep(pg) ((pg).pg.type == CEPH_PG_TYPE_REP) +#define ceph_pg_is_raid4(pg) ((pg).pg.type == CEPH_PG_TYPE_RAID4) + +/* + * pg_pool is a set of pgs storing a pool of objects + * + * pg_num -- base number of pseudorandomly placed pgs + * + * pgp_num -- effective number when calculating pg placement. this + * is used for pg_num increases. new pgs result in data being "split" + * into new pgs. for this to proceed smoothly, new pgs are initially + * colocated with their parents; that is, pgp_num doesn't increase + * until the new pgs have successfully split. only _then_ are the new + * pgs placed independently. + * + * lpg_num -- localized pg count (per device). replicas are randomly + * selected. + * + * lpgp_num -- as above. + */ +struct ceph_pg_pool { + __u8 type; + __u8 size; + __u8 crush_ruleset; + __le32 pg_num, pgp_num; + __le32 lpg_num, lpgp_num; + __le32 last_change; /* most recent epoch changed */ +} __attribute__ ((packed)); + +/* + * stable_mod func is used to control number of placement groups. + * similar to straight-up modulo, but produces a stable mapping as b + * increases over time. 
b is the number of bins, and bmask is the + * containing power of 2 minus 1. + * + * b <= bmask and bmask=(2**n)-1 + * e.g., b=12 -> bmask=15, b=123 -> bmask=127 + */ +static inline int ceph_stable_mod(int x, int b, int bmask) +{ + if ((x & bmask) < b) + return x & bmask; + else + return x & (bmask >> 1); +} + +/* + * object layout - how a given object should be stored. + */ +struct ceph_object_layout { + __le64 ol_pgid; /* raw pg, with _full_ ps precision. */ + __le32 ol_stripe_unit; +} __attribute__ ((packed)); + +/* + * compound epoch+version, used by storage layer to serialize mutations + */ +struct ceph_eversion { + __le32 epoch; + __le64 version; +} __attribute__ ((packed)); + +/* + * osd map bits + */ + +/* status bits */ +#define CEPH_OSD_EXISTS 1 +#define CEPH_OSD_UP 2 + +/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ +#define CEPH_OSD_IN 0x10000 +#define CEPH_OSD_OUT 0 + + +/* + * osd map flag bits + */ +#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ +#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ +#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ +#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ +#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ + +/* + * osd ops + */ +#define CEPH_OSD_OP_MODE 0xf000 +#define CEPH_OSD_OP_MODE_RD 0x1000 +#define CEPH_OSD_OP_MODE_WR 0x2000 +#define CEPH_OSD_OP_MODE_SUB 0x4000 + +#define CEPH_OSD_OP_TYPE 0x0f00 +#define CEPH_OSD_OP_TYPE_LOCK 0x0100 +#define CEPH_OSD_OP_TYPE_DATA 0x0200 +#define CEPH_OSD_OP_TYPE_ATTR 0x0300 + +enum { + /** data **/ + /* read */ + CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, + CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, + + /* fancy read */ + CEPH_OSD_OP_GREP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3, + CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, + + /* write */ + CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA 
| 1, + CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, + CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3, + CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4, + CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5, + + /* fancy write */ + CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6, + CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7, + CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8, + CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9, + + /** attrs **/ + /* read */ + CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, + CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, + + /* write */ + CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, + CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2, + CEPH_OSD_OP_RESETXATTRS= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 3, + CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, + + /** subop **/ + CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, + CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, + CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, + CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, + CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, + + /** lock **/ + CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, + CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2, + CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3, + CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4, + CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5, + CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, +}; + +static inline int ceph_osd_op_type_lock(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK; +} +static inline int ceph_osd_op_type_data(int 
op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; +} +static inline int ceph_osd_op_type_attr(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; +} + +static inline int ceph_osd_op_mode_subop(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; +} +static inline int ceph_osd_op_mode_read(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; +} +static inline int ceph_osd_op_mode_modify(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; +} + +static inline const char *ceph_osd_op_name(int op) +{ + switch (op) { + case CEPH_OSD_OP_READ: return "read"; + case CEPH_OSD_OP_STAT: return "stat"; + + case CEPH_OSD_OP_GREP: return "grep"; + case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; + + case CEPH_OSD_OP_WRITE: return "write"; + case CEPH_OSD_OP_DELETE: return "delete"; + case CEPH_OSD_OP_TRUNCATE: return "truncate"; + case CEPH_OSD_OP_ZERO: return "zero"; + case CEPH_OSD_OP_WRITEFULL: return "writefull"; + + case CEPH_OSD_OP_APPEND: return "append"; + case CEPH_OSD_OP_STARTSYNC: return "startsync"; + case CEPH_OSD_OP_SETTRUNC: return "settrunc"; + case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc"; + + case CEPH_OSD_OP_GETXATTR: return "getxattr"; + case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; + case CEPH_OSD_OP_SETXATTR: return "setxattr"; + case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; + case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; + case CEPH_OSD_OP_RMXATTR: return "rmxattr"; + + case CEPH_OSD_OP_PULL: return "pull"; + case CEPH_OSD_OP_PUSH: return "push"; + case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; + case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; + case CEPH_OSD_OP_SCRUB: return "scrub"; + + case CEPH_OSD_OP_WRLOCK: return "wrlock"; + case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; + case CEPH_OSD_OP_RDLOCK: return "rdlock"; + case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; + case CEPH_OSD_OP_UPLOCK: return "uplock"; + case CEPH_OSD_OP_DNLOCK: return 
"dnlock"; + + default: return "???"; + } +} + + +/* + * osd op flags + */ +enum { + CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ + CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ + CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ + CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ + CEPH_OSD_FLAG_INCLOCK_FAIL = 16, /* fail on inclock collision */ + CEPH_OSD_FLAG_MODIFY = 32, /* op is/was a mutation */ + CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ + CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ + CEPH_OSD_FLAG_BALANCE_READS = 256, +}; + +#define EOLDSNAPC ERESTART /* ORDERSNAP flag set and writer has old snap context*/ +#define EBLACKLISTED ESHUTDOWN /* blacklisted */ + +struct ceph_osd_op { + __le16 op; + union { + struct { + __le64 offset, length; + }; + struct { + __le32 name_len; + __le32 value_len; + }; + struct { + __le64 truncate_size; + __le32 truncate_seq; + }; + }; +} __attribute__ ((packed)); + +struct ceph_osd_request_head { + __le64 tid; + __le32 client_inc; + struct ceph_object oid; + struct ceph_object_layout layout; + __le32 osdmap_epoch; + + __le32 flags; + __le32 inc_lock; + + struct ceph_timespec mtime; + struct ceph_eversion reassert_version; + + /* writer's snap context */ + __le64 snap_seq; + __le32 num_snaps; + + /* read or mutation */ + __le16 num_ops; + __u16 object_type; + struct ceph_osd_op ops[]; /* followed by snaps */ +} __attribute__ ((packed)); + +struct ceph_osd_reply_head { + __le64 tid; + __le32 client_inc; + __le32 flags; + struct ceph_object oid; + struct ceph_object_layout layout; + __le32 osdmap_epoch; + struct ceph_eversion reassert_version; + + __le32 result; + + __le32 num_ops; + struct ceph_osd_op ops[0]; +} __attribute__ ((packed)); + + +#endif diff --git a/src/kernel/import_patch_set_into_linux_git.sh b/src/kernel/import_patch_set_into_linux_git.sh index 7ee2210cbc576..6ec9fc2f7b9f9 100755 --- a/src/kernel/import_patch_set_into_linux_git.sh +++ 
b/src/kernel/import_patch_set_into_linux_git.sh @@ -29,10 +29,12 @@ Mount options, syntax. EOF git add fs/ceph/ceph_fs.h +git add fs/ceph/msgr.h +git add fs/ceph/rados.h git commit -F - <