From 5d184bd6ad12264617bc88aa76c8be7f332ed93a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 14 Oct 2008 16:15:54 -0700 Subject: [PATCH] kclient: super.h comments, cleanup, some minor cruft removed --- src/TODO | 3 + src/kernel/caps.c | 4 - src/kernel/super.c | 1 - src/kernel/super.h | 206 +++++++++++++++++++++++++++------------------ 4 files changed, 129 insertions(+), 85 deletions(-) diff --git a/src/TODO b/src/TODO index 99d53edd7bb26..45ff54fab180a 100644 --- a/src/TODO +++ b/src/TODO @@ -1,3 +1,6 @@ +kclient +- rip out STATIC_CAPS + v0.5 - debug restart, cosd reformat, etc. - finish btrfs ioctl interface diff --git a/src/kernel/caps.c b/src/kernel/caps.c index c33e94cec73dc..1f61255fe1627 100644 --- a/src/kernel/caps.c +++ b/src/kernel/caps.c @@ -134,8 +134,6 @@ retry: cap->issued = cap->implemented = 0; cap->mds = mds; - cap->flags = 0; - cap->flushed_snap = 0; is_first = RB_EMPTY_ROOT(&ci->i_caps); /* grab inode later */ cap->ci = ci; @@ -1185,8 +1183,6 @@ int __ceph_send_cap(struct ceph_mds_client *mdsc, atime = inode->i_atime; time_warp_seq = ci->i_time_warp_seq; follows = ci->i_snap_realm->cached_context->seq; - if (flush_snap) - cap->flushed_snap = follows; /* so we only flush it once */ spin_unlock(&inode->i_lock); if (dropping & CEPH_CAP_RDCACHE) { diff --git a/src/kernel/super.c b/src/kernel/super.c index a5b5c645b11c7..0eff5213e15ef 100644 --- a/src/kernel/super.c +++ b/src/kernel/super.c @@ -680,7 +680,6 @@ struct ceph_client *ceph_create_client(void) mutex_init(&client->mount_mutex); init_waitqueue_head(&client->mount_wq); - spin_lock_init(&client->sb_lock); client->sb = 0; client->mount_state = CEPH_MOUNT_MOUNTING; diff --git a/src/kernel/super.h b/src/kernel/super.h index a4769a56e7198..75efa1299eb6e 100644 --- a/src/kernel/super.h +++ b/src/kernel/super.h @@ -32,8 +32,6 @@ extern int ceph_debug_caps; extern int ceph_debug_mask; -#define CEPH_DUMP_ERROR_ALWAYS - #define dout_flag(x, mask, args...) do { \ if (((ceph_debug_mask | DOUT_UNMASKABLE) & mask) && \ ((DOUT_VAR >= 0 && x <= DOUT_VAR) || \ @@ -47,20 +45,13 @@ extern int ceph_debug_mask; #define dout(x, args...) dout_flag(x, DOUT_MASK, args) -#ifdef CEPH_DUMP_ERROR_ALWAYS #define derr(x, args...) do { \ printk(KERN_ERR "ceph: " args); \ } while (0) -#else -#define derr(x, args...) do { \ - if ((DOUT_VAR >= 0 && x <= DOUT_VAR) || \ - (DOUT_VAR < 0 && x <= ceph_debug)) \ - printk(KERN_ERR "ceph_" DOUT_PREFIX args); \ - } while (0) -#endif -#define CEPH_SUPER_MAGIC 0xc364c0de /* whatev */ -#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ +#define CEPH_SUPER_MAGIC 0x00c36400 + +#define CEPH_BLOCK_SHIFT 20 /* 1 MB blocks for purposes of statfs*/ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) #define IPQUADPORT(n) \ @@ -71,18 +62,20 @@ extern int ceph_debug_mask; (unsigned int)(ntohs((n).sin_port)) -#define dput(dentry) \ +#if 0 +# define dput(dentry) \ do { \ dout(20, "dput %p %d -> %d\n", dentry, \ atomic_read(&dentry->d_count), \ atomic_read(&dentry->d_count)-1); \ dput(dentry); \ } while (0) -#define d_drop(dentry) \ +# define d_drop(dentry) \ do { \ dout(20, "d_drop %p\n", dentry); \ d_drop(dentry); \ } while (0) +#endif /* * subtract jiffies @@ -100,13 +93,13 @@ static inline unsigned long time_sub(unsigned long a, unsigned long b) #define CEPH_MOUNT_NOSHARE (1<<1) /* don't share client with other sbs */ #define CEPH_MOUNT_MYIP (1<<2) /* specified my ip */ #define CEPH_MOUNT_UNSAFE_WRITEBACK (1<<3) -#define CEPH_MOUNT_DIRSTAT (1<<4) -#define CEPH_MOUNT_RBYTES (1<<5) -#define CEPH_MOUNT_NOCRC (1<<6) +#define CEPH_MOUNT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */ +#define CEPH_MOUNT_RBYTES (1<<5) /* dir st_bytes = rbytes */ +#define CEPH_MOUNT_NOCRC (1<<6) /* no data crc on writes */ #define CEPH_MOUNT_DEFAULT (CEPH_MOUNT_RBYTES) -#define CEPH_DEFAULT_READ_SIZE (128*1024) +#define CEPH_DEFAULT_READ_SIZE (128*1024) /* readahead */ #define MAX_MON_MOUNT_ADDR 5 @@ -119,9 +112,9 @@ struct ceph_mount_args { int num_mon; struct ceph_entity_addr mon_addr[MAX_MON_MOUNT_ADDR]; int wsize; - int rsize; + int rsize; /* max readahead */ int osd_timeout; - char *snapdir_name; + char *snapdir_name; /* default ".snap" */ }; enum { @@ -165,25 +158,24 @@ struct ceph_client { struct kobject *client_kobj; struct backing_dev_info backing_dev_info; - - /* lets ignore all this until later */ - spinlock_t sb_lock; - int num_sb; /* ref count (for each sb_info that points to me) */ - struct list_head sb_list; }; -/* - * CEPH per-mount superblock info - */ static inline struct ceph_client *ceph_client(struct super_block *sb) { return sb->s_fs_info; } /* - * file i/o capability + * File i/o capability. This tracks shared state with the metadata + * server that allows us to read and write data to this file. For any + * given inode, we may have multiple capabilities, one issued by each + * metadata server, and our cumulative access is the OR of all issued + * capabilities. + * + * Each cap is referenced by the inode's i_caps tree and by a per-mds + * session capability list. */ -#define STATIC_CAPS 1 +#define STATIC_CAPS 1 /* how many to embed in each inode? */ struct ceph_cap { struct ceph_inode_info *ci; @@ -192,14 +184,14 @@ struct ceph_cap { struct list_head session_caps; /* per-session caplist */ int mds; /* must be -1 if not in use */ int issued; /* latest, from the mds */ - int implemented; /* what we've implemneted (for tracking revocation) */ + int implemented; /* what we've implemented (for tracking revocation) */ u32 seq, mseq, gen; - int flags; /* stale, etc.? */ - u64 flushed_snap; }; /* - * snapped cap state, pending flush to mds + * Snapped cap state that is pending flush to mds. When a snapshot occurs, + * we first complete any in-process sync writes and writeback any dirty + * data before flushing the snapped state (tracked here) back to the MDS. */ struct ceph_cap_snap { struct list_head ci_item; @@ -209,13 +201,17 @@ struct ceph_cap_snap { struct timespec mtime, atime, ctime; u64 time_warp_seq; struct ceph_snap_context *context; - int writing; - int dirty; + int writing; /* a sync write is still in progress */ + int dirty; /* dirty pages awaiting writeback */ }; /* - * a _leaf_ frag will be present in the i_fragtree IFF there is - * delegation info. that is, if mds >= 0 || ndist > 0. + * The frag tree describes how a directory is fragmented, potentially across + * multiple metadata servers. It is also used to indicate points where + * metadata authority is delegated, and whether/where metadata is replicated. + * + * A _leaf_ frag will be present in the i_fragtree IFF there is + * delegation info. That is, if mds >= 0 || ndist > 0. */ #define MAX_DIRFRAG_REP 4 @@ -224,20 +220,18 @@ struct ceph_inode_frag { /* fragtree state */ u32 frag; - int split_by; + int split_by; /* i.e. 2^(split_by) children */ /* delegation info */ - int mds; /* -1 if parent */ - int ndist; + int mds; /* -1 if same authority as parent */ + int ndist; /* >0 if replicated */ int dist[MAX_DIRFRAG_REP]; }; -struct ceph_dir_info { - u64 nfiles, nsubdirs; - u64 bytes; -}; - +/* + * Ceph inode. + */ struct ceph_inode_info { struct ceph_vino i_vino; /* ceph ino + snap */ @@ -252,46 +246,51 @@ struct ceph_inode_info { u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; + /* inode lease state */ int i_lease_mask; struct ceph_mds_session *i_lease_session; - long unsigned i_lease_ttl; /* jiffies */ + long unsigned i_lease_ttl; /* jiffies */ u32 i_lease_gen; struct list_head i_lease_item; /* mds session list */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; + /* (still encoded) xattr blob */ int i_xattr_len; char *i_xattr_data; - struct rb_root i_caps; + /* capabilities */ + struct rb_root i_caps; /* cap list */ struct ceph_cap i_static_caps[STATIC_CAPS]; - wait_queue_head_t i_cap_wq; + wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ unsigned long i_hold_caps_until; /* jiffies */ - struct list_head i_cap_delay_list; - int i_cap_exporting_mds; - unsigned i_cap_exporting_mseq; + struct list_head i_cap_delay_list; /* for delayed cap release to mds */ + int i_cap_exporting_mds; /* to handle cap migration between */ + unsigned i_cap_exporting_mseq; /* mds's. */ unsigned i_cap_exporting_issued; - struct list_head i_cap_snaps; - unsigned i_snap_caps; /* cap bits for snap i/o */ + struct list_head i_cap_snaps; /* snapped state pending flush to mds */ + unsigned i_snap_caps; /* cap bits for snapped files */ - int i_nr_by_mode[CEPH_FILE_MODE_NUM]; - loff_t i_max_size; /* size authorized by mds */ + int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + + loff_t i_max_size; /* max file size authorized by mds */ loff_t i_reported_size; /* (max_)size reported to or requested of mds */ loff_t i_wanted_max_size; /* offset we'd like to write too */ loff_t i_requested_max_size; /* max_size we've requested */ + struct timespec i_old_atime; /* held references to caps */ int i_rd_ref, i_rdcache_ref, i_wr_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; - struct ceph_snap_realm *i_snap_realm; + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ struct list_head i_snap_realm_item; struct work_struct i_wb_work; /* writeback work */ - loff_t i_vmtruncate_to; + loff_t i_vmtruncate_to; /* delayed truncate work */ struct work_struct i_vmtruncate_work; struct inode vfs_inode; /* at end */ @@ -302,6 +301,7 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode) return list_entry(inode, struct ceph_inode_info, vfs_inode); } +/* find a specific frag @f */ static inline struct ceph_inode_frag * __ceph_find_frag(struct ceph_inode_info *ci, u32 f) { @@ -321,10 +321,17 @@ __ceph_find_frag(struct ceph_inode_info *ci, u32 f) return NULL; } +/* + * choose fragment for value @v. copy frag content to pfrag, if leaf + * exists + */ extern __u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found); +/* + * Ceph dentry state + */ struct ceph_dentry_info { struct dentry *dentry; struct ceph_mds_session *lease_session; @@ -340,7 +347,8 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) /* * ino_t is <64 bits on many architectures, blech. - * don't include snap in hash... just for now! + * + * don't include snap in ino hash, at leaset for now. */ static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) { @@ -362,6 +370,8 @@ static inline struct ceph_vino ceph_vino(struct inode *inode) { return ceph_inode(inode)->i_vino; } + +/* for printf-style formatting */ #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap static inline u64 ceph_ino(struct inode *inode) @@ -417,6 +427,9 @@ static inline int __ceph_caps_used(struct ceph_inode_info *ci) return used; } +/* + * wanted, by virtue of open file modes + */ static inline int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { int want = 0; @@ -427,14 +440,18 @@ static inline int __ceph_caps_file_wanted(struct ceph_inode_info *ci) return want; } +/* + * wanted, by virtual of open file modes AND cap refs (buffered/cached data) + */ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) { int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); if (w & CEPH_CAP_WRBUFFER) - w |= CEPH_CAP_EXCL; /* want EXCL if we have dirty data */ + w |= CEPH_CAP_EXCL; /* we want EXCL if we have dirty data */ return w; } +/* for counting open files by mode */ static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) { ci->i_nr_by_mode[mode]++; @@ -451,7 +468,6 @@ static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) return (struct ceph_client *)sb->s_fs_info; } - static inline void ceph_queue_writeback(struct inode *inode) { queue_work(ceph_inode_to_client(inode)->wb_wq, @@ -476,6 +492,14 @@ struct ceph_file_info { * snapshots */ +/* + * A "snap context" is the set of existing snapshots when we + * write data. It is used by the OSD to guide its COW behavior. + * + * The ceph_snap_context is refcounted, and attached to each dirty + * page, indicating which context the dirty data belonged when it was + * dirtied. + */ struct ceph_snap_context { atomic_t nref; u64 seq; @@ -483,7 +507,8 @@ struct ceph_snap_context { u64 snaps[]; }; -static inline struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc) +static inline struct ceph_snap_context * +ceph_get_snap_context(struct ceph_snap_context *sc) { /* printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), @@ -508,27 +533,51 @@ static inline void ceph_put_snap_context(struct ceph_snap_context *sc) } } +/* + * A "snap realm" describes a subset of the file hierarchy sharing + * the same set of snapshots that apply to it. The realms themselves + * are organized into a hierarchy, such that children inherit (some of) + * the snapshots of their parents. + * + * All inodes within the realm that have capabilities are linked into a + * per-realm list. + */ struct ceph_snap_realm { u64 ino; int nref; u64 created, seq; u64 parent_ino; - u64 parent_since; + u64 parent_since; /* snapid when our current parent became so */ - u64 *prior_parent_snaps; - int num_prior_parent_snaps; - u64 *snaps; + u64 *prior_parent_snaps; /* snaps inherited from any parents we */ + int num_prior_parent_snaps; /* had prior to parent_since */ + u64 *snaps; /* snaps specific to this realm */ int num_snaps; struct ceph_snap_realm *parent; + struct list_head children; /* list of child realms */ struct list_head child_item; - struct list_head children; + /* the current set of snaps for this realm */ struct ceph_snap_context *cached_context; struct list_head inodes_with_caps; }; + + +/* + * calculate the number of pages a given length and offset map onto, + * if we align the data. + */ +static inline int calc_pages_for(u64 off, u64 len) +{ + return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - + (off >> PAGE_CACHE_SHIFT); +} + + + /* snap.c */ extern void ceph_put_snap_realm(struct ceph_snap_realm *realm); extern struct ceph_snap_realm *ceph_update_snap_trace(struct ceph_mds_client *m, @@ -542,6 +591,10 @@ extern void __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap, int used); +/* + * a cap_snap is "pending" if it is still awaiting an in-progress + * sync write (that may/may not still update size, mtime, etc.). + */ inline static bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) { return !list_empty(&ci->i_cap_snaps) && @@ -550,18 +603,6 @@ inline static bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) } - -/* - * calculate the number of pages a given length and offset map onto, - * if we align the data. - */ -static inline int calc_pages_for(u64 off, u64 len) -{ - return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - - (off >> PAGE_CACHE_SHIFT); -} - - /* super.c */ extern const char *ceph_msg_type_name(int type); @@ -613,7 +654,8 @@ extern void ceph_remove_cap(struct ceph_cap *cap); extern void ceph_remove_all_caps(struct ceph_inode_info *ci); extern int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq); extern int ceph_get_cap_mds(struct inode *inode); -extern int ceph_get_cap_refs(struct ceph_inode_info *ci, int need, int want, int *got, loff_t offset); +extern int ceph_get_cap_refs(struct ceph_inode_info *ci, int need, int want, + int *got, loff_t offset); extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int got); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, @@ -658,6 +700,10 @@ extern struct dentry *ceph_do_lookup(struct super_block *sb, extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); +/* + * our d_ops vary depending on whether the inode is live, + * snapshotted (read-only), or a virtual ".snap" directory. + */ static inline void ceph_init_dentry(struct dentry *dentry) { if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) dentry->d_op = &ceph_dentry_ops; -- 2.39.5