From 3d14c5d2b6e15c21d8e5467dc62d33127c23a644 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Tue, 6 Apr 2010 15:14:15 -0700 Subject: ceph: factor out libceph from Ceph file system This factors out protocol and low-level storage parts of ceph into a separate libceph module living in net/ceph and include/linux/ceph. This is mostly a matter of moving files around. However, a few key pieces of the interface change as well: - ceph_client becomes ceph_fs_client and ceph_client, where the latter captures the mon and osd clients, and the fs_client gets the mds client and file system specific pieces. - Mount option parsing and debugfs setup is correspondingly broken into two pieces. - The mon client gets a generic handler callback for otherwise unknown messages (mds map, in this case). - The basic supported/required feature bits can be expanded (and are by ceph_fs_client). No functional change, aside from some subtle error handling cases that got cleaned up in the refactoring process. Signed-off-by: Sage Weil --- fs/ceph/super.h | 397 ++++++++++++++++++++------------------------------------ 1 file changed, 143 insertions(+), 254 deletions(-) (limited to 'fs/ceph/super.h') diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b87638e..e2e9044 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1,7 +1,7 @@ #ifndef _FS_CEPH_SUPER_H #define _FS_CEPH_SUPER_H -#include "ceph_debug.h" +#include #include #include @@ -14,13 +14,7 @@ #include #include -#include "types.h" -#include "messenger.h" -#include "msgpool.h" -#include "mon_client.h" -#include "mds_client.h" -#include "osd_client.h" -#include "ceph_fs.h" +#include /* f_type in struct statfs */ #define CEPH_SUPER_MAGIC 0x00c36400 @@ -30,42 +24,25 @@ #define CEPH_BLOCK_SHIFT 20 /* 1 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) -/* - * Supported features - */ -#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK -#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR +#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ +#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ +#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ -/* - * mount options - */ -#define CEPH_OPT_FSID (1<<0) -#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ -#define CEPH_OPT_MYIP (1<<2) /* specified my ip */ -#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */ -#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ -#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */ -#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) -#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) +#define ceph_set_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; +#define ceph_test_mount_opt(fsc, opt) \ + (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define ceph_set_opt(client, opt) \ - (client)->mount_args->flags |= CEPH_OPT_##opt; -#define ceph_test_opt(client, opt) \ - (!!((client)->mount_args->flags & CEPH_OPT_##opt)) +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) +#define CEPH_SNAPDIRNAME_DEFAULT ".snap" - -struct ceph_mount_args { - int sb_flags; +struct ceph_mount_options { int flags; - struct ceph_fsid fsid; - struct ceph_entity_addr my_addr; - int num_mon; - struct ceph_entity_addr *mon_addr; - int mount_timeout; - int osd_idle_ttl; - int osd_timeout; - int osd_keepalive_timeout; + int sb_flags; + int wsize; int rsize; /* max readahead */ int congestion_kb; /* max writeback in flight */ @@ -73,82 +50,25 @@ struct ceph_mount_args { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ - char *snapdir_name; /* default ".snap" */ - char *name; - char *secret; -}; - -/* - * defaults - */ -#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 -#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ -#define CEPH_OSD_KEEPALIVE_DEFAULT 5 -#define CEPH_OSD_IDLE_TTL_DEFAULT 60 -#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ -#define CEPH_MAX_READDIR_DEFAULT 1024 -#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) - -#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) -#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) - -#define CEPH_SNAPDIRNAME_DEFAULT ".snap" -#define CEPH_AUTH_NAME_DEFAULT "guest" -/* - * Delay telling the MDS we no longer want caps, in case we reopen - * the file. Delay a minimum amount of time, even if we send a cap - * message for some other reason. Otherwise, take the oppotunity to - * update the mds to avoid sending another message later. - */ -#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ -#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ - -#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) - -/* mount state */ -enum { - CEPH_MOUNT_MOUNTING, - CEPH_MOUNT_MOUNTED, - CEPH_MOUNT_UNMOUNTING, - CEPH_MOUNT_UNMOUNTED, - CEPH_MOUNT_SHUTDOWN, -}; -/* - * subtract jiffies - */ -static inline unsigned long time_sub(unsigned long a, unsigned long b) -{ - BUG_ON(time_after(b, a)); - return (long)a - (long)b; -} - -/* - * per-filesystem client state - * - * possibly shared by multiple mount points, if they are - * mounting the same ceph filesystem/cluster. - */ -struct ceph_client { - struct ceph_fsid fsid; - bool have_fsid; + /* + * everything above this point can be memcmp'd; everything below + * is handled in compare_mount_options() + */ - struct mutex mount_mutex; /* serialize mount attempts */ - struct ceph_mount_args *mount_args; + char *snapdir_name; /* default ".snap" */ +}; +struct ceph_fs_client { struct super_block *sb; - unsigned long mount_state; - wait_queue_head_t auth_wq; - - int auth_err; + struct ceph_mount_options *mount_options; + struct ceph_client *client; + unsigned long mount_state; int min_caps; /* min caps i added */ - struct ceph_messenger *msgr; /* messenger instance */ - struct ceph_mon_client monc; - struct ceph_mds_client mdsc; - struct ceph_osd_client osdc; + struct ceph_mds_client *mdsc; /* writeback */ mempool_t *wb_pagevec_pool; @@ -160,14 +80,14 @@ struct ceph_client { struct backing_dev_info backing_dev_info; #ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_monmap; - struct dentry *debugfs_mdsmap, *debugfs_osdmap; - struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; + struct dentry *debugfs_mdsc, *debugfs_mdsmap; #endif }; + /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -275,6 +195,20 @@ struct ceph_inode_xattr { int should_free_val; }; +/* + * Ceph dentry state + */ +struct ceph_dentry_info { + struct ceph_mds_session *lease_session; + u32 lease_gen, lease_shared_gen; + u32 lease_seq; + unsigned long lease_renew_after, lease_renew_from; + struct list_head lru; + struct dentry *dentry; + u64 time; + u64 offset; +}; + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info { /* * Ceph inode. */ -#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ - struct ceph_inode_info { struct ceph_vino i_vino; /* ceph ino + snap */ @@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode) return container_of(inode, struct ceph_inode_info, vfs_inode); } +static inline struct ceph_vino ceph_vino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino; +} + +/* + * ino_t is <64 bits on many architectures, blech. + * + * don't include snap in ino hash, at least for now. + */ +static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) +{ + ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ +#if BITS_PER_LONG == 32 + ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; + if (!ino) + ino = 1; +#endif + return ino; +} + +/* for printf-style formatting */ +#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap + +static inline u64 ceph_ino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.ino; +} +static inline u64 ceph_snap(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.snap; +} + +static inline int ceph_ino_compare(struct inode *inode, void *data) +{ + struct ceph_vino *pvino = (struct ceph_vino *)data; + struct ceph_inode_info *ci = ceph_inode(inode); + return ci->i_vino.ino == pvino->ino && + ci->i_vino.snap == pvino->snap; +} + +static inline struct inode *ceph_find_inode(struct super_block *sb, + struct ceph_vino vino) +{ + ino_t t = ceph_vino_to_ino(vino); + return ilookup5(sb, t, ceph_ino_compare, &vino); +} + + +/* + * Ceph inode. + */ +#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ + static inline void ceph_i_clear(struct inode *inode, unsigned mask) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -432,20 +418,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found); -/* - * Ceph dentry state - */ -struct ceph_dentry_info { - struct ceph_mds_session *lease_session; - u32 lease_gen, lease_shared_gen; - u32 lease_seq; - unsigned long lease_renew_after, lease_renew_from; - struct list_head lru; - struct dentry *dentry; - u64 time; - u64 offset; -}; - static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) { return (struct ceph_dentry_info *)dentry->d_fsdata; @@ -456,22 +428,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) return ((loff_t)frag << 32) | (loff_t)off; } -/* - * ino_t is <64 bits on many architectures, blech. - * - * don't include snap in ino hash, at least for now. - */ -static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) -{ - ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ -#if BITS_PER_LONG == 32 - ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; - if (!ino) - ino = 1; -#endif - return ino; -} - static inline int ceph_set_ino_cb(struct inode *inode, void *data) { ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; @@ -479,39 +435,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } -static inline struct ceph_vino ceph_vino(struct inode *inode) -{ - return ceph_inode(inode)->i_vino; -} - -/* for printf-style formatting */ -#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap - -static inline u64 ceph_ino(struct inode *inode) -{ - return ceph_inode(inode)->i_vino.ino; -} -static inline u64 ceph_snap(struct inode *inode) -{ - return ceph_inode(inode)->i_vino.snap; -} - -static inline int ceph_ino_compare(struct inode *inode, void *data) -{ - struct ceph_vino *pvino = (struct ceph_vino *)data; - struct ceph_inode_info *ci = ceph_inode(inode); - return ci->i_vino.ino == pvino->ino && - ci->i_vino.snap == pvino->snap; -} - -static inline struct inode *ceph_find_inode(struct super_block *sb, - struct ceph_vino vino) -{ - ino_t t = ceph_vino_to_ino(vino); - return ilookup5(sb, t, ceph_ino_compare, &vino); -} - - /* * caps helpers */ @@ -576,18 +499,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); -extern void ceph_reservation_status(struct ceph_client *client, +extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); -static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) { - return (struct ceph_client *)inode->i_sb->s_fs_info; + return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } -static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) { - return (struct ceph_client *)sb->s_fs_info; + return (struct ceph_fs_client *)sb->s_fs_info; } @@ -617,51 +540,6 @@ struct ceph_file_info { /* - * snapshots - */ - -/* - * A "snap context" is the set of existing snapshots when we - * write data. It is used by the OSD to guide its COW behavior. - * - * The ceph_snap_context is refcounted, and attached to each dirty - * page, indicating which context the dirty data belonged when it was - * dirtied. - */ -struct ceph_snap_context { - atomic_t nref; - u64 seq; - int num_snaps; - u64 snaps[]; -}; - -static inline struct ceph_snap_context * -ceph_get_snap_context(struct ceph_snap_context *sc) -{ - /* - printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)+1); - */ - if (sc) - atomic_inc(&sc->nref); - return sc; -} - -static inline void ceph_put_snap_context(struct ceph_snap_context *sc) -{ - if (!sc) - return; - /* - printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)-1); - */ - if (atomic_dec_and_test(&sc->nref)) { - /*printk(" deleting snap_context %p\n", sc);*/ - kfree(sc); - } -} - -/* * A "snap realm" describes a subset of the file hierarchy sharing * the same set of snapshots that apply to it. The realms themselves * are organized into a hierarchy, such that children inherit (some of) @@ -699,16 +577,33 @@ struct ceph_snap_realm { spinlock_t inodes_with_caps_lock; }; - - -/* - * calculate the number of pages a given length and offset map onto, - * if we align the data. - */ -static inline int calc_pages_for(u64 off, u64 len) +static inline int default_congestion_kb(void) { - return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - - (off >> PAGE_CACHE_SHIFT); + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; } @@ -741,16 +636,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) ci_item)->writing; } - -/* super.c */ -extern struct kmem_cache *ceph_inode_cachep; -extern struct kmem_cache *ceph_cap_cachep; -extern struct kmem_cache *ceph_dentry_cachep; -extern struct kmem_cache *ceph_file_cachep; - -extern const char *ceph_msg_type_name(int type); -extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); - /* inode.c */ extern const struct inode_operations ceph_file_iops; @@ -857,12 +742,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); /* file.c */ extern const struct file_operations ceph_file_fops; extern const struct address_space_operations ceph_aops; +extern int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len); +extern int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len); +extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); extern int ceph_open(struct inode *inode, struct file *file); extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd, int mode, int locked_dir); extern int ceph_release(struct inode *inode, struct file *filp); -extern void ceph_release_page_vector(struct page **pages, int num_pages); /* dir.c */ extern const struct file_operations ceph_dir_fops; @@ -892,12 +783,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* export.c */ extern const struct export_operations ceph_export_ops; -/* debugfs.c */ -extern int ceph_debugfs_init(void); -extern void ceph_debugfs_cleanup(void); -extern int ceph_debugfs_client_init(struct ceph_client *client); -extern void ceph_debugfs_client_cleanup(struct ceph_client *client); - /* locks.c */ extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); @@ -914,4 +799,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) return NULL; } +/* debugfs.c */ +extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); +extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); + #endif /* _FS_CEPH_SUPER_H */ -- cgit v1.1