+++ /dev/null
-*.o.cmd
-*.ko
-*.ko.cmd
-*.mod.c
+++ /dev/null
-config CEPH_FS
- tristate "Ceph distributed file system (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
- select LIBCRC32C
- help
- Choose Y or M here to include support for mounting the
- experimental Ceph distributed file system. Ceph is an extremely
- scalable file system designed to provide high performance,
- reliable access to petabytes of storage.
-
- More information at http://ceph.newdream.net/.
-
- If unsure, say N.
-
-config CEPH_FS_PRETTYDEBUG
- bool "Include file:line in ceph debug output"
- depends on CEPH_FS
- default n
- help
- If you say Y here, debug output will include a filename and
- line to aid debugging. This increases kernel size and slows
- execution slightly when debug call sites are enabled (e.g.,
- via CONFIG_DYNAMIC_DEBUG).
-
- If unsure, say N.
-
+++ /dev/null
-#
-# Makefile for CEPH filesystem.
-#
-
-ifneq ($(KERNELRELEASE),)
-
-obj-$(CONFIG_CEPH_FS) += ceph.o
-
-ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
- export.o caps.o snap.o xattr.o \
- messenger.o msgpool.o buffer.o \
- mds_client.o mdsmap.o \
- mon_client.o \
- osd_client.o osdmap.o crush/crush.o crush/mapper.o \
- debugfs.o \
- ceph_fs.o ceph_strings.o ceph_frag.o
-
-else
-#Otherwise we were called directly from the command
-# line; invoke the kernel build system.
-
-KERNELDIR ?= /lib/modules/$(shell uname -r)/build
-PWD := $(shell pwd)
-
-default: all
-
-all:
- $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
-
-modules_install:
- $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
-
-clean:
- $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
-
-endif
+++ /dev/null
-Quick and dirty instructions on building this into the kernel
-
--- as a separate module
-
-$ make
-or
-$ make KERNELDIR=/path/to/kernel
-
-
--- built in
-
-1) Patch kernel (to add to fs/Kconfig, fs/Makefile):
-
-$ cd linux
-$ patch -p1 < ~/ceph/src/kernel/kconfig.patch
-patching file fs/Kconfig
-patching file fs/Makefile
-
-2) Symlink (adjust path to ceph source accordingly)
-
-$ ln -s ~/ceph/src/kernel fs/ceph
-$ ln -s ~/ceph/src/include/ceph_fs.h fs/ceph
-
-3) Enable CONFIG_CEPH_FS in .config. Ceph should now be the first item under File Systems -> Network File Systems.
-
-4) Build!
-
--
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/backing-dev.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h> /* generic_writepages */
-#include <linux/pagevec.h>
-#include <linux/task_io_accounting_ops.h>
-
-#include "super.h"
-#include "osd_client.h"
-
-/*
- * Ceph address space ops.
- *
- * There are a few funny things going on here.
- *
- * The page->private field is used to reference a struct
- * ceph_snap_context for _every_ dirty page. This indicates which
- * snapshot the page was logically dirtied in, and thus which snap
- * context needs to be associated with the osd write during writeback.
- *
- * Similarly, struct ceph_inode_info maintains a set of counters to
- * count dirty pages on the inode. In the absence of snapshots,
- * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
- *
- * When a snapshot is taken (that is, when the client receives
- * notification that a snapshot was taken), each inode with caps and
- * with dirty pages (dirty pages implies there is a cap) gets a new
- * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
- * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
- * moved to capsnap->dirty. (Unless a sync write is currently in
- * progress. In that case, the capsnap is said to be "pending", new
- * writes cannot start, and the capsnap isn't "finalized" until the
- * write completes (or fails) and a final size/mtime for the inode for
- * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
- *
- * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
- * we look for the first capsnap in i_cap_snaps and write out pages in
- * that snap context _only_. Then we move on to the next capsnap,
- * eventually reaching the "live" or "head" context (i.e., pages that
- * are not yet snapped) and are writing the most recently dirtied
- * pages.
- *
- * Invalidate and so forth must take care to ensure the dirty page
- * accounting is preserved.
- */
-
-
-/*
- * Dirty a page. Optimistically adjust accounting, on the assumption
- * that we won't race with invalidate. If we do, readjust.
- *
- * Takes a reference on the inode's current realm snap context, bumps
- * i_wrbuffer_ref and i_wrbuffer_ref_head under i_lock, and then, under
- * the mapping's tree_lock, tags the page dirty and stores the snap
- * context in page->private (setting PagePrivate). If the page has lost
- * its mapping by then, the counter adjustment is undone instead.
- *
- * Returns 0 if the page was already dirty. Otherwise returns 1 (or,
- * for a page with no mapping, the negated result of TestSetPageDirty).
- */
-static int ceph_set_page_dirty(struct page *page)
-{
- struct address_space *mapping = page->mapping;
- struct inode *inode;
- struct ceph_inode_info *ci;
- int undo = 0; /* set when the page lost its mapping under us */
- struct ceph_snap_context *snapc;
-
- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
-
- if (TestSetPageDirty(page)) {
- dout("%p set_page_dirty %p idx %lu -- already dirty\n",
- mapping->host, page, page->index);
- return 0;
- }
-
- inode = mapping->host;
- ci = ceph_inode(inode);
-
- /*
- * Note that we're grabbing a snapc ref here without holding
- * any locks!
- */
- snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
-
- /* dirty the head */
- spin_lock(&inode->i_lock);
- if (ci->i_wrbuffer_ref_head == 0)
- ci->i_head_snapc = ceph_get_snap_context(snapc);
- ++ci->i_wrbuffer_ref_head;
- if (ci->i_wrbuffer_ref == 0)
- igrab(inode); /* first dirty page: take an inode reference */
- ++ci->i_wrbuffer_ref;
- dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
- "snapc %p seq %lld (%d snaps)\n",
- mapping->host, page, page->index,
- ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- snapc, snapc->seq, snapc->num_snaps);
- spin_unlock(&inode->i_lock);
-
- /* now adjust page */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27)
- spin_lock_irq(&mapping->tree_lock);
-#else
- write_lock_irq(&mapping->tree_lock);
-#endif
- if (page->mapping) { /* Race with truncate? */
- WARN_ON_ONCE(!PageUptodate(page));
-
- if (mapping_cap_account_dirty(mapping)) {
- __inc_zone_page_state(page, NR_FILE_DIRTY);
- __inc_bdi_stat(mapping->backing_dev_info,
- BDI_RECLAIMABLE);
- task_io_account_write(PAGE_CACHE_SIZE);
- }
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
-
- /*
- * Reference snap context in page->private. Also set
- * PagePrivate so that we get invalidatepage callback.
- */
- page->private = (unsigned long)snapc;
- SetPagePrivate(page);
- } else {
- dout("ANON set_page_dirty %p (raced truncate?)\n", page);
- undo = 1;
- }
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27)
- spin_unlock_irq(&mapping->tree_lock);
-#else
- write_unlock_irq(&mapping->tree_lock);
-#endif
-
- if (undo)
- /* whoops, we failed to dirty the page */
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc); /* NOTE(review): on this
- * path the snapc reference taken above was never stored in
- * page->private and is not visibly dropped here -- confirm
- * whether ceph_put_wrbuffer_cap_refs() accounts for it */
-
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
- BUG_ON(!PageDirty(page));
- return 1;
-}
-
-/*
- * invalidatepage callback; only reached for pages carrying private data.
- *
- * A full-page invalidation (offset == 0) gives back one dirty-page
- * count and the snap context reference held in page->private, then
- * clears the private state. A partial invalidation only logs.
- */
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
-{
-	struct ceph_snap_context *page_snapc = (void *)page->private;
-	struct inode *inode = page->mapping->host;
-	struct ceph_inode_info *ci;
-
-	BUG_ON(!PageLocked(page));
-	BUG_ON(!page->private);
-	BUG_ON(!PagePrivate(page));
-	BUG_ON(!page->mapping);
-
-	/*
-	 * set_page_dirty can race with truncate_complete_page and leave
-	 * us a clean page here; complain so that any later accounting
-	 * trouble can be traced back to this.
-	 */
-	if (!PageDirty(page))
-		pr_err("%p invalidatepage %p page not dirty\n", inode, page);
-
-	if (offset != 0) {
-		dout("%p invalidatepage %p idx %lu partial dirty page\n",
-		     inode, page, page->index);
-		return;
-	}
-
-	/* whole page is going away */
-	ClearPageChecked(page);
-	ci = ceph_inode(inode);
-	dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
-	     inode, page, page->index, offset);
-	ceph_put_wrbuffer_cap_refs(ci, 1, page_snapc);
-	ceph_put_snap_context(page_snapc);
-	page->private = 0;
-	ClearPagePrivate(page);
-}
-
-/*
- * releasepage callback. Purely a sanity check: warn if the page is
- * still dirty or still carries private state, and always return 0.
- */
-static int ceph_releasepage(struct page *page, gfp_t g)
-{
-	struct inode *inode = NULL;
-
-	if (page->mapping)
-		inode = page->mapping->host;
-
-	dout("%p releasepage %p idx %lu\n", inode, page, page->index);
-	WARN_ON(PageDirty(page));
-	WARN_ON(page->private);
-	WARN_ON(PagePrivate(page));
-	return 0;
-}
-
-/*
- * Read a single page from the OSDs, without unlocking it.
- *
- * -ENOENT from the OSD client is treated as a zero-length read. A short
- * read has the remainder of the page zero-filled before the page is
- * marked uptodate. On any other error the page error bit is set.
- *
- * Returns 0 on success or a negative error code.
- */
-static int readpage_nounlock(struct file *filp, struct page *page)
-{
-	struct inode *inode = filp->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
-	int err = 0;
-	u64 len = PAGE_CACHE_SIZE;
-
-	dout("readpage inode %p file %p page %p index %lu\n",
-	     inode, filp, page, page->index);
-	/*
-	 * page_offset() widens the index before shifting; a bare
-	 * "index << PAGE_CACHE_SHIFT" is evaluated in unsigned long and
-	 * truncates for offsets beyond 4GB on 32-bit.
-	 */
-	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
-				  page_offset(page), &len,
-				  ci->i_truncate_seq, ci->i_truncate_size,
-				  &page, 1);
-	if (err == -ENOENT)
-		err = 0;
-	if (err < 0) {
-		SetPageError(page);
-		goto out;
-	} else if (err < PAGE_CACHE_SIZE) {
-		/* zero fill remainder of page */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
-		zero_user_segment(page, err, PAGE_CACHE_SIZE);
-#else
-		zero_user_page(page, err, PAGE_CACHE_SIZE - err, KM_USER0);
-#endif
-	}
-	SetPageUptodate(page);
-
-out:
-	return err < 0 ? err : 0;
-}
-
-/*
- * readpage callback: read the page, then unlock it whatever the result.
- */
-static int ceph_readpage(struct file *filp, struct page *page)
-{
-	int err;
-
-	err = readpage_nounlock(filp, page);
-	unlock_page(page);
-
-	return err;
-}
-
-/*
- * Build a vector of contiguous pages from the provided page list.
- *
- * The list is walked from its tail, and pages are collected for as long
- * as their indices increase by exactly one. On entry *nr_pages is the
- * capacity to allocate for; on return it is the number of contiguous
- * pages stored in the vector. The pages stay on the list.
- *
- * Returns the kmalloc'ed vector (caller must kfree it), or
- * ERR_PTR(-ENOMEM) if the allocation fails.
- */
-static struct page **page_vector_from_list(struct list_head *page_list,
-					   unsigned *nr_pages)
-{
-	struct page **pages;
-	struct page *page;
-	/*
-	 * Must be pgoff_t: with a plain int the comparison against
-	 * page->index fails for indices >= 2^31, leaving zero pages
-	 * collected and pages[0] uninitialised for the caller.
-	 */
-	pgoff_t next_index;
-	int contig_pages = 0;
-
-	/* build page vector */
-	pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	BUG_ON(list_empty(page_list));
-	next_index = list_entry(page_list->prev, struct page, lru)->index;
-	list_for_each_entry_reverse(page, page_list, lru) {
-		if (page->index == next_index) {
-			dout("readpages page %d %p\n", contig_pages, page);
-			pages[contig_pages] = page;
-			contig_pages++;
-			next_index++;
-		} else {
-			break;
-		}
-	}
-	*nr_pages = contig_pages;
-	return pages;
-}
-
-/*
- * Read multiple pages. Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
- *
- * Only the leading run of contiguous pages (taken from the tail of
- * page_list) is read, in a single OSD request. Each page that was read
- * is removed from the list, zero-filled past the end of the returned
- * data, added to the page cache, marked uptodate and unlocked.
- *
- * Returns 0 on success (including -ENOENT from the OSD client, which is
- * treated as an empty read) or a negative error code.
- */
-static int ceph_readpages(struct file *file, struct address_space *mapping,
-			  struct list_head *page_list, unsigned nr_pages)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
-	int rc = 0;
-	struct page **pages;
-	struct pagevec pvec;
-	loff_t offset;
-	u64 len;
-
-	dout("readpages %p file %p nr_pages %d\n",
-	     inode, file, nr_pages);
-
-	pages = page_vector_from_list(page_list, &nr_pages);
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
-
-	/*
-	 * guess read extent; widen before shifting so neither value is
-	 * truncated to the width of unsigned long / unsigned int
-	 */
-	offset = page_offset(pages[0]);
-	len = (u64)nr_pages << PAGE_CACHE_SHIFT;
-	rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
-				 offset, &len,
-				 ci->i_truncate_seq, ci->i_truncate_size,
-				 pages, nr_pages);
-	if (rc == -ENOENT)
-		rc = 0;
-	if (rc < 0)
-		goto out;
-
-	/* set uptodate and add to lru in pagevec-sized chunks */
-	pagevec_init(&pvec, 0);
-	for (; !list_empty(page_list) && len > 0;
-	     rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
-		struct page *page =
-			list_entry(page_list->prev, struct page, lru);
-
-		list_del(&page->lru);
-
-		if (rc < (int)PAGE_CACHE_SIZE) {
-			/* zero (remainder of) page */
-			int s = rc < 0 ? 0 : rc;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
-			zero_user_segment(page, s, PAGE_CACHE_SIZE);
-#else
-			zero_user_page(page, s, PAGE_CACHE_SIZE-s, KM_USER0);
-#endif
-		}
-
-		if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
-			page_cache_release(page);
-			dout("readpages %p add_to_page_cache failed %p\n",
-			     inode, page);
-			continue;
-		}
-		dout("readpages %p adding %p idx %lu\n", inode, page,
-		     page->index);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-		unlock_page(page);
-		if (pagevec_add(&pvec, page) == 0)
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
-			pagevec_lru_add_file(&pvec);   /* add to lru */
-#else
-			pagevec_lru_add(&pvec);   /* add to lru */
-#endif
-	}
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
-	pagevec_lru_add_file(&pvec);
-#else
-	pagevec_lru_add(&pvec);
-#endif
-	rc = 0;
-
-out:
-	kfree(pages);
-	return rc;
-}
-
-/*
- * Return a referenced snap context for the oldest dirty data on this
- * inode, i.e. the only context we are currently allowed to write back.
- *
- * The first cap_snap with dirty pages wins, and its size is reported
- * through @snap_size when that pointer is non-NULL. If none qualifies
- * and the inode has a snap realm, the realm's cached context is used.
- *
- * Caller holds i_lock.
- */
-static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
-						       u64 *snap_size)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_snap_context *oldest = NULL;
-	struct ceph_cap_snap *cs;
-
-	list_for_each_entry(cs, &ci->i_cap_snaps, ci_item) {
-		dout(" cap_snap %p snapc %p has %d dirty pages\n", cs,
-		     cs->context, cs->dirty_pages);
-		if (!cs->dirty_pages)
-			continue;
-
-		oldest = ceph_get_snap_context(cs->context);
-		if (snap_size)
-			*snap_size = cs->size;
-		break;
-	}
-
-	if (oldest)
-		return oldest;
-
-	/* nothing snapped is dirty; fall back to the head context */
-	if (ci->i_snap_realm) {
-		oldest = ceph_get_snap_context(ci->i_snap_realm->cached_context);
-		dout(" head snapc %p has %d dirty pages\n",
-		     oldest, ci->i_wrbuffer_ref_head);
-	}
-	return oldest;
-}
-
-/*
- * Locked wrapper: take i_lock around __get_oldest_context().
- */
-static struct ceph_snap_context *get_oldest_context(struct inode *inode,
-						     u64 *snap_size)
-{
-	struct ceph_snap_context *oldest;
-
-	spin_lock(&inode->i_lock);
-	oldest = __get_oldest_context(inode, snap_size);
-	spin_unlock(&inode->i_lock);
-
-	return oldest;
-}
-
-/*
- * Write a single page, but leave the page locked.
- *
- * The page is only written if its snap context (page->private) is the
- * oldest context with dirty data; otherwise this is a no-op returning 0.
- *
- * If we get a write error, set the page error bit, but still adjust the
- * dirty page accounting (i.e., page is no longer dirty).
- *
- * @wbc may be NULL; when present, pages_skipped is bumped on error.
- *
- * Returns 0 on success or no-op, -EFAULT if the page has no mapping,
- * or the negative error from the OSD write.
- */
-static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
-{
-	struct inode *inode;
-	struct ceph_inode_info *ci;
-	struct ceph_osd_client *osdc;
-	/* page_offset() widens before shifting (no truncation on 32-bit) */
-	loff_t page_off = page_offset(page);
-	int len = PAGE_CACHE_SIZE;
-	loff_t i_size;
-	int err = 0;
-	struct ceph_snap_context *snapc, *oldest;
-	u64 snap_size = 0;
-
-	dout("writepage %p idx %lu\n", page, page->index);
-
-	if (!page->mapping || !page->mapping->host) {
-		dout("writepage %p - no mapping\n", page);
-		return -EFAULT;
-	}
-	inode = page->mapping->host;
-	ci = ceph_inode(inode);
-	osdc = &ceph_inode_to_client(inode)->osdc;
-
-	/* verify this is a writeable snap context */
-	snapc = (void *)page->private;
-	if (snapc == NULL) {
-		dout("writepage %p page %p not dirty?\n", inode, page);
-		goto out;
-	}
-	/*
-	 * get_oldest_context() returns its own reference, needed only for
-	 * the comparison; drop it on both outcomes. The page keeps its
-	 * own reference in page->private, so snapc stays valid.
-	 */
-	oldest = get_oldest_context(inode, &snap_size);
-	if (snapc != oldest) {
-		dout("writepage %p page %p snapc %p not writeable - noop\n",
-		     inode, page, (void *)page->private);
-		/* we should only noop if called by kswapd */
-		WARN_ON((current->flags & PF_MEMALLOC) == 0);
-		ceph_put_snap_context(oldest);
-		goto out;
-	}
-	ceph_put_snap_context(oldest);
-
-	/* is this a partial page at end of file? */
-	if (snap_size)
-		i_size = snap_size;
-	else
-		i_size = i_size_read(inode);
-	if (i_size < page_off + len)
-		len = i_size - page_off;
-
-	dout("writepage %p page %p index %lu on %llu~%u\n",
-	     inode, page, page->index, page_off, len);
-
-	set_page_writeback(page);
-	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
-				   &ci->i_layout, snapc,
-				   page_off, len,
-				   ci->i_truncate_seq, ci->i_truncate_size,
-				   &inode->i_mtime,
-				   &page, 1, 0, 0, true);
-	if (err < 0) {
-		dout("writepage setting page/mapping error %d %p\n", err, page);
-		SetPageError(page);
-		mapping_set_error(&inode->i_data, err);
-		if (wbc)
-			wbc->pages_skipped++;
-	} else {
-		dout("writepage cleaned page %p\n", page);
-		err = 0;  /* vfs expects us to return 0 */
-	}
-	/* the page is clean now (even on error): drop its private state */
-	page->private = 0;
-	ClearPagePrivate(page);
-	end_page_writeback(page);
-	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
-	ceph_put_snap_context(snapc);
-out:
-	return err;
-}
-
-/*
- * writepage callback: write the page out, then unlock it regardless of
- * the outcome.
- */
-static int ceph_writepage(struct page *page, struct writeback_control *wbc)
-{
-	int result;
-
-	result = writepage_nounlock(page, wbc);
-	unlock_page(page);
-
-	return result;
-}
-
-
-/*
- * Stand-in for release_pages(), which is not exported to modules:
- * feed the pages through a pagevec and release it whenever it fills
- * up, plus once more at the end for the remainder.
- */
-static void ceph_release_pages(struct page **pages, int num)
-{
-	struct pagevec batch;
-	int idx = 0;
-
-	pagevec_init(&batch, 0);
-	while (idx < num) {
-		/* pagevec_add() returns 0 once no slots remain */
-		if (!pagevec_add(&batch, pages[idx]))
-			pagevec_release(&batch);
-		idx++;
-	}
-	pagevec_release(&batch);
-}
-
-
-/*
- * async writeback completion handler.
- *
- * Parses the OSD reply, then for every page in the request clears the
- * private snap context, ends writeback and unlocks it. Finally drops
- * the dirty-page counts, the page references, the page vector and the
- * request itself.
- *
- * If we get an error, set the mapping error bit, but not the individual
- * page error bits.
- *
- * The writeback_control recorded in req->r_wbc is deliberately NOT
- * touched here: it belongs to the ->writepages caller and may no longer
- * be valid by the time this completion runs.
- */
-static void writepages_finish(struct ceph_osd_request *req,
-			      struct ceph_msg *msg)
-{
-	struct inode *inode = req->r_inode;
-	struct ceph_osd_reply_head *replyhead;
-	struct ceph_osd_op *op;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	unsigned wrote;
-	/* page_offset() widens before shifting (no truncation on 32-bit) */
-	loff_t offset = page_offset(req->r_pages[0]);
-	struct page *page;
-	int i;
-	struct ceph_snap_context *snapc = req->r_snapc;
-	struct address_space *mapping = inode->i_mapping;
-	__s32 rc = -EIO;
-	u64 bytes = 0;
-
-	/* parse reply */
-	replyhead = msg->front.iov_base;
-	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
-	op = (void *)(replyhead + 1);
-	rc = le32_to_cpu(replyhead->result);
-	bytes = le64_to_cpu(op->extent.length);
-
-	if (rc >= 0) {
-		/* round the byte count up to whole pages */
-		wrote = (bytes + (offset & ~PAGE_CACHE_MASK) + ~PAGE_CACHE_MASK)
-			>> PAGE_CACHE_SHIFT;
-		WARN_ON(wrote != req->r_num_pages);
-	} else {
-		wrote = 0;
-		mapping_set_error(mapping, rc);
-	}
-	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
-	     inode, rc, bytes, wrote);
-
-	/* clean all pages */
-	for (i = 0; i < req->r_num_pages; i++) {
-		page = req->r_pages[i];
-		BUG_ON(!page);
-		WARN_ON(!PageUptodate(page));
-
-		if (i >= wrote)
-			dout("inode %p skipping page %p\n", inode, page);
-
-		page->private = 0;
-		ClearPagePrivate(page);
-		ceph_put_snap_context(snapc);
-		dout("unlocking %d %p\n", i, page);
-		end_page_writeback(page);
-		unlock_page(page);
-	}
-	dout("%p wrote+cleaned %d pages\n", inode, wrote);
-	ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
-
-	ceph_release_pages(req->r_pages, req->r_num_pages);
-	if (req->r_pages_from_pool)
-		mempool_free(req->r_pages,
-			     ceph_client(inode->i_sb)->wb_pagevec_pool);
-	else
-		kfree(req->r_pages);
-	ceph_osdc_put_request(req);
-}
-
-/*
- * Allocate the page vector for a write request. kmalloc is tried first
- * because req->r_num_pages may be less than the maximum write size;
- * the mempool is used only if that fails, in which case
- * r_pages_from_pool is set so the vector can be freed the same way.
- */
-static void alloc_page_vec(struct ceph_client *client,
-			   struct ceph_osd_request *req)
-{
-	size_t vec_bytes = sizeof(struct page *) * req->r_num_pages;
-
-	req->r_pages = kmalloc(vec_bytes, GFP_NOFS);
-	if (req->r_pages)
-		return;
-
-	/* direct allocation failed; fall back to the mempool */
-	req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
-	req->r_pages_from_pool = 1;
-	WARN_ON(!req->r_pages);
-}
-
-/*
- * initiate async writeback
- *
- * Outline of the visible logic:
- *  - refuse with -EIO on a forced umount, and bail out early (returning
- *    0) when the bdi is congested and the caller asked for nonblocking;
- *  - pick the write size limit from i_blkbits and the wsize mount
- *    option, never below one page;
- *  - find the oldest snap context with dirty data and only gather pages
- *    whose page->private matches it;
- *  - gather runs of consecutive dirty pages (locking the first page
- *    unconditionally and later ones with trylock), attach them to one
- *    OSD write request per run, and start that request asynchronously
- *    with writepages_finish as the completion callback;
- *  - after each submitted run the oldest snap context is looked up
- *    again (goto retry), and the scan restarts from the original start
- *    index if it changed.
- *
- * Returns rc, which is 0 on every path except the forced-umount case.
- */
-static int ceph_writepages_start(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct inode *inode = mapping->host;
- struct backing_dev_info *bdi = mapping->backing_dev_info;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_client *client = ceph_inode_to_client(inode);
- pgoff_t index, start, end;
- int range_whole = 0;
- int should_loop = 1;
- pgoff_t max_pages = 0, max_pages_ever = 0;
- struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
- struct pagevec pvec;
- int done = 0;
- int rc = 0; /* never assigned again in this function */
- unsigned wsize = 1 << inode->i_blkbits;
- struct ceph_osd_request *req = NULL;
- int do_sync;
- u64 snap_size = 0;
-
- /*
- * Include a 'sync' in the OSD request if this is a data
- * integrity write (e.g., O_SYNC write or fsync()), or if our
- * cap is being revoked.
- */
- do_sync = wbc->sync_mode == WB_SYNC_ALL;
- if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
- do_sync = 1;
- dout("writepages_start %p dosync=%d (mode=%s)\n",
- inode, do_sync,
- wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
- (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
-
- client = ceph_inode_to_client(inode); /* same value as the initializer above */
- if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
- pr_warning("writepage_start %p on forced umount\n", inode);
- return -EIO; /* we're in a forced umount, don't write! */
- }
- if (client->mount_args.wsize && client->mount_args.wsize < wsize)
- wsize = client->mount_args.wsize;
- if (wsize < PAGE_CACHE_SIZE)
- wsize = PAGE_CACHE_SIZE;
- max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
-
- pagevec_init(&pvec, 0);
-
- /* ?? */
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
- dout(" writepages congested\n");
- wbc->encountered_congestion = 1;
- goto out_final;
- }
-
- /* where to start/end? */
- if (wbc->range_cyclic) {
- start = mapping->writeback_index; /* Start from prev offset */
- end = -1;
- dout(" cyclic, start at %lu\n", start);
- } else {
- start = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
- if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = 1;
- should_loop = 0;
- dout(" not cyclic, %lu to %lu\n", start, end);
- }
- index = start;
-
-retry:
- /* find oldest snap context with dirty data */
- ceph_put_snap_context(snapc); /* snapc is NULL on the first pass */
- snapc = get_oldest_context(inode, &snap_size);
- if (!snapc) {
- /* hmm, why does writepages get called when there
- is no dirty data? */
- dout(" no snap context with dirty data?\n");
- goto out;
- }
- dout(" oldest snapc is %p seq %lld (%d snaps)\n",
- snapc, snapc->seq, snapc->num_snaps);
- if (last_snapc && snapc != last_snapc) {
- /* if we switched to a newer snapc, restart our scan at the
- * start of the original file range. */
- dout(" snapc differs from last pass, restarting at %lu\n",
- index);
- index = start;
- }
- last_snapc = snapc;
-
- while (!done && index <= end) {
- unsigned i;
- int first;
- pgoff_t next;
- int pvec_pages, locked_pages;
- struct page *page;
- int want;
- u64 offset, len;
- struct ceph_osd_request_head *reqhead;
- struct ceph_osd_op *op;
-
- next = 0;
- locked_pages = 0;
- max_pages = max_pages_ever;
-
-get_more_pages:
- first = -1;
- want = min(end - index,
- min((pgoff_t)PAGEVEC_SIZE,
- max_pages - (pgoff_t)locked_pages) - 1)
- + 1;
- pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- want);
- dout("pagevec_lookup_tag got %d\n", pvec_pages);
- if (!pvec_pages && !locked_pages)
- break;
- for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
- page = pvec.pages[i];
- dout("? %p idx %lu\n", page, page->index);
- if (locked_pages == 0)
- lock_page(page); /* first page */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27)
- else if (!trylock_page(page))
-#else
- else if (TestSetPageLocked(page))
-#endif
- break;
-
- /* only dirty pages, or our accounting breaks */
- if (unlikely(!PageDirty(page)) ||
- unlikely(page->mapping != mapping)) {
- dout("!dirty or !mapping %p\n", page);
- unlock_page(page);
- break;
- }
- if (!wbc->range_cyclic && page->index > end) {
- dout("end of range %p\n", page);
- done = 1;
- unlock_page(page);
- break;
- }
- if (next && (page->index != next)) {
- dout("not consecutive %p\n", page);
- unlock_page(page);
- break;
- }
- if (wbc->sync_mode != WB_SYNC_NONE) {
- dout("waiting on writeback %p\n", page);
- wait_on_page_writeback(page);
- }
- if ((snap_size && page_offset(page) > snap_size) ||
- (!snap_size &&
- page_offset(page) > i_size_read(inode))) {
- dout("%p page eof %llu\n", page, snap_size ?
- snap_size : i_size_read(inode));
- done = 1;
- unlock_page(page);
- break;
- }
- if (PageWriteback(page)) {
- dout("%p under writeback\n", page);
- unlock_page(page);
- break;
- }
-
- /* only if matching snap context */
- if (snapc != (void *)page->private) {
- dout("page snapc %p != oldest %p\n",
- (void *)page->private, snapc);
- unlock_page(page);
- if (!locked_pages)
- continue; /* keep looking for snap */
- break;
- }
-
- if (!clear_page_dirty_for_io(page)) {
- dout("%p !clear_page_dirty_for_io\n", page);
- unlock_page(page);
- break;
- }
-
- /* ok */
- if (locked_pages == 0) {
- /* prepare async write request */
- offset = page->index << PAGE_CACHE_SHIFT;
- len = wsize;
- req = ceph_osdc_new_request(&client->osdc,
- &ci->i_layout,
- ceph_vino(inode),
- offset, &len,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
- snapc, do_sync,
- ci->i_truncate_seq,
- ci->i_truncate_size,
- &inode->i_mtime, true, 1);
- max_pages = req->r_num_pages; /* NOTE(review): req is used
- * without checking ceph_osdc_new_request() for failure --
- * confirm it cannot return NULL or an ERR_PTR */
-
- alloc_page_vec(client, req);
- req->r_callback = writepages_finish;
- req->r_inode = inode;
- req->r_wbc = wbc; /* NOTE(review): wbc belongs to our caller;
- * confirm it outlives the async completion */
- }
-
- /* note position of first page in pvec */
- if (first < 0)
- first = i;
- dout("%p will write page %p idx %lu\n",
- inode, page, page->index);
- set_page_writeback(page);
- req->r_pages[locked_pages] = page;
- locked_pages++;
- next = page->index + 1;
- }
-
- /* did we get anything? */
- if (!locked_pages)
- goto release_pvec_pages;
- if (i) {
- int j;
- BUG_ON(!locked_pages || first < 0);
-
- if (pvec_pages && i == pvec_pages &&
- locked_pages < max_pages) {
- dout("reached end pvec, trying for more\n");
- pagevec_reinit(&pvec);
- goto get_more_pages;
- }
-
- /* shift unused pages over in the pvec... we
- * will need to release them below. */
- for (j = i; j < pvec_pages; j++) {
- dout(" pvec leftover page %p\n",
- pvec.pages[j]);
- pvec.pages[j-i+first] = pvec.pages[j];
- }
- pvec.nr -= i-first;
- }
-
- /* submit the write */
- offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
- len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
- (u64)locked_pages << PAGE_CACHE_SHIFT);
- dout("writepages got %d pages at %llu~%llu\n",
- locked_pages, offset, len);
-
- /* revise final length, page count */
- req->r_num_pages = locked_pages;
- reqhead = req->r_request->front.iov_base;
- op = (void *)(reqhead + 1);
- op->extent.length = cpu_to_le64(len);
- op->payload_len = cpu_to_le32(len);
- req->r_request->hdr.data_len = cpu_to_le32(len);
-
- ceph_osdc_start_request(&client->osdc, req, true);
- req = NULL; /* so the put under "out" below skips it */
-
- /* continue? */
- index = next;
- wbc->nr_to_write -= locked_pages;
- if (wbc->nr_to_write <= 0)
- done = 1;
-
-release_pvec_pages:
- dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
- pvec.nr ? pvec.pages[0] : NULL);
- pagevec_release(&pvec);
-
- if (locked_pages && !done)
- goto retry;
- }
-
- if (should_loop && !done) {
- /* more to do; loop back to beginning of file */
- dout("writepages looping back to beginning of file\n");
- should_loop = 0;
- index = 0;
- goto retry;
- }
-
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- mapping->writeback_index = index;
-
-out:
- if (req)
- ceph_osdc_put_request(req);
- if (rc > 0)
- rc = 0; /* vfs expects us to return 0 */
- ceph_put_snap_context(snapc);
- dout("writepages done, rc = %d\n", rc);
-out_final:
- return rc;
-}
-
-
-
-/*
- * See if a given @snapc is either writeable, or already written.
- *
- * True when the inode has no oldest dirty context at all, or when
- * @snapc is not newer (by seq) than the oldest dirty context.
- *
- * Used as a wait_event condition, so it can run many times; the
- * reference returned by get_oldest_context() must be dropped on every
- * call or it leaks.
- */
-static int context_is_writeable_or_written(struct inode *inode,
-					   struct ceph_snap_context *snapc)
-{
-	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
-	int ret = !oldest || snapc->seq <= oldest->seq;
-
-	ceph_put_snap_context(oldest);
-	return ret;
-}
-
-/*
- * We are only allowed to write into/dirty the page if the page is
- * clean, or already dirty within the same snap context.
- *
- * If the page is dirty in an older snap context that is the oldest
- * dirty context, it is written out synchronously first. If it is not
- * yet the oldest, writeback is queued and we wait for the context to
- * become writeable or written, then start over with a fresh page
- * lookup.
- *
- * On success (0) the page is returned in *pagep locked and referenced,
- * and mdsc->snap_rwsem is held for read; ceph_write_end() releases all
- * three. On failure the page is unlocked, its reference is dropped and
- * snap_rwsem is not held.
- *
- * Returns 0, -ENOMEM if no page could be obtained, -EINVAL if the file
- * would exceed s_maxbytes, -ERESTARTSYS if interrupted while waiting,
- * or the error from the read/write of the page.
- */
-static int ceph_write_begin(struct file *file, struct address_space *mapping,
-			    loff_t pos, unsigned len, unsigned flags,
-			    struct page **pagep, void **fsdata)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
-	struct page *page;
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	loff_t page_off = pos & PAGE_CACHE_MASK;
-	int pos_in_page = pos & ~PAGE_CACHE_MASK;
-	int end_in_page = pos_in_page + len;
-	loff_t i_size;
-	struct ceph_snap_context *snapc, *oldest;
-	int r;
-
-	/* get a page*/
-retry:
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 29)
-	page = grab_cache_page_write_begin(mapping, index, 0);
-#else
-	page = __grab_cache_page(mapping, index);
-#endif
-	if (!page)
-		return -ENOMEM;
-	*pagep = page;
-
-	dout("write_begin file %p inode %p page %p %d~%d\n", file,
-	     inode, page, (int)pos, (int)len);
-
-retry_locked:
-	/* writepages currently holds page lock, but if we change that later, */
-	wait_on_page_writeback(page);
-
-	/* check snap context */
-	BUG_ON(!ci->i_snap_realm);
-	down_read(&mdsc->snap_rwsem);
-	BUG_ON(!ci->i_snap_realm->cached_context);
-	if (page->private &&
-	    (void *)page->private != ci->i_snap_realm->cached_context) {
-		/*
-		 * this page is already dirty in another (older) snap
-		 * context!  is it writeable now?
-		 */
-		oldest = get_oldest_context(inode, NULL);
-		up_read(&mdsc->snap_rwsem);
-
-		if (oldest != (void *)page->private) {
-			/* only needed for the comparison; drop its ref */
-			ceph_put_snap_context(oldest);
-			dout(" page %p snapc %p not current or oldest\n",
-			     page, (void *)page->private);
-			/*
-			 * queue for writeback, and wait for snapc to
-			 * be writeable or written
-			 */
-			snapc = ceph_get_snap_context((void *)page->private);
-			unlock_page(page);
-			/* retry grabs the page again; drop this reference */
-			page_cache_release(page);
-			if (ceph_queue_writeback(inode))
-				igrab(inode);
-			r = wait_event_interruptible(ci->i_cap_wq,
-			      context_is_writeable_or_written(inode, snapc));
-			ceph_put_snap_context(snapc);
-			/* don't spin on retry with a signal pending */
-			if (r == -ERESTARTSYS)
-				return r;
-			goto retry;
-		}
-
-		/* yay, writeable, do it now (without dropping page lock) */
-		dout(" page %p snapc %p not current, but oldest\n",
-		     page, oldest);
-		/* the page holds its own ref in page->private */
-		ceph_put_snap_context(oldest);
-		if (!clear_page_dirty_for_io(page))
-			goto retry_locked;
-		r = writepage_nounlock(page, NULL);
-		if (r < 0)
-			goto fail_nosnap;
-		goto retry_locked;
-	}
-
-	if (PageUptodate(page)) {
-		dout(" page %p already uptodate\n", page);
-		return 0;
-	}
-
-	/* full page? */
-	if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
-		return 0;
-
-	/* past end of file? */
-	i_size = inode->i_size;   /* caller holds i_mutex */
-
-	if (i_size + len > inode->i_sb->s_maxbytes) {
-		/* file is too big */
-		r = -EINVAL;
-		goto fail;
-	}
-
-	if (page_off >= i_size ||
-	    (pos_in_page == 0 && (pos+len) >= i_size &&
-	     end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
-		dout(" zeroing %p 0 - %d and %d - %d\n",
-		     page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27)
-		zero_user_segments(page,
-				   0, pos_in_page,
-				   end_in_page, PAGE_CACHE_SIZE);
-#else
-		simple_prepare_write(file, page, pos_in_page, end_in_page);
-#endif
-		return 0;
-	}
-
-	/* we need to read it. */
-	up_read(&mdsc->snap_rwsem);
-	r = readpage_nounlock(file, page);
-	if (r < 0)
-		goto fail_nosnap;
-	goto retry_locked;
-
-fail:
-	up_read(&mdsc->snap_rwsem);
-fail_nosnap:
-	unlock_page(page);
-	/* write_end is not called on failure, so drop the page ref here */
-	page_cache_release(page);
-	return r;
-}
-
-/*
- * we don't do anything in here that simple_write_end doesn't do
- * except adjust dirty page accounting and drop read lock on
- * mdsc->snap_rwsem.
- *
- * Zeroes the uncopied tail after a short copy, grows the inode size if
- * the write extended the file, marks the page uptodate and dirty, then
- * unlocks and releases the page and drops the snap_rwsem read lock
- * taken in ceph_write_begin().
- *
- * Returns the number of bytes copied.
- */
-static int ceph_write_end(struct file *file, struct address_space *mapping,
-			  loff_t pos, unsigned len, unsigned copied,
-			  struct page *page, void *fsdata)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
-	int check_cap = 0;
-
-	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
-	     inode, page, (int)pos, (int)copied, (int)len);
-
-	/* zero the stale part of the page if we did a short copy */
-	if (copied < len) {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
-		/*
-		 * zero_user_segment() takes (start, end) offsets within
-		 * the page, so the end is from+len, not len.
-		 */
-		zero_user_segment(page, from+copied, from+len);
-#else
-		zero_user_page(page, from+copied, len-copied, KM_USER0);
-#endif
-	}
-
-	/* did file size increase? */
-	/* (no need for i_size_read(); the caller holds i_mutex) */
-	if (pos+copied > inode->i_size)
-		check_cap = ceph_inode_set_size(inode, pos+copied);
-
-	if (!PageUptodate(page))
-		SetPageUptodate(page);
-
-	set_page_dirty(page);
-
-	unlock_page(page);
-	up_read(&mdsc->snap_rwsem);
-	page_cache_release(page);
-
-	if (check_cap)
-		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
-
-	return copied;
-}
-
-/*
- * Placeholder .direct_IO method. It exists only so that direct I/O is
- * advertised as supported; O_DIRECT reads and writes are intercepted
- * earlier, so reaching this function is unexpected. Warn and fail.
- */
-static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
-			      const struct iovec *iov,
-			      loff_t pos, unsigned long nr_segs)
-{
-	const ssize_t unsupported = -EINVAL;
-
-	WARN_ON(1);
-	return unsupported;
-}
-
-/*
- * Address space operations table wiring the page cache callbacks to
- * the ceph_* implementations in this file.  .direct_IO points at a
- * stub (ceph_direct_io) that only warns and returns -EINVAL.
- */
-const struct address_space_operations ceph_aops = {
- .readpage = ceph_readpage,
- .readpages = ceph_readpages,
- .writepage = ceph_writepage,
- .writepages = ceph_writepages_start,
- .write_begin = ceph_write_begin,
- .write_end = ceph_write_end,
- .set_page_dirty = ceph_set_page_dirty,
- .invalidatepage = ceph_invalidatepage,
- .releasepage = ceph_releasepage,
- .direct_IO = ceph_direct_io,
-};
-
-
-/*
- * vm ops
- */
-
-/*
- * A read-only mapped page is about to become writeable.
- *
- * Reuse write_begin here for simplicity: it is called for the part of
- * the page that lies inside i_size (the whole page, or the partial
- * tail page).
- *
- * On kernels >= 2.6.30 we return VM_FAULT_LOCKED with the page still
- * locked and dirtied on success, or VM_FAULT_SIGBUS if write_begin
- * failed.  On older kernels we finish with ceph_write_end and return
- * the write_begin result.
- */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 30)
-static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-#else
-static int ceph_page_mkwrite(struct vm_area_struct *vma, struct page *page)
-#endif
-{
-	struct inode *inode = vma->vm_file->f_dentry->d_inode;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 30)
-	struct page *page = vmf->page;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
-#endif
-	/*
-	 * Widen the page index BEFORE shifting; page->index is only an
-	 * unsigned long, so on 32-bit the shift would otherwise wrap for
-	 * file offsets at or beyond 4GB.
-	 */
-	loff_t off = (loff_t)page->index << PAGE_CACHE_SHIFT;
-	loff_t size, len;
-	struct page *locked_page = NULL;
-	void *fsdata = NULL;
-	int ret;
-
-	size = i_size_read(inode);
-	if (off + PAGE_CACHE_SIZE <= size)
-		len = PAGE_CACHE_SIZE;
-	else
-		len = size & ~PAGE_CACHE_MASK;	/* partial last page */
-
-	dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
-	     off, len, page, page->index);
-	ret = ceph_write_begin(vma->vm_file, inode->i_mapping, off, len, 0,
-			       &locked_page, &fsdata);
-	WARN_ON(page != locked_page);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 30)
-	if (!ret) {
-		/*
-		 * doing the following, instead of calling
-		 * ceph_write_end. Note that we keep the
-		 * page locked
-		 */
-		set_page_dirty(page);
-		up_read(&mdsc->snap_rwsem);
-		page_cache_release(page);
-		ret = VM_FAULT_LOCKED;
-	} else {
-		ret = VM_FAULT_SIGBUS;
-	}
-#else
-	if (!ret)
-		ceph_write_end(vma->vm_file, inode->i_mapping, off, len, len,
-			       locked_page, fsdata);
-#endif
-	dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
-	return ret;
-}
-
-/*
- * VM operations installed by ceph_mmap: faults use the generic
- * filemap_fault, write-enabling a page goes through ceph_page_mkwrite.
- */
-static struct vm_operations_struct ceph_vmops = {
- .fault = filemap_fault,
- .page_mkwrite = ceph_page_mkwrite,
-};
-
-/*
- * mmap handler: install ceph_vmops on the vma.
- *
- * Returns -ENOEXEC if the mapping has no readpage operation,
- * otherwise marks the file accessed and returns 0.
- */
-int ceph_mmap(struct file *file, struct vm_area_struct *vma)
-{
- struct address_space *mapping = file->f_mapping;
-
- if (!mapping->a_ops->readpage)
- return -ENOEXEC;
- file_accessed(file);
- vma->vm_ops = &ceph_vmops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
- return 0;
-}
+++ /dev/null
-
-#include "ceph_debug.h"
-#include "buffer.h"
-
-/*
- * Allocate an empty ceph_buffer header holding a single reference.
- * No data area is attached yet (iov_base NULL, lengths zero).
- * Returns NULL if the header allocation fails.
- */
-struct ceph_buffer *ceph_buffer_new(gfp_t gfp)
-{
-	struct ceph_buffer *buf = kmalloc(sizeof(*buf), gfp);
-
-	if (buf) {
-		atomic_set(&buf->nref, 1);
-		buf->vec.iov_base = NULL;
-		buf->vec.iov_len = 0;
-		buf->alloc_len = 0;
-	}
-	return buf;
-}
-
-/*
- * Attach a data area of @len bytes to @b.
- *
- * kmalloc is tried first (with allocation-failure warnings
- * suppressed); if it fails we fall back to __vmalloc and remember
- * that in b->is_vmalloc so the right free routine is used later.
- *
- * Returns 0 on success, -ENOMEM if both allocators fail.
- */
-int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
-{
-	void *data = kmalloc(len, gfp | __GFP_NOWARN);
-	bool used_vmalloc = false;
-
-	if (!data) {
-		data = __vmalloc(len, gfp, PAGE_KERNEL);
-		used_vmalloc = true;
-	}
-
-	b->vec.iov_base = data;
-	b->is_vmalloc = used_vmalloc;
-	if (!data)
-		return -ENOMEM;
-
-	b->alloc_len = len;
-	b->vec.iov_len = len;
-	return 0;
-}
-
+++ /dev/null
-#ifndef __FS_CEPH_BUFFER_H
-#define __FS_CEPH_BUFFER_H
-
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/types.h>
-#include <linux/uio.h>
-
-/*
- * a simple reference counted buffer.
- *
- * The data area is obtained with kmalloc when that succeeds, and with
- * vmalloc as a fallback when it does not (see ceph_buffer_alloc);
- * is_vmalloc records which one was used so ceph_buffer_put can free
- * it with the matching routine.
- */
-struct ceph_buffer {
- /* reference count; starts at 1, buffer is freed when it hits 0 */
- atomic_t nref;
- /* data pointer and current length */
- struct kvec vec;
- /* number of bytes allocated for vec.iov_base */
- size_t alloc_len;
- /* true if vec.iov_base came from vmalloc rather than kmalloc */
- bool is_vmalloc;
-};
-
-struct ceph_buffer *ceph_buffer_new(gfp_t gfp);
-int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp);
-
-/*
- * Take an additional reference on @b and return it.  @b must not be
- * NULL (it is dereferenced unconditionally).
- */
-static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
-{
- atomic_inc(&b->nref);
- return b;
-}
-
-/*
- * Drop one reference on @b (NULL is tolerated).  When the last
- * reference goes away, free the data area with the allocator that
- * produced it, then free the header itself.
- */
-static inline void ceph_buffer_put(struct ceph_buffer *b)
-{
-	if (!b || !atomic_dec_and_test(&b->nref))
-		return;
-
-	if (b->vec.iov_base) {
-		if (b->is_vmalloc)
-			vfree(b->vec.iov_base);
-		else
-			kfree(b->vec.iov_base);
-	}
-	kfree(b);
-}
-
-/*
- * Convenience wrapper: create a buffer header and attach @len bytes
- * of data in one step.  Returns NULL if either allocation fails; a
- * partially built buffer is released before returning.
- */
-static inline struct ceph_buffer *ceph_buffer_new_alloc(int len, gfp_t gfp)
-{
-	struct ceph_buffer *buf = ceph_buffer_new(gfp);
-
-	if (!buf)
-		return NULL;
-
-	if (ceph_buffer_alloc(buf, len, gfp) < 0) {
-		ceph_buffer_put(buf);
-		return NULL;
-	}
-	return buf;
-}
-
-#endif
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/vmalloc.h>
-#include <linux/wait.h>
-
-#include "super.h"
-#include "decode.h"
-#include "messenger.h"
-
-/*
- * Capability management
- *
- * The Ceph metadata servers control client access to inode metadata
- * and file data by issuing capabilities, granting clients permission
- * to read and/or write both inode fields and file data to OSDs
- * (storage nodes). Each capability consists of a set of bits
- * indicating which operations are allowed.
- *
- * If the client holds a *_SHARED cap, the client has a coherent value
- * that can be safely read from the cached inode.
- *
- * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
- * client is allowed to change inode attributes (e.g., file size,
- * mtime), note its dirty state in the ceph_cap, and asynchronously
- * flush that metadata change to the MDS.
- *
- * In the event of a conflicting operation (perhaps by another
- * client), the MDS will revoke the conflicting client capabilities.
- *
- * In order for a client to cache an inode, it must hold a capability
- * with at least one MDS server. When inodes are released, release
- * notifications are batched and periodically sent en masse to the MDS
- * cluster to release server state.
- */
-
-
-/*
- * Generate readable cap strings for debugging output.
- *
- * Strings are built in a small ring of static buffers so several
- * results can be used in the same debug statement.  last_cap_str is
- * the index of the next slot to hand out and is protected by
- * cap_str_lock; a slot is reused after MAX_CAP_STR further calls.
- */
-#define MAX_CAP_STR 20
-static char cap_str[MAX_CAP_STR][40];
-static DEFINE_SPINLOCK(cap_str_lock);
-static int last_cap_str;
-
-/*
- * Append one letter per generic cap bit set in @c to the buffer at
- * @s, in the fixed order s, x, c, r, w, b, l.  Returns the position
- * just past the last character written; no terminator is added.
- */
-static char *gcap_string(char *s, int c)
-{
-	static const struct {
-		int bit;
-		char letter;
-	} generic_caps[] = {
-		{ CEPH_CAP_GSHARED, 's' },
-		{ CEPH_CAP_GEXCL,   'x' },
-		{ CEPH_CAP_GCACHE,  'c' },
-		{ CEPH_CAP_GRD,     'r' },
-		{ CEPH_CAP_GWR,     'w' },
-		{ CEPH_CAP_GBUFFER, 'b' },
-		{ CEPH_CAP_GLAZYIO, 'l' },
-	};
-	unsigned int k;
-
-	for (k = 0; k < ARRAY_SIZE(generic_caps); k++) {
-		if (c & generic_caps[k].bit)
-			*s++ = generic_caps[k].letter;
-	}
-	return s;
-}
-
-/*
- * Render the cap bitmask @caps as a short human-readable string.
- *
- * Output is 'p' for the pin cap, followed by a section letter
- * (A, L, X, F) and the generic cap letters for each non-empty
- * section; "-" is produced when nothing is set.
- *
- * The result lives in one of MAX_CAP_STR rotating static buffers, so
- * it is only valid until that slot is handed out again.
- */
-const char *ceph_cap_string(int caps)
-{
- int i;
- char *s;
- int c;
-
- /* claim the next slot in the ring; only the index is locked */
- spin_lock(&cap_str_lock);
- i = last_cap_str++;
- if (last_cap_str == MAX_CAP_STR)
- last_cap_str = 0;
- spin_unlock(&cap_str_lock);
-
- s = cap_str[i];
-
- if (caps & CEPH_CAP_PIN)
- *s++ = 'p';
-
- /* the auth, link and xattr sections are masked to two bits each */
- c = (caps >> CEPH_CAP_SAUTH) & 3;
- if (c) {
- *s++ = 'A';
- s = gcap_string(s, c);
- }
-
- c = (caps >> CEPH_CAP_SLINK) & 3;
- if (c) {
- *s++ = 'L';
- s = gcap_string(s, c);
- }
-
- c = (caps >> CEPH_CAP_SXATTR) & 3;
- if (c) {
- *s++ = 'X';
- s = gcap_string(s, c);
- }
-
- /* the file section takes all remaining high bits (no mask) */
- c = caps >> CEPH_CAP_SFILE;
- if (c) {
- *s++ = 'F';
- s = gcap_string(s, c);
- }
-
- /* nothing was written: emit a placeholder */
- if (s == cap_str[i])
- *s++ = '-';
- *s = 0;
- return cap_str[i];
-}
-
-/*
- * Cap reservations
- *
- * Maintain a global pool of preallocated struct ceph_caps, referenced
- * by struct ceph_caps_reservations. This ensures that we preallocate
- * memory needed to successfully process an MDS response. (If an MDS
- * sends us cap information and we fail to process it, we will have
- * problems due to the client and MDS being out of sync.)
- *
- * Reservations are 'owned' by a ceph_cap_reservation context.
- */
-/*
- * Global cap pool state.  The functions below update the list and
- * the counters while holding caps_list_lock, and assert that
- *   caps_total_count == caps_use_count + caps_reserve_count +
- *                       caps_avail_count
- */
-static spinlock_t caps_list_lock;
-static struct list_head caps_list; /* unused (reserved or unreserved) */
-static int caps_total_count; /* total caps allocated */
-static int caps_use_count; /* in use */
-static int caps_reserve_count; /* unused, reserved */
-static int caps_avail_count; /* unused, unreserved */
-
-/*
- * Initialize the (empty) global cap list and its lock.
- */
-void __init ceph_caps_init(void)
-{
- INIT_LIST_HEAD(&caps_list);
- spin_lock_init(&caps_list_lock);
-}
-
-/*
- * Tear down the global cap pool: free every cap still sitting on
- * caps_list and reset all counters to zero.
- */
-void ceph_caps_finalize(void)
-{
-	struct ceph_cap *cap, *next;
-
-	spin_lock(&caps_list_lock);
-	list_for_each_entry_safe(cap, next, &caps_list, caps_item) {
-		list_del(&cap->caps_item);
-		kmem_cache_free(ceph_cap_cachep, cap);
-	}
-	caps_total_count = 0;
-	caps_avail_count = 0;
-	caps_use_count = 0;
-	caps_reserve_count = 0;
-	spin_unlock(&caps_list_lock);
-}
-
-/*
- * Reserve @need caps for the reservation context @ctx.
- *
- * Available (unreserved) caps already in the pool are claimed first;
- * any shortfall is allocated with GFP_NOFS and added to the pool as
- * reserved.  On success ctx->count is set to @need and 0 is returned.
- *
- * On allocation failure nothing stays reserved: the partially
- * allocated batch is freed, the caps claimed from the pool are handed
- * back to the available count, ctx->count is left untouched, and
- * -ENOMEM is returned.
- */
-int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
-{
-	int i;
-	struct ceph_cap *cap;
-	int have;
-	int alloc = 0;
-	LIST_HEAD(newcaps);
-	int ret = 0;
-
-	dout("reserve caps ctx=%p need=%d\n", ctx, need);
-
-	/* first reserve any caps that are already allocated */
-	spin_lock(&caps_list_lock);
-	if (caps_avail_count >= need)
-		have = need;
-	else
-		have = caps_avail_count;
-	caps_avail_count -= have;
-	caps_reserve_count += have;
-	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
-	       caps_avail_count);
-	spin_unlock(&caps_list_lock);
-
-	/* allocate the remainder outside the lock, onto a private list */
-	for (i = have; i < need; i++) {
-		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
-		if (!cap) {
-			ret = -ENOMEM;
-			goto out_alloc_count;
-		}
-		list_add(&cap->caps_item, &newcaps);
-		alloc++;
-	}
-	BUG_ON(have + alloc != need);
-
-	spin_lock(&caps_list_lock);
-	caps_total_count += alloc;
-	caps_reserve_count += alloc;
-	list_splice(&newcaps, &caps_list);
-
-	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
-	       caps_avail_count);
-	spin_unlock(&caps_list_lock);
-
-	ctx->count = need;
-	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
-	     ctx, caps_total_count, caps_use_count, caps_reserve_count,
-	     caps_avail_count);
-	return 0;
-
-out_alloc_count:
-	/* we didn't manage to reserve as much as we needed */
-	pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
-		   ctx, need, have);
-
-	/* free the partial batch; it was never added to the pool */
-	while (!list_empty(&newcaps)) {
-		cap = list_first_entry(&newcaps, struct ceph_cap, caps_item);
-		list_del(&cap->caps_item);
-		kmem_cache_free(ceph_cap_cachep, cap);
-	}
-
-	/* hand the caps we claimed from the pool back to 'available' */
-	spin_lock(&caps_list_lock);
-	BUG_ON(caps_reserve_count < have);
-	caps_reserve_count -= have;
-	caps_avail_count += have;
-	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
-	       caps_avail_count);
-	spin_unlock(&caps_list_lock);
-	return ret;
-}
-
-/*
- * Give back whatever @ctx still has reserved: the caps move from the
- * reserved count to the available count and ctx->count drops to 0.
- * Always returns 0.
- */
-int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
-{
-	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
-	if (!ctx->count)
-		return 0;
-
-	spin_lock(&caps_list_lock);
-	BUG_ON(caps_reserve_count < ctx->count);
-	caps_reserve_count -= ctx->count;
-	caps_avail_count += ctx->count;
-	ctx->count = 0;
-	dout("unreserve caps %d = %d used + %d resv + %d avail\n",
-	     caps_total_count, caps_use_count, caps_reserve_count,
-	     caps_avail_count);
-	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
-	       caps_avail_count);
-	spin_unlock(&caps_list_lock);
-
-	return 0;
-}
-
-/*
- * Obtain a cap for use.
- *
- * With a reservation context, take one preallocated cap off the
- * global list and charge it to @ctx (which must have a non-zero
- * count).  Without a context, allocate a fresh cap with GFP_NOFS;
- * this may return NULL.
- *
- * In both cases the cap is counted as 'in use', because put_cap()
- * unconditionally decrements caps_use_count when the cap is returned.
- */
-static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
-{
-	struct ceph_cap *cap = NULL;
-
-	/* temporary, until we do something about cap import/export */
-	if (!ctx) {
-		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
-		if (cap) {
-			/* keep the pool accounting in step with put_cap() */
-			spin_lock(&caps_list_lock);
-			caps_use_count++;
-			caps_total_count++;
-			BUG_ON(caps_total_count != caps_use_count +
-			       caps_reserve_count + caps_avail_count);
-			spin_unlock(&caps_list_lock);
-		}
-		return cap;
-	}
-
-	spin_lock(&caps_list_lock);
-	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
-	     ctx, ctx->count, caps_total_count, caps_use_count,
-	     caps_reserve_count, caps_avail_count);
-	BUG_ON(!ctx->count);
-	BUG_ON(ctx->count > caps_reserve_count);
-	BUG_ON(list_empty(&caps_list));
-
-	ctx->count--;
-	caps_reserve_count--;
-	caps_use_count++;
-
-	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
-	list_del(&cap->caps_item);
-
-	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
-	       caps_avail_count);
-	spin_unlock(&caps_list_lock);
-	return cap;
-}
-
-/*
- * Return a cap that is no longer in use.
- *
- * The cap is either freed (when enough unreserved caps are already
- * cached) or put back on the global list, credited to @ctx's
- * reservation if @ctx is non-NULL and to the available pool
- * otherwise.
- *
- * cap->ci must still point at a valid inode: it is used to find the
- * client's max_readdir mount option.
- */
-static void put_cap(struct ceph_cap *cap,
- struct ceph_cap_reservation *ctx)
-{
- spin_lock(&caps_list_lock);
- dout("put_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
- ctx, ctx ? ctx->count : 0, caps_total_count, caps_use_count,
- caps_reserve_count, caps_avail_count);
- caps_use_count--;
- /*
- * Keep some preallocated caps around, at least enough to do a
- * readdir (which needs to preallocate lots of them), to avoid
- * lots of free/alloc churn.
- */
- if (caps_avail_count >= caps_reserve_count +
- ceph_client(cap->ci->vfs_inode.i_sb)->mount_args.max_readdir) {
- caps_total_count--;
- kmem_cache_free(ceph_cap_cachep, cap);
- } else {
- if (ctx) {
- ctx->count++;
- caps_reserve_count++;
- } else {
- caps_avail_count++;
- }
- list_add(&cap->caps_item, &caps_list);
- }
-
- BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
- caps_avail_count);
- spin_unlock(&caps_list_lock);
-}
-
-/*
- * Report the global cap pool counters.  Each output pointer may be
- * NULL to skip that value.  @client is not used.  The counters are
- * read without taking caps_list_lock, so the values are only a
- * snapshot and may not be mutually consistent.
- */
-void ceph_reservation_status(struct ceph_client *client,
- int *total, int *avail, int *used, int *reserved)
-{
- if (total)
- *total = caps_total_count;
- if (avail)
- *avail = caps_avail_count;
- if (used)
- *used = caps_use_count;
- if (reserved)
- *reserved = caps_reserve_count;
-}
-
-/*
- * Look up the cap this inode holds from MDS @mds in the per-inode
- * rbtree (keyed by mds id).  Returns NULL if there is none.
- *
- * Called with i_lock held.
- */
-static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
-{
-	struct rb_node *node = ci->i_caps.rb_node;
-
-	while (node) {
-		struct ceph_cap *cur = rb_entry(node, struct ceph_cap, ci_node);
-
-		if (mds == cur->mds)
-			return cur;
-		node = (mds < cur->mds) ? node->rb_left : node->rb_right;
-	}
-	return NULL;
-}
-
-/*
- * Pick an MDS we hold a cap from.  The first cap that carries
- * FILE_WR, FILE_BUFFER or FILE_EXCL wins; otherwise the last cap in
- * the tree is used.  Returns the mds id, or -1 if there are no caps.
- * If @mseq is non-NULL it receives the chosen cap's mseq.
- */
-static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
-{
-	const int preferred = CEPH_CAP_FILE_WR |
-			      CEPH_CAP_FILE_BUFFER |
-			      CEPH_CAP_FILE_EXCL;
-	struct rb_node *node;
-	int mds = -1;
-
-	/* prefer mds with WR|WRBUFFER|EXCL caps */
-	for (node = rb_first(&ci->i_caps); node; node = rb_next(node)) {
-		struct ceph_cap *cur = rb_entry(node, struct ceph_cap, ci_node);
-
-		mds = cur->mds;
-		if (mseq)
-			*mseq = cur->mseq;
-		if (cur->issued & preferred)
-			break;
-	}
-	return mds;
-}
-
-/*
- * Locked wrapper around __ceph_get_cap_mds: takes i_lock and returns
- * the id of an MDS we hold a cap from, or -1 if there is none.
- */
-int ceph_get_cap_mds(struct inode *inode)
-{
- int mds;
- spin_lock(&inode->i_lock);
- mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
- spin_unlock(&inode->i_lock);
- return mds;
-}
-
-/*
- * Link @new into the inode's cap rbtree, ordered by mds id.  It is a
- * bug for a cap from the same mds to be present already.
- *
- * Called under i_lock.
- */
-static void __insert_cap_node(struct ceph_inode_info *ci,
-			      struct ceph_cap *new)
-{
-	struct rb_node **link = &ci->i_caps.rb_node;
-	struct rb_node *parent = NULL;
-
-	while (*link) {
-		struct ceph_cap *cur;
-
-		parent = *link;
-		cur = rb_entry(parent, struct ceph_cap, ci_node);
-		if (new->mds < cur->mds)
-			link = &parent->rb_left;
-		else if (new->mds > cur->mds)
-			link = &parent->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->ci_node, parent, link);
-	rb_insert_color(&new->ci_node, &ci->i_caps);
-}
-
-/*
- * (re)set cap hold timeouts, which control the delayed release
- * of unused caps back to the MDS. Should be called on cap use.
- *
- * i_hold_caps_min and i_hold_caps_max are set to now plus the
- * caps_wanted_delay_min / caps_wanted_delay_max mount options
- * (multiplied by HZ), rounded with round_jiffies().
- */
-static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
- struct ceph_inode_info *ci)
-{
- struct ceph_mount_args *ma = &mdsc->client->mount_args;
-
- ci->i_hold_caps_min = round_jiffies(jiffies +
- ma->caps_wanted_delay_min * HZ);
- ci->i_hold_caps_max = round_jiffies(jiffies +
- ma->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
- ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
-}
-
-/*
- * Refresh the inode's hold timeouts and (re)queue it at the tail of
- * the delayed cap release list.
- *
- * An inode that is already queued and marked I_FLUSH keeps its
- * current position.  Nothing is queued once the mds client is
- * stopping (the timeouts are still refreshed).
- *
- * Caller holds i_lock
- *    -> we take mdsc->cap_delay_lock
- */
-static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
-				struct ceph_inode_info *ci)
-{
-	__cap_set_timeouts(mdsc, ci);
-	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
-	     ci->i_ceph_flags, ci->i_hold_caps_max);
-	if (mdsc->stopping)
-		return;
-
-	spin_lock(&mdsc->cap_delay_lock);
-	if (list_empty(&ci->i_cap_delay_list)) {
-		/* not queued yet */
-		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
-	} else if (!(ci->i_ceph_flags & CEPH_I_FLUSH)) {
-		/* queued and not flushing: move to the tail */
-		list_del_init(&ci->i_cap_delay_list);
-		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
-	}
-	spin_unlock(&mdsc->cap_delay_lock);
-}
-
-/*
- * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
- * indicating we should send a cap message to flush dirty metadata
- * asap, and move to the front of the delayed cap list.
- *
- * Unlike __cap_delay_requeue, this neither touches the hold timeouts
- * nor checks mdsc->stopping.  The flag and the list are both updated
- * under mdsc->cap_delay_lock.
- */
-static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
- struct ceph_inode_info *ci)
-{
- dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
- spin_lock(&mdsc->cap_delay_lock);
- ci->i_ceph_flags |= CEPH_I_FLUSH;
- if (!list_empty(&ci->i_cap_delay_list))
- list_del_init(&ci->i_cap_delay_list);
- list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
- spin_unlock(&mdsc->cap_delay_lock);
-}
-
-/*
- * Cancel delayed work on cap.
- *
- * Removes the inode from the delayed cap list if it is queued.  The
- * list_empty() test is done before taking mdsc->cap_delay_lock; the
- * removal itself is done under that lock.
- *
- * Caller must hold i_lock.
- */
-static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
- struct ceph_inode_info *ci)
-{
- dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
- if (list_empty(&ci->i_cap_delay_list))
- return;
- spin_lock(&mdsc->cap_delay_lock);
- list_del_init(&ci->i_cap_delay_list);
- spin_unlock(&mdsc->cap_delay_lock);
-}
-
-/*
- * Bookkeeping shared by add_cap and handle_cap_grant, run before the
- * newly @issued bits are stored.  Compares @issued against what the
- * inode currently holds and bumps the generation counters for caps
- * that are being gained.  (@cap is currently unused.)
- */
-static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
-			      unsigned issued)
-{
-	unsigned had = __ceph_caps_issued(ci, NULL);
-	int gained_cache = (issued & CEPH_CAP_FILE_CACHE) &&
-			   (had & CEPH_CAP_FILE_CACHE) == 0;
-	int gained_shared = (issued & CEPH_CAP_FILE_SHARED) &&
-			    (had & CEPH_CAP_FILE_SHARED) == 0;
-
-	/* every fresh grant of FILE_CACHE starts a new rdcache generation */
-	if (gained_cache)
-		ci->i_rdcache_gen++;
-
-	if (!gained_shared)
-		return;
-
-	/*
-	 * Newly issued FILE_SHARED: we don't know what happened to a
-	 * directory while we didn't have the cap, so it can no longer
-	 * be treated as complete.
-	 */
-	ci->i_shared_gen++;
-	if (S_ISDIR(ci->vfs_inode.i_mode)) {
-		dout(" marking %p NOT complete\n", &ci->vfs_inode);
-		ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-	}
-}
-
-/*
- * Add a capability under the given MDS session.
- *
- * If the inode already has a cap from this session's MDS it is
- * updated in place; otherwise a new cap is obtained via get_cap(),
- * inserted into the inode's cap tree and added to the session's cap
- * list.  The inode is attached to its snap realm if it has none yet.
- *
- * Caller should hold session snap_rwsem (read) and s_mutex.
- *
- * @fmode is the open file mode, if we are opening a file, otherwise
- * it is < 0. (This is so we can atomically add the cap and add an
- * open file reference to it.)
- *
- * Returns 0 on success, -ENOMEM if a new cap could not be obtained.
- */
-int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned seq, unsigned mseq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation)
-{
- struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *new_cap = NULL;
- struct ceph_cap *cap;
- int mds = session->s_mds;
- int actual_wanted;
-
- dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
- session->s_mds, cap_id, ceph_cap_string(issued), seq);
-
- /*
- * If we are opening the file, include file mode wanted bits
- * in wanted.
- */
- if (fmode >= 0)
- wanted |= ceph_caps_for_mode(fmode);
-
- /*
- * get_cap() is called with i_lock dropped, so after obtaining a
- * new cap we retake the lock and look the mds up again.
- *
- * NOTE(review): if the second lookup finds an existing cap,
- * new_cap is never released on that path -- looks like a leak;
- * confirm whether s_mutex makes this unreachable.
- */
-retry:
- spin_lock(&inode->i_lock);
- cap = __get_cap_for_mds(ci, mds);
- if (!cap) {
- if (new_cap) {
- cap = new_cap;
- new_cap = NULL;
- } else {
- spin_unlock(&inode->i_lock);
- new_cap = get_cap(caps_reservation);
- if (new_cap == NULL)
- return -ENOMEM;
- goto retry;
- }
-
- cap->issued = 0;
- cap->implemented = 0;
- cap->mds = mds;
- cap->mds_wanted = 0;
-
- cap->ci = ci;
- __insert_cap_node(ci, cap);
-
- /* clear out old exporting info? (i.e. on cap import) */
- if (ci->i_cap_exporting_mds == mds) {
- ci->i_cap_exporting_issued = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_mds = -1;
- }
-
- /* add to session cap list */
- cap->session = session;
- spin_lock(&session->s_cap_lock);
- list_add_tail(&cap->session_caps, &session->s_caps);
- session->s_nr_caps++;
- spin_unlock(&session->s_cap_lock);
- }
-
- if (!ci->i_snap_realm) {
- /*
- * add this inode to the appropriate snap realm
- */
- struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
- realmino);
- if (realm) {
- ceph_get_snap_realm(mdsc, realm);
- spin_lock(&realm->inodes_with_caps_lock);
- ci->i_snap_realm = realm;
- list_add(&ci->i_snap_realm_item,
- &realm->inodes_with_caps);
- spin_unlock(&realm->inodes_with_caps_lock);
- } else {
- /* logged only; the cap is still added without a realm */
- pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
- realmino);
- }
- }
-
- /* must run before cap->issued is overwritten below */
- __check_cap_issue(ci, cap, issued);
-
- /*
- * If we are issued caps we don't want, or the mds' wanted
- * value appears to be off, queue a check so we'll release
- * later and/or update the mds wanted value.
- */
- actual_wanted = __ceph_caps_wanted(ci);
- if ((wanted & ~actual_wanted) ||
- (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
- dout(" issued %s, mds wanted %s, actual %s, queueing\n",
- ceph_cap_string(issued), ceph_cap_string(wanted),
- ceph_cap_string(actual_wanted));
- __cap_delay_requeue(mdsc, ci);
- }
-
- /* track which cap, if any, is the auth cap for this inode */
- if (flags & CEPH_CAP_FLAG_AUTH)
- ci->i_auth_cap = cap;
- else if (ci->i_auth_cap == cap)
- ci->i_auth_cap = NULL;
-
- dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
- inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
- ceph_cap_string(issued|cap->issued), seq, mds);
- /* issued is replaced; implemented and mds_wanted accumulate */
- cap->cap_id = cap_id;
- cap->issued = issued;
- cap->implemented |= issued;
- cap->mds_wanted |= wanted;
- cap->seq = seq;
- cap->issue_seq = seq;
- cap->mseq = mseq;
- cap->gen = session->s_cap_gen;
-
- if (fmode >= 0)
- __ceph_get_fmode(ci, fmode);
- spin_unlock(&inode->i_lock);
- wake_up(&ci->i_cap_wq);
- return 0;
-}
-
-/*
- * A cap is valid when its generation is not older than the session's
- * current cap generation and the session's cap ttl has not yet been
- * reached (i.e. it has not gone 'stale' due to us losing touch with
- * the mds).  Returns 1 if valid, 0 if stale.
- */
-static int __cap_is_valid(struct ceph_cap *cap)
-{
-	struct ceph_mds_session *s = cap->session;
-	unsigned long ttl;
-	u32 gen;
-
-	/* take a consistent snapshot of the session's gen and ttl */
-	spin_lock(&s->s_cap_lock);
-	gen = s->s_cap_gen;
-	ttl = s->s_cap_ttl;
-	spin_unlock(&s->s_cap_lock);
-
-	if (cap->gen >= gen && time_before(jiffies, ttl))
-		return 1;
-
-	dout("__cap_is_valid %p cap %p issued %s "
-	     "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
-	     cap, ceph_cap_string(cap->issued), cap->gen, gen);
-	return 0;
-}
-
-/*
- * Return set of valid cap bits issued to us. Note that caps time
- * out, and may be invalidated in bulk if the client session times out
- * and session->s_cap_gen is bumped.
- *
- * The result starts from ci->i_snap_caps and ORs in the issued bits
- * of every cap that is still valid.  If @implemented is non-NULL it
- * receives the OR of the implemented bits of those same caps.
- */
-int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
-{
- int have = ci->i_snap_caps;
- struct ceph_cap *cap;
- struct rb_node *p;
-
- if (implemented)
- *implemented = 0;
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- cap = rb_entry(p, struct ceph_cap, ci_node);
- /* stale caps contribute nothing */
- if (!__cap_is_valid(cap))
- continue;
- dout("__ceph_caps_issued %p cap %p issued %s\n",
- &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
- have |= cap->issued;
- if (implemented)
- *implemented |= cap->implemented;
- }
- return have;
-}
-
-/*
- * Like __ceph_caps_issued(), but leave @ocap out of the union: the
- * result is i_snap_caps plus the issued bits of every other valid
- * cap on the inode.
- */
-int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
-{
-	struct rb_node *node;
-	int have = ci->i_snap_caps;
-
-	for (node = rb_first(&ci->i_caps); node; node = rb_next(node)) {
-		struct ceph_cap *cur = rb_entry(node, struct ceph_cap, ci_node);
-
-		if (cur == ocap || !__cap_is_valid(cur))
-			continue;
-		have |= cur->issued;
-	}
-	return have;
-}
-
-/*
- * Move a cap to the end of the LRU (oldest caps at list head, newest
- * at list tail).
- *
- * The LRU is the owning session's s_caps list; the move is done
- * under that session's s_cap_lock.
- */
-static void __touch_cap(struct ceph_cap *cap)
-{
- struct ceph_mds_session *s = cap->session;
-
- dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
- s->s_mds);
- spin_lock(&s->s_cap_lock);
- list_move_tail(&cap->session_caps, &s->s_caps);
- spin_unlock(&s->s_cap_lock);
-}
-
-/*
- * Check if we hold the given mask. If so, and @touch is set, move
- * the cap(s) that satisfy it to the tail (newest end) of their
- * respective session LRUs. (This is the preferred way for callers
- * to check for caps they want.)
- *
- * The mask may be satisfied by the snap caps alone, by a single valid
- * cap, or by the union of several valid caps.  Returns 1 if all bits
- * in @mask are held, 0 otherwise.
- */
-int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
-{
- struct ceph_cap *cap;
- struct rb_node *p;
- int have = ci->i_snap_caps;
-
- /* snap caps alone are enough; there is no cap to touch */
- if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask %p snap issued %s"
- " (mask %s)\n", &ci->vfs_inode,
- ceph_cap_string(have),
- ceph_cap_string(mask));
- return 1;
- }
-
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- cap = rb_entry(p, struct ceph_cap, ci_node);
- if (!__cap_is_valid(cap))
- continue;
- /* a single cap covers the whole mask */
- if ((cap->issued & mask) == mask) {
- dout("__ceph_caps_issued_mask %p cap %p issued %s"
- " (mask %s)\n", &ci->vfs_inode, cap,
- ceph_cap_string(cap->issued),
- ceph_cap_string(mask));
- if (touch)
- __touch_cap(cap);
- return 1;
- }
-
- /* does a combination of caps satisfy mask? */
- have |= cap->issued;
- if ((have & mask) == mask) {
- /* note: this logs the last cap's bits, not the union */
- dout("__ceph_caps_issued_mask %p combo issued %s"
- " (mask %s)\n", &ci->vfs_inode,
- ceph_cap_string(cap->issued),
- ceph_cap_string(mask));
- if (touch) {
- struct rb_node *q;
-
- /* touch this + preceding caps */
- __touch_cap(cap);
- for (q = rb_first(&ci->i_caps); q != p;
- q = rb_next(q)) {
- cap = rb_entry(q, struct ceph_cap,
- ci_node);
- if (!__cap_is_valid(cap))
- continue;
- __touch_cap(cap);
- }
- }
- return 1;
- }
- }
-
- return 0;
-}
-
-/*
- * Return true if any of the @mask caps are currently being revoked
- * by an MDS, i.e. some valid cap still has them implemented but no
- * longer issued.  Takes i_lock internally.
- */
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
-{
-	struct inode *inode = &ci->vfs_inode;
-	struct rb_node *node;
-	int ret = 0;
-
-	spin_lock(&inode->i_lock);
-	for (node = rb_first(&ci->i_caps); node; node = rb_next(node)) {
-		struct ceph_cap *cur = rb_entry(node, struct ceph_cap, ci_node);
-
-		if (!__cap_is_valid(cur))
-			continue;
-		if (cur->implemented & ~cur->issued & mask) {
-			ret = 1;
-			break;
-		}
-	}
-	spin_unlock(&inode->i_lock);
-
-	dout("ceph_caps_revoking %p %s = %d\n", inode,
-	     ceph_cap_string(mask), ret);
-	return ret;
-}
-
-/*
- * Return the cap bits currently in use on this inode, derived from
- * its reference counters.  FILE_CACHE is reported when either
- * i_rdcache_ref or i_rdcache_gen is non-zero.
- */
-int __ceph_caps_used(struct ceph_inode_info *ci)
-{
- int used = 0;
- if (ci->i_pin_ref)
- used |= CEPH_CAP_PIN;
- if (ci->i_rd_ref)
- used |= CEPH_CAP_FILE_RD;
- if (ci->i_rdcache_ref || ci->i_rdcache_gen)
- used |= CEPH_CAP_FILE_CACHE;
- if (ci->i_wr_ref)
- used |= CEPH_CAP_FILE_WR;
- if (ci->i_wrbuffer_ref)
- used |= CEPH_CAP_FILE_BUFFER;
- return used;
-}
-
-/*
- * Caps wanted by virtue of open file modes: the union of
- * ceph_caps_for_mode() over every mode (0..3) that has at least one
- * open reference in i_nr_by_mode.
- */
-int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
-{
-	int mode;
-	int want = 0;
-
-	for (mode = 0; mode < 4; mode++) {
-		if (!ci->i_nr_by_mode[mode])
-			continue;
-		want |= ceph_caps_for_mode(mode);
-	}
-	return want;
-}
-
-/*
- * Return the caps we have registered with the MDS(s) as 'wanted':
- * the union of mds_wanted over all caps that are still valid.
- */
-int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
-{
-	struct rb_node *node;
-	int wanted = 0;
-
-	for (node = rb_first(&ci->i_caps); node; node = rb_next(node)) {
-		struct ceph_cap *cur = rb_entry(node, struct ceph_cap, ci_node);
-
-		if (__cap_is_valid(cur))
-			wanted |= cur->mds_wanted;
-	}
-	return wanted;
-}
-
-/*
- * Return true if the inode has any cap in its tree, or has a cap
- * export recorded (i_cap_exporting_mds >= 0).
- *
- * called under i_lock
- */
-static int __ceph_is_any_caps(struct ceph_inode_info *ci)
-{
- return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
-}
-
-/*
- * Remove a cap from its session list and from the inode's cap tree,
- * and hand it back to the cap pool via put_cap().
- *
- * If that leaves the inode with no caps at all, it is also detached
- * from its snap realm (dropping the realm reference); if no real caps
- * remain, any delayed cap work for the inode is cancelled.
- *
- * caller should hold i_lock, and session s_mutex.
- * This function returns nothing; @cap must not be used afterwards
- * since put_cap() may free it.
- */
-void __ceph_remove_cap(struct ceph_cap *cap,
- struct ceph_cap_reservation *ctx)
-{
- struct ceph_mds_session *session = cap->session;
- struct ceph_inode_info *ci = cap->ci;
- struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
-
- dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
-
- /* remove from session list */
- spin_lock(&session->s_cap_lock);
- list_del_init(&cap->session_caps);
- session->s_nr_caps--;
- spin_unlock(&session->s_cap_lock);
-
- /* remove from inode list */
- rb_erase(&cap->ci_node, &ci->i_caps);
- cap->session = NULL;
- if (ci->i_auth_cap == cap)
- ci->i_auth_cap = NULL;
-
- /* cap->ci is left set: put_cap() still dereferences it */
- put_cap(cap, ctx);
-
- if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
- struct ceph_snap_realm *realm = ci->i_snap_realm;
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm_counter++;
- ci->i_snap_realm = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
- }
- if (!__ceph_is_any_real_caps(ci))
- __cap_delay_cancel(mdsc, ci);
-}
-
-/*
- * Build and send a cap message to the given MDS.
- *
- * Allocates a CEPH_MSG_CLIENT_CAPS message, fills the ceph_mds_caps
- * header from the arguments (converted to little-endian), optionally
- * attaches @xattrs_buf as the message middle (taking a new reference
- * on it), and queues the message on the session's connection.
- *
- * @mtime and @atime may be NULL, in which case the corresponding
- * fields stay zeroed.
- *
- * Returns 0 on success, or the error from ceph_msg_new().
- *
- * Caller should be holding s_mutex.
- */
-static int send_cap_msg(struct ceph_mds_session *session,
- u64 ino, u64 cid, int op,
- int caps, int wanted, int dirty,
- u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
- u64 size, u64 max_size,
- struct timespec *mtime, struct timespec *atime,
- u64 time_warp_seq,
- uid_t uid, gid_t gid, mode_t mode,
- u64 xattr_version,
- struct ceph_buffer *xattrs_buf,
- u64 follows)
-{
- struct ceph_mds_caps *fc;
- struct ceph_msg *msg;
-
- dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
- " seq %u/%u mseq %u follows %lld size %llu/%llu"
- " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
- cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
- ceph_cap_string(dirty),
- seq, issue_seq, mseq, follows, size, max_size,
- xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
-
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
- if (IS_ERR(msg))
- return PTR_ERR(msg);
-
- fc = msg->front.iov_base;
-
- /* start from all-zero so optional fields default to 0 */
- memset(fc, 0, sizeof(*fc));
-
- fc->cap_id = cpu_to_le64(cid);
- fc->op = cpu_to_le32(op);
- fc->seq = cpu_to_le32(seq);
- fc->client_tid = cpu_to_le64(flush_tid);
- fc->issue_seq = cpu_to_le32(issue_seq);
- fc->migrate_seq = cpu_to_le32(mseq);
- fc->caps = cpu_to_le32(caps);
- fc->wanted = cpu_to_le32(wanted);
- fc->dirty = cpu_to_le32(dirty);
- fc->ino = cpu_to_le64(ino);
- fc->snap_follows = cpu_to_le64(follows);
-
- fc->size = cpu_to_le64(size);
- fc->max_size = cpu_to_le64(max_size);
- if (mtime)
- ceph_encode_timespec(&fc->mtime, mtime);
- if (atime)
- ceph_encode_timespec(&fc->atime, atime);
- /* NOTE(review): u64 argument is narrowed to 32 bits here */
- fc->time_warp_seq = cpu_to_le32(time_warp_seq);
-
- fc->uid = cpu_to_le32(uid);
- fc->gid = cpu_to_le32(gid);
- fc->mode = cpu_to_le32(mode);
-
- fc->xattr_version = cpu_to_le64(xattr_version);
- if (xattrs_buf) {
- /* the message holds its own reference on the xattr blob */
- msg->middle = ceph_buffer_get(xattrs_buf);
- fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
- msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
- }
-
- ceph_con_send(&session->s_con, msg);
- return 0;
-}
-
-/*
- * Queue cap releases when an inode is dropped from our
- * cache.
- *
- * For every cap on the inode, append a release item to the first
- * message on the owning session's s_cap_releases list, then remove
- * the cap.  A message that reaches CEPH_CAPS_PER_RELEASE items is
- * moved to s_cap_releases_done.  It is a bug for the session to have
- * no release slots left (s_num_cap_releases == 0).
- *
- * Takes i_lock, and each session's s_cap_lock inside it.
- */
-void ceph_queue_caps_release(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct rb_node *p;
-
- spin_lock(&inode->i_lock);
- p = rb_first(&ci->i_caps);
- while (p) {
- struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
- struct ceph_mds_session *session = cap->session;
- struct ceph_msg *msg;
- struct ceph_mds_cap_release *head;
- struct ceph_mds_cap_item *item;
-
- spin_lock(&session->s_cap_lock);
- BUG_ON(!session->s_num_cap_releases);
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
-
- dout(" adding %p release to mds%d msg %p (%d left)\n",
- inode, session->s_mds, msg, session->s_num_cap_releases);
-
- /* the new item is written at the current end of the front */
- BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
- head = msg->front.iov_base;
- head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
- item = msg->front.iov_base + msg->front.iov_len;
- item->ino = cpu_to_le64(ceph_ino(inode));
- item->cap_id = cpu_to_le64(cap->cap_id);
- item->migrate_seq = cpu_to_le32(cap->mseq);
- item->seq = cpu_to_le32(cap->issue_seq);
-
- session->s_num_cap_releases--;
-
- msg->front.iov_len += sizeof(*item);
- if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
- dout(" release msg %p full\n", msg);
- list_move_tail(&msg->list_head,
- &session->s_cap_releases_done);
- } else {
- dout(" release msg %p at %d/%d (%d)\n", msg,
- (int)le32_to_cpu(head->num),
- (int)CEPH_CAPS_PER_RELEASE,
- (int)msg->front.iov_len);
- }
- spin_unlock(&session->s_cap_lock);
- /* advance first: __ceph_remove_cap erases this node */
- p = rb_next(p);
- __ceph_remove_cap(cap, NULL);
-
- }
- spin_unlock(&inode->i_lock);
-}
-
-/*
- * Send a cap msg on the given inode. Update our caps state, then
- * drop i_lock and send the message.
- *
- * Make note of max_size reported/requested from mds, revoked caps
- * that have now been implemented.
- *
- * Make a half-hearted attempt to invalidate the page cache if we are
- * dropping RDCACHE. Note that this will leave behind locked pages
- * that we'll then need to deal with elsewhere.
- *
- * Return non-zero if delayed release, or we experienced an error
- * such that the caller should requeue + retry later.
- *
- * called with i_lock, then drops it.
- * caller should hold snap_rwsem (read), s_mutex.
- */
-static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
- int op, int used, int want, int retain, int flushing,
- unsigned *pflush_tid)
- __releases(cap->ci->vfs_inode->i_lock)
-{
- struct ceph_inode_info *ci = cap->ci;
- struct inode *inode = &ci->vfs_inode;
- u64 cap_id = cap->cap_id;
- int held = cap->issued | cap->implemented;
- int revoking = cap->implemented & ~cap->issued;
- int dropping = cap->issued & ~retain;
- int keep;
- u64 seq, issue_seq, mseq, time_warp_seq, follows;
- u64 size, max_size;
- struct timespec mtime, atime;
- int wake = 0;
- mode_t mode;
- uid_t uid;
- gid_t gid;
- struct ceph_mds_session *session;
- u64 xattr_version = 0;
- int delayed = 0;
- u64 flush_tid = 0;
- int i;
- int ret;
-
- dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
- inode, cap, cap->session,
- ceph_cap_string(held), ceph_cap_string(held & retain),
- ceph_cap_string(revoking));
- BUG_ON((retain & CEPH_CAP_PIN) == 0);
-
- session = cap->session;
-
- /* don't release wanted unless we've waited a bit. */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_min)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- want |= cap->mds_wanted;
- retain |= cap->issued;
- delayed = 1;
- }
- ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
-
- cap->issued &= retain; /* drop bits we don't want */
- if (cap->implemented & ~cap->issued) {
- /*
- * Wake up any waiters on wanted -> needed transition.
- * This is due to the weird transition from buffered
- * to sync IO... we need to flush dirty pages _before_
- * allowing sync writes to avoid reordering.
- */
- wake = 1;
- }
- cap->implemented &= cap->issued | used;
- cap->mds_wanted = want;
-
- if (flushing) {
- /*
- * assign a tid for flush operations so we can avoid
- * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
- * clean type races. track latest tid for every bit
- * so we can handle flush AxFw, flush Fw, and have the
- * first ack clean Ax.
- */
- flush_tid = ++ci->i_cap_flush_last_tid;
- if (pflush_tid)
- *pflush_tid = flush_tid;
- dout(" cap_flush_tid %d\n", (int)flush_tid);
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if (flushing & (1 << i))
- ci->i_cap_flush_tid[i] = flush_tid;
- }
-
- keep = cap->implemented;
- seq = cap->seq;
- issue_seq = cap->issue_seq;
- mseq = cap->mseq;
- size = inode->i_size;
- ci->i_reported_size = size;
- max_size = ci->i_wanted_max_size;
- ci->i_requested_max_size = max_size;
- mtime = inode->i_mtime;
- atime = inode->i_atime;
- time_warp_seq = ci->i_time_warp_seq;
- follows = ci->i_snap_realm->cached_context->seq;
- uid = inode->i_uid;
- gid = inode->i_gid;
- mode = inode->i_mode;
-
- if (dropping & CEPH_CAP_XATTR_EXCL) {
- __ceph_build_xattrs_blob(ci);
- xattr_version = ci->i_xattrs.version + 1;
- }
-
- spin_unlock(&inode->i_lock);
-
- if (dropping & CEPH_CAP_FILE_CACHE) {
- /* invalidate what we can */
- dout("invalidating pages on %p\n", inode);
- invalidate_mapping_pages(&inode->i_data, 0, -1);
- }
-
- ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
- op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
- size, max_size, &mtime, &atime, time_warp_seq,
- uid, gid, mode,
- xattr_version,
- (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
- follows);
- if (ret < 0) {
- dout("error sending cap msg, must requeue %p\n", inode);
- delayed = 1;
- }
-
- if (wake)
- wake_up(&ci->i_cap_wq);
-
- return delayed;
-}
-
-/*
- * When a snapshot is taken, clients accumulate dirty metadata on
- * inodes with capabilities in ceph_cap_snaps to describe the file
- * state at the time the snapshot was taken. This must be flushed
- * asynchronously back to the MDS once sync writes complete and dirty
- * data is written out.
- *
- * Walks ci->i_cap_snaps and sends a CEPH_CAP_OP_FLUSHSNAP message for
- * every entry that has no dirty pages and no write in progress.
- * Because i_lock is dropped around the session lookup and around each
- * send, the walk restarts from the head of the list afterwards;
- * next_follows lets the restarted walk skip entries already handled.
- *
- * Parameters:
- *   ci       - inode whose cap snaps are flushed.
- *   psession - optional in/out. If non-NULL, *psession is used as the
- *              starting session (treated as having s_mutex held) and
- *              on return is set to whatever session this function
- *              ended up holding (possibly NULL or a different one).
- *              If a starting session is for the wrong mds, its
- *              s_mutex is unlocked and ceph_put_mds_session() is
- *              called on it. If psession is NULL, any session taken
- *              here is unlocked and put before returning.
- *
- * Called under i_lock; i_lock is dropped and retaken internally and is
- * held again on return. Takes s_mutex as needed.
- *
- * NOTE(review): the "follows" value printed by the flush_snaps debug
- * line is next_follows, not capsnap->follows - confirm that is the
- * intended value to log.
- */
-void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession)
-{
- struct inode *inode = &ci->vfs_inode;
- int mds;
- struct ceph_cap_snap *capsnap;
- u32 mseq;
- struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
- struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
- session->s_mutex */
- u64 next_follows = 0; /* keep track of how far we've gotten through the
- i_cap_snaps list, and skip these entries next time
- around to avoid an infinite loop */
-
- if (psession)
- session = *psession;
-
- dout("__flush_snaps %p\n", inode);
-retry:
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- /* avoid an infinite loop after retry */
- if (capsnap->follows < next_follows)
- continue;
- /*
- * we need to wait for sync writes to complete and for dirty
- * pages to be written out.
- */
- if (capsnap->dirty_pages || capsnap->writing)
- continue;
-
- /* pick mds, take s_mutex */
- mds = __ceph_get_cap_mds(ci, &mseq);
- if (session && session->s_mds != mds) {
- dout("oops, wrong session %p mutex\n", session);
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- session = NULL;
- }
- if (!session) {
- spin_unlock(&inode->i_lock);
- mutex_lock(&mdsc->mutex);
- session = __ceph_lookup_mds_session(mdsc, mds);
- mutex_unlock(&mdsc->mutex);
- if (session) {
- dout("inverting session/ino locks on %p\n",
- session);
- mutex_lock(&session->s_mutex);
- }
- /*
- * if session == NULL, we raced against a cap
- * deletion. retry, and we'll get a better
- * @mds value next time.
- */
- spin_lock(&inode->i_lock);
- goto retry;
- }
-
- capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
- atomic_inc(&capsnap->nref); /* hold the capsnap while i_lock is dropped for the send */
- if (!list_empty(&capsnap->flushing_item))
- list_del_init(&capsnap->flushing_item);
- list_add_tail(&capsnap->flushing_item,
- &session->s_cap_snaps_flushing);
- spin_unlock(&inode->i_lock);
-
- dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
- inode, capsnap, next_follows, capsnap->size);
- send_cap_msg(session, ceph_vino(inode).ino, 0,
- CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
- capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
- capsnap->size, 0,
- &capsnap->mtime, &capsnap->atime,
- capsnap->time_warp_seq,
- capsnap->uid, capsnap->gid, capsnap->mode,
- 0, NULL,
- capsnap->follows);
-
- next_follows = capsnap->follows + 1;
- ceph_put_cap_snap(capsnap);
-
- spin_lock(&inode->i_lock);
- goto retry;
- }
-
- /* we flushed them all; remove this inode from the queue */
- spin_lock(&mdsc->snap_flush_lock);
- list_del_init(&ci->i_snap_flush_item);
- spin_unlock(&mdsc->snap_flush_lock);
-
- if (psession)
- *psession = session;
- else if (session) {
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- }
-}
-
-/*
- * Locked wrapper around __ceph_flush_snaps(): take the inode's i_lock,
- * flush cap snaps without a caller-supplied session, and drop i_lock
- * again afterwards.
- */
-static void ceph_flush_snaps(struct ceph_inode_info *ci)
-{
-	spin_lock(&ci->vfs_inode.i_lock);
-	__ceph_flush_snaps(ci, NULL);
-	spin_unlock(&ci->vfs_inode.i_lock);
-}
-
-/*
- * Queue a dirty inode on the session's s_cap_flushing list, unless it
- * is already on a flushing list.  A newly queued inode is stamped with
- * the next global flush sequence number so that waiters can wait for
- * caps to flush without starving.
- *
- * The inode must already be on a dirty list (BUG otherwise).  The list
- * and counters are updated under mdsc->cap_dirty_lock.
- */
-static void __mark_caps_flushing(struct inode *inode,
-				 struct ceph_mds_session *session)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
-
-	BUG_ON(list_empty(&ci->i_dirty_item));
-
-	spin_lock(&mdsc->cap_dirty_lock);
-	if (!list_empty(&ci->i_flushing_item))
-		goto unlock;		/* already queued for flush */
-
-	list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
-	mdsc->num_cap_flushing += 1;
-	mdsc->cap_flush_seq += 1;
-	ci->i_cap_flush_seq = mdsc->cap_flush_seq;
-	dout(" inode %p now flushing seq %lld\n", &ci->vfs_inode,
-	     ci->i_cap_flush_seq);
-unlock:
-	spin_unlock(&mdsc->cap_dirty_lock);
-}
-
-/*
- * Swiss army knife function to examine currently used and wanted
- * versus held caps. Release, flush, ack revoked caps to mds as
- * appropriate.
- *
- * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
- * cap release further.
- * CHECK_CAPS_AUTHONLY - we should only check the auth cap
- * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
- * further delay.
- *
- * Parameters:
- *   ci      - inode whose caps are examined.
- *   flags   - CHECK_CAPS_* bits above; CHECK_CAPS_FLUSH is also forced
- *             on if CEPH_I_FLUSH is set in ci->i_ceph_flags.
- *   session - optional. If non-NULL it is treated as a session whose
- *             s_mutex is already held, and the exit path does not
- *             unlock it. If NULL, any session mutex taken here is
- *             released before returning.
- *
- * Takes and releases i_lock itself. For each cap that needs a message
- * it takes the cap's session s_mutex and mdsc->snap_rwsem (read),
- * first with trylock while holding i_lock; if a trylock fails it drops
- * i_lock, blocks for the lock, and restarts the scan from "retry".
- * The "mds" variable remembers the last mds a message was sent to so
- * the restarted scan skips caps already handled.
- *
- * NOTE(review): "flushing" is only ever assigned for the auth cap and
- * is never reset, so after a restart the same value is also passed to
- * __send_cap() for any later non-auth cap - confirm this is intended.
- * NOTE(review): if a caller-supplied session turns out to be the wrong
- * one (or __ceph_flush_snaps() replaces it), its s_mutex is unlocked
- * here and a different session's s_mutex may be left held on return -
- * confirm callers expect that.
- */
-void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session)
-{
- struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
- struct ceph_mds_client *mdsc = &client->mdsc;
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
- int file_wanted, used;
- int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
- int drop_session_lock = session ? 0 : 1;
- int want, retain, revoking, flushing = 0;
- int mds = -1; /* keep track of how far we've gone through i_caps list
- to avoid an infinite loop on retry */
- struct rb_node *p;
- int tried_invalidate = 0;
- int delayed = 0, sent = 0, force_requeue = 0, num;
- int is_delayed = flags & CHECK_CAPS_NODELAY;
-
- /* if we are unmounting, flush any unused caps immediately. */
- if (mdsc->stopping)
- is_delayed = 1;
-
- spin_lock(&inode->i_lock);
-
- if (ci->i_ceph_flags & CEPH_I_FLUSH)
- flags |= CHECK_CAPS_FLUSH;
-
- /* flush snaps first time around only */
- if (!list_empty(&ci->i_cap_snaps))
- __ceph_flush_snaps(ci, &session);
- goto retry_locked;
-retry:
- spin_lock(&inode->i_lock);
-retry_locked:
- file_wanted = __ceph_caps_file_wanted(ci);
- used = __ceph_caps_used(ci);
- want = file_wanted | used;
-
- retain = want | CEPH_CAP_PIN;
- if (!mdsc->stopping && inode->i_nlink > 0) {
- if (want) {
- retain |= CEPH_CAP_ANY; /* be greedy */
- } else {
- retain |= CEPH_CAP_ANY_SHARED;
- /*
- * keep RD only if we didn't have the file open RW,
- * because then the mds would revoke it anyway to
- * journal max_size=0.
- */
- if (ci->i_max_size == 0)
- retain |= CEPH_CAP_ANY_RD;
- }
- }
-
- dout("check_caps %p file_want %s used %s dirty %s flushing %s"
- " issued %s retain %s %s%s%s\n", inode,
- ceph_cap_string(file_wanted),
- ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
- ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(__ceph_caps_issued(ci, NULL)),
- ceph_cap_string(retain),
- (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
- (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
- (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
-
- /*
- * If we no longer need to hold onto our old caps, and we may
- * have cached pages, but don't want them, then try to invalidate.
- * If we fail, it's because pages are locked.... try again later.
- */
- if ((!is_delayed || mdsc->stopping) &&
- ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
- ci->i_rdcache_gen && /* may have cached pages */
- file_wanted == 0 && /* no open files */
- !ci->i_truncate_pending &&
- !tried_invalidate) {
- u32 invalidating_gen = ci->i_rdcache_gen;
- int ret;
-
- dout("check_caps trying to invalidate on %p\n", inode);
- spin_unlock(&inode->i_lock);
- ret = invalidate_inode_pages2(&inode->i_data);
- spin_lock(&inode->i_lock);
- if (ret == 0 && invalidating_gen == ci->i_rdcache_gen) {
- /* success. */
- ci->i_rdcache_gen = 0;
- ci->i_rdcache_revoking = 0;
- } else {
- dout("check_caps failed to invalidate pages\n");
- /* we failed to invalidate pages. check these
- caps again later. */
- force_requeue = 1;
- __cap_set_timeouts(mdsc, ci);
- }
- tried_invalidate = 1; /* only attempt the invalidate once per call */
- goto retry_locked;
- }
-
- num = 0;
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- cap = rb_entry(p, struct ceph_cap, ci_node);
- num++;
-
- /* avoid looping forever */
- if (mds >= cap->mds ||
- ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
- continue;
-
- /* NOTE: no side-effects allowed, until we take s_mutex */
-
- revoking = cap->implemented & ~cap->issued;
- if (revoking)
- dout("mds%d revoking %s\n", cap->mds,
- ceph_cap_string(revoking));
-
- if (cap == ci->i_auth_cap &&
- (cap->issued & CEPH_CAP_FILE_WR)) {
- /* request larger max_size from MDS? */
- if (ci->i_wanted_max_size > ci->i_max_size &&
- ci->i_wanted_max_size > ci->i_requested_max_size) {
- dout("requesting new max_size\n");
- goto ack;
- }
-
- /* approaching file_max? */
- if ((inode->i_size << 1) >= ci->i_max_size &&
- (ci->i_reported_size << 1) < ci->i_max_size) {
- dout("i_size approaching max_size\n");
- goto ack;
- }
- }
- /* flush anything dirty? */
- if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
- ci->i_dirty_caps) {
- dout("flushing dirty caps\n");
- goto ack;
- }
-
- /* completed revocation? going down and there are no caps? */
- if (revoking && (revoking & used) == 0) {
- dout("completed revocation of %s\n",
- ceph_cap_string(cap->implemented & ~cap->issued));
- goto ack;
- }
-
- /* want more caps from mds? */
- if (want & ~(cap->mds_wanted | cap->issued))
- goto ack;
-
- /* things we might delay */
- if ((cap->issued & ~retain) == 0 &&
- cap->mds_wanted == want)
- continue; /* nope, all good */
-
- if (is_delayed)
- goto ack;
-
- /* delay? */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_max)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- delayed++;
- continue;
- }
-
-ack:
- if (session && session != cap->session) {
- dout("oops, wrong session %p mutex\n", session);
- mutex_unlock(&session->s_mutex);
- session = NULL;
- }
- if (!session) {
- session = cap->session;
- if (mutex_trylock(&session->s_mutex) == 0) {
- dout("inverting session/ino locks on %p\n",
- session);
- spin_unlock(&inode->i_lock);
- if (took_snap_rwsem) {
- up_read(&mdsc->snap_rwsem);
- took_snap_rwsem = 0;
- }
- mutex_lock(&session->s_mutex);
- goto retry;
- }
- }
- /* take snap_rwsem after session mutex */
- if (!took_snap_rwsem) {
- if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
- dout("inverting snap/in locks on %p\n",
- inode);
- spin_unlock(&inode->i_lock);
- down_read(&mdsc->snap_rwsem);
- took_snap_rwsem = 1;
- goto retry;
- }
- took_snap_rwsem = 1;
- }
-
- if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
- /* update dirty, flushing bits */
- flushing = ci->i_dirty_caps;
- dout(" flushing %s, flushing_caps %s -> %s\n",
- ceph_cap_string(flushing),
- ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps | flushing));
- ci->i_flushing_caps |= flushing;
- ci->i_dirty_caps = 0;
- __mark_caps_flushing(inode, session);
- }
-
- mds = cap->mds; /* remember mds, so we don't repeat */
- sent++;
-
- /* __send_cap drops i_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
- retain, flushing, NULL);
- goto retry; /* retake i_lock and restart our cap scan. */
- }
-
- /*
- * Reschedule delayed caps release if we delayed anything,
- * otherwise cancel.
- */
- if (delayed && is_delayed)
- force_requeue = 1; /* __send_cap delayed release; requeue */
- if (!delayed && !is_delayed)
- __cap_delay_cancel(mdsc, ci);
- else if (!is_delayed || force_requeue)
- __cap_delay_requeue(mdsc, ci);
-
- spin_unlock(&inode->i_lock);
-
- if (session && drop_session_lock)
- mutex_unlock(&session->s_mutex);
- if (took_snap_rwsem)
- up_read(&mdsc->snap_rwsem);
-}
-
-/*
- * Record the cap bits in @mask as dirty on @ci.
- *
- * If the inode had nothing dirty beforehand (per __ceph_caps_dirty()),
- * it is put on the global mdsc->cap_dirty list, an inode reference is
- * taken with igrab(), and the VFS inode is marked I_DIRTY_SYNC.  If
- * CEPH_CAP_FILE_BUFFER was already dirty and is in @mask again, the
- * VFS inode is also marked I_DIRTY_DATASYNC.  The inode is always
- * requeued for delayed cap handling.
- *
- * Returns the dirty cap state as it was before this call.
- */
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
-{
-	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
-	struct inode *inode = &ci->vfs_inode;
-	int prev_dirty = __ceph_caps_dirty(ci);
-	int vfs_dirty = 0;
-
-	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
-	     ceph_cap_string(mask), ceph_cap_string(ci->i_dirty_caps),
-	     ceph_cap_string(ci->i_dirty_caps | mask));
-	ci->i_dirty_caps |= mask;
-
-	if (prev_dirty == 0) {
-		/* first dirty bit on this inode: track it globally */
-		dout(" inode %p now dirty\n", &ci->vfs_inode);
-		spin_lock(&mdsc->cap_dirty_lock);
-		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
-		spin_unlock(&mdsc->cap_dirty_lock);
-		igrab(inode);
-		vfs_dirty |= I_DIRTY_SYNC;
-	}
-
-	if ((mask & CEPH_CAP_FILE_BUFFER) &&
-	    (prev_dirty & CEPH_CAP_FILE_BUFFER))
-		vfs_dirty |= I_DIRTY_DATASYNC;
-
-	if (vfs_dirty != 0)
-		__mark_inode_dirty(inode, vfs_dirty);
-
-	__cap_delay_requeue(mdsc, ci);
-	return prev_dirty;
-}
-
-/*
- * Try to flush dirty caps back to the auth mds.
- *
- * Parameters:
- *   inode     - inode whose dirty caps are flushed.
- *   session   - optional. If NULL, the auth cap's session s_mutex is
- *               taken here (after dropping i_lock, then the check is
- *               restarted) and released before returning. If
- *               non-NULL it must be the auth cap's session (BUG_ON)
- *               and is left locked.
- *   flush_tid - passed to __send_cap(), which stores the flush tid
- *               there when it sends a flush.
- *
- * Nothing is sent if there are no dirty caps, no auth cap, or the
- * session state is below CEPH_MDS_SESSION_OPEN. Otherwise all of
- * i_dirty_caps is moved into i_flushing_caps and a CEPH_CAP_OP_FLUSH
- * message is sent; if __send_cap() reports a delay the inode is
- * requeued with __cap_delay_requeue().
- *
- * Returns the cap bits that were moved to flushing by this call, or 0
- * if nothing was flushed.
- */
-static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
- unsigned *flush_tid)
-{
- struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int unlock_session = session ? 0 : 1;
- int flushing = 0;
-
-retry:
- spin_lock(&inode->i_lock);
- if (ci->i_dirty_caps && ci->i_auth_cap) {
- struct ceph_cap *cap = ci->i_auth_cap;
- int used = __ceph_caps_used(ci);
- int want = __ceph_caps_wanted(ci);
- int delayed;
-
- if (!session) {
- spin_unlock(&inode->i_lock);
- session = cap->session;
- mutex_lock(&session->s_mutex);
- goto retry; /* state may have changed while i_lock was dropped; re-check */
- }
- BUG_ON(session != cap->session);
- if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
- goto out;
-
- __mark_caps_flushing(inode, session);
-
- flushing = ci->i_dirty_caps;
- dout(" flushing %s, flushing_caps %s -> %s\n",
- ceph_cap_string(flushing),
- ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps | flushing));
- ci->i_flushing_caps |= flushing;
- ci->i_dirty_caps = 0;
-
- /* __send_cap drops i_lock */
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
- cap->issued | cap->implemented, flushing,
- flush_tid);
- if (!delayed)
- goto out_unlocked;
-
- spin_lock(&inode->i_lock);
- __cap_delay_requeue(mdsc, ci);
- }
-out:
- spin_unlock(&inode->i_lock);
-out_unlocked:
- if (session && unlock_session)
- mutex_unlock(&session->s_mutex);
- return flushing;
-}
-
-/*
- * Return true if we've flushed caps through the given flush_tid.
- *
- * Under i_lock, look at every cap bit still set in ci->i_flushing_caps.
- * If any such bit has a recorded flush tid (ci->i_cap_flush_tid[bit])
- * that is <= @tid, that flush has not been acked yet and we return 0.
- * Otherwise return 1.
- */
-static int caps_are_flushed(struct inode *inode, unsigned tid)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int i, ret = 1;
-
-	spin_lock(&inode->i_lock);
-	for (i = 0; i < CEPH_CAP_BITS; i++) {
-		if ((ci->i_flushing_caps & (1 << i)) &&
-		    ci->i_cap_flush_tid[i] <= tid) {
-			/* still flushing this bit */
-			ret = 0;
-			break;
-		}
-	}
-	spin_unlock(&inode->i_lock);
-	return ret;
-}
-
-/*
- * Wait for unsafe (not yet committed) writes on the given inode.
- *
- * The tid of the newest request on i_unsafe_writes at entry is taken
- * as an upper bound.  We first wait on that newest request, and then
- * keep waiting on whatever request is at the head of the list for as
- * long as that request is older than the bound.
- *
- * i_unsafe_lock protects the list; it is dropped while sleeping, with
- * a reference held on the request being waited for.
- */
-static void sync_write_wait(struct inode *inode)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct list_head *head = &ci->i_unsafe_writes;
-	struct ceph_osd_request *req;
-	u64 last_tid;
-
-	spin_lock(&ci->i_unsafe_lock);
-	if (!list_empty(head)) {
-		/* set upper bound as _last_ entry in chain */
-		req = list_entry(head->prev, struct ceph_osd_request,
-				 r_unsafe_item);
-		last_tid = req->r_tid;
-
-		for (;;) {
-			ceph_osdc_get_request(req);
-			spin_unlock(&ci->i_unsafe_lock);
-			dout("sync_write_wait on tid %llu (until %llu)\n",
-			     req->r_tid, last_tid);
-			wait_for_completion(&req->r_safe_completion);
-			spin_lock(&ci->i_unsafe_lock);
-			ceph_osdc_put_request(req);
-
-			/*
-			 * from here on look at first entry in chain, since
-			 * we only want to wait for anything older than
-			 * last_tid
-			 */
-			if (list_empty(head))
-				break;
-			req = list_entry(head->next, struct ceph_osd_request,
-					 r_unsafe_item);
-			if (!(req->r_tid < last_tid))
-				break;
-		}
-	}
-	spin_unlock(&ci->i_unsafe_lock);
-}
-
-/*
- * fsync: wait for unsafe sync writes, write back and wait on dirty
- * pages, then try to flush dirty caps to the mds.
- *
- * Returns a negative error from page writeback, or otherwise the
- * result of the (interruptible) wait for the cap flush, if we waited.
- */
-int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	unsigned flush_tid;
-	int flushed;
-	int ret;
-
-	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
-	sync_write_wait(inode);
-
-	ret = filemap_write_and_wait(inode->i_mapping);
-	if (ret < 0)
-		return ret;
-
-	flushed = try_flush_caps(inode, NULL, &flush_tid);
-	dout("fsync dirty caps are %s\n", ceph_cap_string(flushed));
-
-	/*
-	 * only wait on non-file metadata writeback (the mds
-	 * can recover size and mtime, so we don't need to
-	 * wait for that)
-	 */
-	if (datasync || !(flushed & ~CEPH_CAP_ANY_FILE_WR))
-		goto done;
-
-	dout("fsync waiting for flush_tid %u\n", flush_tid);
-	ret = wait_event_interruptible(ci->i_cap_wq,
-				       caps_are_flushed(inode, flush_tid));
-done:
-	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
-	return ret;
-}
-
-/*
- * Write back dirty caps for an inode.
- *
- * When @wait is set, flush the dirty caps now and sleep
- * (interruptibly) until the flush is acknowledged.  When it is not
- * set, only move a dirty inode to the front of the delayed-cap queue:
- * we can get by with fewer MDS messages if data writeback is allowed
- * to complete first.
- *
- * Returns 0, or the result of the interruptible wait.
- */
-int ceph_write_inode(struct inode *inode, int wait)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	unsigned flush_tid;
-	int err = 0;
-	int dirty;
-
-	dout("write_inode %p wait=%d\n", inode, wait);
-
-	if (!wait) {
-		struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
-
-		spin_lock(&inode->i_lock);
-		if (__ceph_caps_dirty(ci))
-			__cap_delay_requeue_front(mdsc, ci);
-		spin_unlock(&inode->i_lock);
-		return err;
-	}
-
-	dirty = try_flush_caps(inode, NULL, &flush_tid);
-	if (!dirty)
-		return err;
-
-	err = wait_event_interruptible(ci->i_cap_wq,
-				       caps_are_flushed(inode, flush_tid));
-	return err;
-}
-
-/*
- * After a recovering MDS goes active, we need to resend any cap snaps
- * we were flushing.
- *
- * For every cap_snap on the session's s_cap_snaps_flushing list whose
- * inode still has its auth cap on this session, re-run
- * __ceph_flush_snaps() for that inode; otherwise log an error.
- *
- * i_lock is taken for each inode and released again on both paths:
- * __ceph_flush_snaps() returns with i_lock held, so the unlock must
- * follow it as well.
- *
- * Caller holds session->s_mutex.
- */
-static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
-				   struct ceph_mds_session *session)
-{
-	struct ceph_cap_snap *capsnap;
-
-	dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
-	list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
-			    flushing_item) {
-		struct ceph_inode_info *ci = capsnap->ci;
-		struct inode *inode = &ci->vfs_inode;
-		struct ceph_cap *cap;
-
-		spin_lock(&inode->i_lock);
-		cap = ci->i_auth_cap;
-		if (cap && cap->session == session) {
-			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
-			     cap, capsnap);
-			__ceph_flush_snaps(ci, &session);
-		} else {
-			pr_err("%p auth cap %p not mds%d ???\n", inode,
-			       cap, session->s_mds);
-		}
-		/* both branches reach here with i_lock held */
-		spin_unlock(&inode->i_lock);
-	}
-}
-
-/*
- * Resend in-progress flushes for a session: first the cap snaps, then
- * a CEPH_CAP_OP_FLUSH for every inode on the session's s_cap_flushing
- * list whose auth cap is on this session.  Inodes whose auth cap is
- * missing or belongs to another session are logged and skipped.
- */
-void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
-			     struct ceph_mds_session *session)
-{
-	struct ceph_inode_info *ci;
-
-	kick_flushing_capsnaps(mdsc, session);
-
-	dout("kick_flushing_caps mds%d\n", session->s_mds);
-	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
-		struct inode *inode = &ci->vfs_inode;
-		struct ceph_cap *cap;
-		int delayed = 0;
-
-		spin_lock(&inode->i_lock);
-		cap = ci->i_auth_cap;
-		if (!cap || cap->session != session) {
-			pr_err("%p auth cap %p not mds%d ???\n", inode,
-			       cap, session->s_mds);
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-
-		dout("kick_flushing_caps %p cap %p %s\n", inode,
-		     cap, ceph_cap_string(ci->i_flushing_caps));
-		/* __send_cap drops i_lock */
-		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-				     __ceph_caps_used(ci),
-				     __ceph_caps_wanted(ci),
-				     cap->issued | cap->implemented,
-				     ci->i_flushing_caps, NULL);
-		if (!delayed)
-			continue;
-
-		spin_lock(&inode->i_lock);
-		__cap_delay_requeue(mdsc, ci);
-		spin_unlock(&inode->i_lock);
-	}
-}
-
-
-/*
- * Bump the per-inode reference count for each cap bit in @got, so
- * that the corresponding capabilities are not released to the MDS
- * while in use.  The first FILE_BUFFER reference also takes an inode
- * reference via igrab().
- *
- * Protected by i_lock.
- */
-static void __take_cap_refs(struct ceph_inode_info *ci, int got)
-{
-	if (got & CEPH_CAP_PIN)
-		ci->i_pin_ref += 1;
-	if (got & CEPH_CAP_FILE_RD)
-		ci->i_rd_ref += 1;
-	if (got & CEPH_CAP_FILE_CACHE)
-		ci->i_rdcache_ref += 1;
-	if (got & CEPH_CAP_FILE_WR)
-		ci->i_wr_ref += 1;
-
-	if (!(got & CEPH_CAP_FILE_BUFFER))
-		return;
-
-	if (!ci->i_wrbuffer_ref)
-		igrab(&ci->vfs_inode);
-	ci->i_wrbuffer_ref += 1;
-	dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
-	     &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
-}
-
-/*
- * Try to grab cap references. Specify those refs we @want, and the
- * minimal set we @need. Also include the larger offset we are writing
- * to (when applicable), and check against max_size here as well.
- * Note that caller is responsible for ensuring max_size increases are
- * requested from the MDS.
- *
- * Used as the condition of a wait in ceph_get_caps(), so the return
- * value means "stop waiting":
- *   1 - references were taken (*got is set), or the inode has no caps
- *       at all (*err = -EBADF), or the write extends past
- *       i_wanted_max_size (*check_max = 1).
- *   0 - not satisfiable right now: endoff is beyond i_max_size but
- *       within i_wanted_max_size, a cap snap is pending, the needed
- *       caps are not all held, or a wanted cap is being revoked.
- *
- * Takes and releases i_lock.
- *
- * NOTE(review): the final debug line prints *got even on paths that
- * never assigned it - confirm callers initialize *got.
- */
-static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
- int *got, loff_t endoff, int *check_max, int *err)
-{
- struct inode *inode = &ci->vfs_inode;
- int ret = 0;
- int have, implemented;
-
- dout("get_cap_refs %p need %s want %s\n", inode,
- ceph_cap_string(need), ceph_cap_string(want));
- spin_lock(&inode->i_lock);
-
- /* make sure we _have_ some caps! */
- if (!__ceph_is_any_caps(ci)) {
- dout("get_cap_refs %p no real caps\n", inode);
- *err = -EBADF;
- ret = 1;
- goto out;
- }
-
- if (need & CEPH_CAP_FILE_WR) {
- if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
- dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
- inode, endoff, ci->i_max_size);
- if (endoff > ci->i_wanted_max_size) {
- *check_max = 1;
- ret = 1;
- }
- goto out;
- }
- /*
- * If a sync write is in progress, we must wait, so that we
- * can get a final snapshot value for size+mtime.
- */
- if (__ceph_have_pending_cap_snap(ci)) {
- dout("get_cap_refs %p cap_snap_pending\n", inode);
- goto out;
- }
- }
- have = __ceph_caps_issued(ci, &implemented);
-
- /*
- * disallow writes while a truncate is pending
- */
- if (ci->i_truncate_pending)
- have &= ~CEPH_CAP_FILE_WR;
-
- if ((have & need) == need) {
- /*
- * Look at (implemented & ~have & not) so that we keep waiting
- * on transition from wanted -> needed caps. This is needed
- * for WRBUFFER|WR -> WR to avoid a new WR sync write from
- * going before a prior buffered writeback happens.
- */
- int not = want & ~(have & need);
- int revoking = implemented & ~have;
- dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
- inode, ceph_cap_string(have), ceph_cap_string(not),
- ceph_cap_string(revoking));
- if ((revoking & not) == 0) {
- *got = need | (have & want); /* everything needed, plus whichever wanted bits we hold */
- __take_cap_refs(ci, *got);
- ret = 1;
- }
- } else {
- dout("get_cap_refs %p have %s needed %s\n", inode,
- ceph_cap_string(have), ceph_cap_string(need));
- }
-out:
- spin_unlock(&inode->i_lock);
- dout("get_cap_refs %p ret %d got %s\n", inode,
- ret, ceph_cap_string(*got));
- return ret;
-}
-
-/*
- * Compare the end offset of a write against the inode's current
- * max_size.  If the write reaches max_size (or goes beyond twice the
- * current file size) and is larger than what we have already asked
- * for, record the new wanted max_size and run a cap check on the auth
- * cap so the MDS gets told.
- */
-static void check_max_size(struct inode *inode, loff_t endoff)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int need_check;
-
-	spin_lock(&inode->i_lock);
-	/* do we need to explicitly request a larger max_size? */
-	need_check = (endoff >= ci->i_max_size ||
-		      endoff > (inode->i_size << 1)) &&
-		     endoff > ci->i_wanted_max_size;
-	if (need_check) {
-		dout("write %p at large endoff %llu, req max_size\n",
-		     inode, endoff);
-		ci->i_wanted_max_size = endoff;
-	}
-	spin_unlock(&inode->i_lock);
-
-	if (need_check)
-		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
-}
-
-/*
- * Sleep (interruptibly) until the requested cap references can be
- * taken, then take them.
- *
- * Before each wait, a positive @endoff is checked against max_size so
- * that a larger max_size is requested from the mds if necessary.  If
- * the wait ended only because max_size was too small, go around again
- * rather than getting hung up indefinitely.
- *
- * Returns the error reported by try_get_cap_refs() if any, otherwise
- * the result of the wait.
- */
-int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
-		  loff_t endoff)
-{
-	int check_max, ret, err;
-
-	do {
-		if (endoff > 0)
-			check_max_size(&ci->vfs_inode, endoff);
-		check_max = 0;
-		err = 0;
-		ret = wait_event_interruptible(ci->i_cap_wq,
-					       try_get_cap_refs(ci, need, want,
-								got, endoff,
-								&check_max,
-								&err));
-		if (err)
-			ret = err;
-	} while (check_max);
-
-	return ret;
-}
-
-/*
- * Take additional references on @caps under i_lock.  The caller must
- * already hold at least one reference on each of these caps;
- * otherwise there is no guarantee this is safe.
- */
-void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
-{
-	struct inode *inode = &ci->vfs_inode;
-
-	spin_lock(&inode->i_lock);
-	__take_cap_refs(ci, caps);
-	spin_unlock(&inode->i_lock);
-}
-
-/*
- * Drop the cap references named in @had.
- *
- * When the last reference on a cap goes away, run ceph_check_caps() so
- * the cap can be released (or its release scheduled).
- *
- * Dropping the last WR reference (end of a sync write) finalizes the
- * first cap_snap if it was still marked as being written; in that case
- * cap snaps are flushed instead and waiters on i_cap_wq are woken.
- * Dropping the last FILE_BUFFER reference also drops an inode
- * reference with iput().
- */
-void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
-{
-	struct inode *inode = &ci->vfs_inode;
-	struct ceph_cap_snap *capsnap;
-	int last = 0, put = 0, flushsnaps = 0, wake = 0;
-
-	spin_lock(&inode->i_lock);
-
-	if (had & CEPH_CAP_PIN)
-		ci->i_pin_ref--;
-
-	if ((had & CEPH_CAP_FILE_RD) && --ci->i_rd_ref == 0)
-		last = 1;
-
-	if ((had & CEPH_CAP_FILE_CACHE) && --ci->i_rdcache_ref == 0)
-		last = 1;
-
-	if (had & CEPH_CAP_FILE_BUFFER) {
-		ci->i_wrbuffer_ref--;
-		if (ci->i_wrbuffer_ref == 0) {
-			last = 1;
-			put = 1;
-		}
-		dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
-		     inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
-	}
-
-	if ((had & CEPH_CAP_FILE_WR) && --ci->i_wr_ref == 0) {
-		last = 1;
-		if (!list_empty(&ci->i_cap_snaps)) {
-			capsnap = list_first_entry(&ci->i_cap_snaps,
-						   struct ceph_cap_snap,
-						   ci_item);
-			if (capsnap->writing) {
-				capsnap->writing = 0;
-				flushsnaps = __ceph_finish_cap_snap(ci,
-								    capsnap);
-				wake = 1;
-			}
-		}
-	}
-
-	spin_unlock(&inode->i_lock);
-
-	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
-	     last ? "last" : "");
-
-	if (flushsnaps)
-		ceph_flush_snaps(ci);
-	else if (last)
-		ceph_check_caps(ci, 0, NULL);
-
-	if (wake)
-		wake_up(&ci->i_cap_wq);
-	if (put)
-		iput(inode);
-}
-
-/*
- * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
- * context. Adjust per-snap dirty page accounting as appropriate.
- * Once all dirty data for a cap_snap is flushed, flush snapped file
- * metadata back to the MDS. If we dropped the last ref, call
- * ceph_check_caps.
- *
- * @snapc is matched first against ci->i_head_snapc (head accounting,
- * i_wrbuffer_ref_head), and otherwise against the context of each
- * entry on ci->i_cap_snaps; it is a BUG if no entry matches.
- *
- * If the overall i_wrbuffer_ref reaches zero, ceph_check_caps() is
- * called for the auth cap and an inode reference is dropped with
- * iput(). Otherwise, if the matching cap_snap's dirty page count
- * reached zero, cap snaps are flushed and waiters on i_cap_wq woken.
- *
- * Takes and releases i_lock.
- */
-void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
- struct ceph_snap_context *snapc)
-{
- struct inode *inode = &ci->vfs_inode;
- int last = 0;
- int last_snap = 0;
- int found = 0;
- struct ceph_cap_snap *capsnap = NULL;
-
- spin_lock(&inode->i_lock);
- ci->i_wrbuffer_ref -= nr;
- last = !ci->i_wrbuffer_ref;
-
- if (ci->i_head_snapc == snapc) {
- ci->i_wrbuffer_ref_head -= nr;
- if (!ci->i_wrbuffer_ref_head) {
- ceph_put_snap_context(ci->i_head_snapc);
- ci->i_head_snapc = NULL;
- }
- dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
- inode,
- ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- last ? " LAST" : "");
- } else {
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->context == snapc) {
- found = 1;
- capsnap->dirty_pages -= nr;
- last_snap = !capsnap->dirty_pages;
- break;
- }
- }
- BUG_ON(!found); /* capsnap is only a valid entry below because a match was found */
- dout("put_wrbuffer_cap_refs on %p cap_snap %p "
- " snap %lld %d/%d -> %d/%d %s%s\n",
- inode, capsnap, capsnap->context->seq,
- ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
- ci->i_wrbuffer_ref, capsnap->dirty_pages,
- last ? " (wrbuffer last)" : "",
- last_snap ? " (capsnap last)" : "");
- }
-
- spin_unlock(&inode->i_lock);
-
- if (last) {
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- iput(inode);
- } else if (last_snap) {
- ceph_flush_snaps(ci);
- wake_up(&ci->i_cap_wq);
- }
-}
-
-/*
- * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
- * actually be a revocation if it specifies a smaller cap set.)
- *
- * In order, this:
- *  - makes a single synchronous attempt to drop the page cache when
- *    FILE_CACHE is being revoked and i_wrbuffer_ref is zero (i_lock is
- *    released around the attempt and retaken afterwards);
- *  - copies mode/uid/gid, nlink and the xattr blob out of the message,
- *    each only while the matching EXCL cap is absent from the
- *    issued|implemented|dirty mask;
- *  - hands size and timestamps to ceph_fill_file_size() and
- *    ceph_fill_file_time();
- *  - records a changed max_size;
- *  - rewrites grant->wanted in place when it differs from what we want;
- *  - stores the new seq, layout and issued mask on the cap.
- *
- * @inode:	inode the cap belongs to
- * @grant:	message body; ->wanted may be modified
- * @session:	session the message arrived on
- * @cap:	our cap for that session
- * @xattr_buf:	buffer installed as the xattr blob (an extra reference
- *		is taken) when the message carries a newer xattr version
- *
- * caller holds s_mutex and inode->i_lock; i_lock is released here.
- *
- * return value:
- *  0 - ok
- *  1 - check_caps on auth cap only (writeback)
- *  2 - check_caps (ack revoke)
- */
-static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
-			    struct ceph_mds_session *session,
-			    struct ceph_cap *cap,
-			    struct ceph_buffer *xattr_buf)
-	__releases(inode->i_lock)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int mds = session->s_mds;
-	int seq = le32_to_cpu(grant->seq);
-	int newcaps = le32_to_cpu(grant->caps);
-	int issued, implemented, used, wanted, dirty;
-	u64 size = le64_to_cpu(grant->size);
-	u64 max_size = le64_to_cpu(grant->max_size);
-	struct timespec mtime, atime, ctime;
-	int reply = 0;
-	int wake = 0;
-	int writeback = 0;
-	int revoked_rdcache = 0;
-	int invalidate_async = 0;
-
-	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
-	     inode, cap, mds, seq, ceph_cap_string(newcaps));
-	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
-	     inode->i_size);
-
-	/*
-	 * If CACHE is being revoked, and we have no dirty buffers,
-	 * try to invalidate (once).  (If there are dirty buffers, we
-	 * will invalidate _after_ writeback.)
-	 */
-	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
-	    !ci->i_wrbuffer_ref) {
-		int ret;
-
-		dout("CACHE invalidation\n");
-
-		/* the invalidation runs without i_lock held */
-		spin_unlock(&inode->i_lock);
-		ret = invalidate_inode_pages2(&inode->i_data);
-		spin_lock(&inode->i_lock);
-
-		if (ret < 0) {
-			/* there were locked pages.. invalidate later
-			   in a separate thread. */
-			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
-				invalidate_async = 1;
-				ci->i_rdcache_revoking = ci->i_rdcache_gen;
-			}
-		} else {
-			/* we successfully invalidated those pages */
-			revoked_rdcache = 1;
-			ci->i_rdcache_gen = 0;
-			ci->i_rdcache_revoking = 0;
-		}
-	}
-
-	/* side effects now are allowed */
-
-	issued = __ceph_caps_issued(ci, &implemented);
-	issued |= implemented | __ceph_caps_dirty(ci);
-
-	cap->gen = session->s_cap_gen;
-
-	__check_cap_issue(ci, cap, newcaps);
-
-	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
-		inode->i_mode = le32_to_cpu(grant->mode);
-		inode->i_uid = le32_to_cpu(grant->uid);
-		inode->i_gid = le32_to_cpu(grant->gid);
-		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
-		     inode->i_uid, inode->i_gid);
-	}
-
-	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
-		inode->i_nlink = le32_to_cpu(grant->nlink);
-
-	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
-		int len = le32_to_cpu(grant->xattr_len);
-		u64 version = le64_to_cpu(grant->xattr_version);
-
-		/* only ever move forward to a newer xattr version */
-		if (version > ci->i_xattrs.version) {
-			dout(" got new xattrs v%llu on %p len %d\n",
-			     version, inode, len);
-			if (ci->i_xattrs.blob)
-				ceph_buffer_put(ci->i_xattrs.blob);
-			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
-			ci->i_xattrs.version = version;
-		}
-	}
-
-	/* size/ctime/mtime/atime? */
-	ceph_fill_file_size(inode, issued,
-			    le32_to_cpu(grant->truncate_seq),
-			    le64_to_cpu(grant->truncate_size), size);
-	ceph_decode_timespec(&mtime, &grant->mtime);
-	ceph_decode_timespec(&atime, &grant->atime);
-	ceph_decode_timespec(&ctime, &grant->ctime);
-	ceph_fill_file_time(inode, issued,
-			    le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
-			    &atime);
-
-	/* max size increase? */
-	if (max_size != ci->i_max_size) {
-		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
-		ci->i_max_size = max_size;
-		if (max_size >= ci->i_wanted_max_size) {
-			ci->i_wanted_max_size = 0;  /* reset */
-			ci->i_requested_max_size = 0;
-		}
-		wake = 1;
-	}
-
-	/* check cap bits */
-	wanted = __ceph_caps_wanted(ci);
-	used = __ceph_caps_used(ci);
-	dirty = __ceph_caps_dirty(ci);
-	dout(" my wanted = %s, used = %s, dirty %s\n",
-	     ceph_cap_string(wanted),
-	     ceph_cap_string(used),
-	     ceph_cap_string(dirty));
-	if (wanted != le32_to_cpu(grant->wanted)) {
-		dout("mds wanted %s -> %s\n",
-		     ceph_cap_string(le32_to_cpu(grant->wanted)),
-		     ceph_cap_string(wanted));
-		grant->wanted = cpu_to_le32(wanted);
-	}
-
-	cap->seq = seq;
-
-	/* file layout may have changed */
-	ci->i_layout = grant->layout;
-
-	/* revocation, grant, or no-op? */
-	if (cap->issued & ~newcaps) {
-		dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
-		     ceph_cap_string(newcaps));
-		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
-			writeback = 1; /* will delay ack */
-		else if (dirty & ~newcaps)
-			reply = 1;     /* initiate writeback in check_caps */
-		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
-			 revoked_rdcache)
-			reply = 2;     /* send revoke ack in check_caps */
-		cap->issued = newcaps;
-	} else if (cap->issued == newcaps) {
-		dout("caps unchanged: %s -> %s\n",
-		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
-	} else {
-		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
-		     ceph_cap_string(newcaps));
-		cap->issued = newcaps;
-		cap->implemented |= newcaps;    /* add bits only, to
-						 * avoid stepping on a
-						 * pending revocation */
-		wake = 1;
-	}
-
-	spin_unlock(&inode->i_lock);
-
-	if (writeback) {
-		/*
-		 * queue inode for writeback: we can't actually call
-		 * filemap_write_and_wait, etc. from message handler
-		 * context.
-		 *
-		 * The inode reference that accompanies the queued work is
-		 * taken before the work is published and handed back if
-		 * the helper reports that nothing new was queued, so the
-		 * work can never run ahead of its reference.
-		 */
-		dout("queueing %p for writeback\n", inode);
-		igrab(inode);
-		if (!ceph_queue_writeback(inode))
-			iput(inode);
-	}
-	if (invalidate_async) {
-		/* same reference-before-queue ordering as above */
-		dout("queueing %p for page invalidation\n", inode);
-		igrab(inode);
-		if (!ceph_queue_page_invalidation(inode))
-			iput(inode);
-	}
-	if (wake)
-		wake_up(&ci->i_cap_wq);
-	return reply;
-}
-
-/*
- * Handle FLUSH_ACK from the MDS: metadata we sent has been committed.
- *
- * A cap bit counts as cleaned when it is set in the message's dirty
- * mask and the message's client_tid equals the tid recorded for that
- * bit in i_cap_flush_tid[].  Cleaned bits are removed from
- * i_flushing_caps.  When nothing is left flushing, the inode leaves the
- * flushing list; when it goes from dirty to fully clean, it leaves the
- * dirty list and one inode reference is dropped.
- *
- * Entered with inode->i_lock held; releases it.
- */
-static void handle_cap_flush_ack(struct inode *inode,
-				 struct ceph_mds_caps *m,
-				 struct ceph_mds_session *session,
-				 struct ceph_cap *cap)
-	__releases(inode->i_lock)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
-	u64 flush_tid = le64_to_cpu(m->client_tid);
-	unsigned seq = le32_to_cpu(m->seq);
-	int dirty = le32_to_cpu(m->dirty);
-	int cleaned = 0;
-	int old_dirty = 0, new_dirty = 0;
-	int bit;
-
-	/* collect the acked bits whose flush tid matches this ack */
-	for (bit = 0; bit < CEPH_CAP_BITS; bit++) {
-		if (!(dirty & (1 << bit)))
-			continue;
-		if (ci->i_cap_flush_tid[bit] == flush_tid)
-			cleaned |= 1 << bit;
-	}
-
-	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
-	     " flushing %s -> %s\n",
-	     inode, session->s_mds, seq, ceph_cap_string(dirty),
-	     ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
-	     ceph_cap_string(ci->i_flushing_caps & ~cleaned));
-
-	/* only do the bookkeeping if the ack actually clears something */
-	if (ci->i_flushing_caps != (ci->i_flushing_caps & ~cleaned)) {
-		old_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
-		ci->i_flushing_caps &= ~cleaned;
-		new_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
-
-		spin_lock(&mdsc->cap_dirty_lock);
-		if (ci->i_flushing_caps == 0) {
-			list_del_init(&ci->i_flushing_item);
-			if (!list_empty(&session->s_cap_flushing))
-				dout(" mds%d still flushing cap on %p\n",
-				     session->s_mds,
-				     &list_entry(session->s_cap_flushing.next,
-						 struct ceph_inode_info,
-						 i_flushing_item)->vfs_inode);
-			mdsc->num_cap_flushing--;
-			wake_up(&mdsc->cap_flushing_wq);
-			dout(" inode %p now !flushing\n", inode);
-		}
-		if (old_dirty && !new_dirty) {
-			dout(" inode %p now clean\n", inode);
-			list_del_init(&ci->i_dirty_item);
-		}
-		spin_unlock(&mdsc->cap_dirty_lock);
-		wake_up(&ci->i_cap_wq);
-	}
-
-	spin_unlock(&inode->i_lock);
-
-	/* the reference is dropped only after i_lock has been released */
-	if (old_dirty && !new_dirty)
-		iput(inode);
-}
-
-/*
- * Handle FLUSHSNAP_ACK.  The MDS has flushed snap data to disk, so the
- * matching cap_snap can be thrown away.
- *
- * The cap_snap is looked up by its 'follows' value.  It is removed only
- * if its flush_tid equals the tid carried by the ack; on a tid mismatch
- * the search stops and nothing is removed.  Removing a cap_snap drops
- * one inode reference, after i_lock has been released.
- *
- * Caller hold s_mutex.
- */
-static void handle_cap_flushsnap_ack(struct inode *inode,
-				     struct ceph_mds_caps *m,
-				     struct ceph_mds_session *session)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	u64 flush_tid = le64_to_cpu(m->client_tid);
-	u64 follows = le64_to_cpu(m->snap_follows);
-	struct ceph_cap_snap *capsnap;
-	int drop = 0;
-
-	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
-	     inode, ci, session->s_mds, follows);
-
-	spin_lock(&inode->i_lock);
-	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
-		if (capsnap->follows != follows) {
-			dout(" skipping cap_snap %p follows %lld\n",
-			     capsnap, capsnap->follows);
-			continue;
-		}
-
-		/* right snap, but is this the flush we are waiting on? */
-		if (capsnap->flush_tid != flush_tid) {
-			dout(" cap_snap %p follows %lld tid %lld !="
-			     " %lld\n", capsnap, follows,
-			     flush_tid, capsnap->flush_tid);
-			break;
-		}
-
-		WARN_ON(capsnap->dirty_pages || capsnap->writing);
-		dout(" removing cap_snap %p follows %lld\n",
-		     capsnap, follows);
-		ceph_put_snap_context(capsnap->context);
-		list_del(&capsnap->ci_item);
-		list_del(&capsnap->flushing_item);
-		ceph_put_cap_snap(capsnap);
-		drop = 1;
-		break;
-	}
-	spin_unlock(&inode->i_lock);
-
-	if (drop)
-		iput(inode);
-}
-
-/*
- * Handle TRUNC from MDS, indicating file truncation.
- *
- * The truncate seq/size and file size from the message are passed to
- * ceph_fill_file_size() together with the issued|implemented|dirty cap
- * mask.  When that helper returns nonzero, the inode's vmtruncate work
- * is queued on the client's trunc_wq.
- *
- * caller hold s_mutex and inode->i_lock; i_lock is released here.
- */
-static void handle_cap_trunc(struct inode *inode,
-			     struct ceph_mds_caps *trunc,
-			     struct ceph_mds_session *session)
-	__releases(inode->i_lock)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int mds = session->s_mds;
-	int seq = le32_to_cpu(trunc->seq);
-	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
-	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
-	u64 size = le64_to_cpu(trunc->size);
-	int implemented = 0;
-	int dirty = __ceph_caps_dirty(ci);
-	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
-	int queue_trunc = 0;
-
-	issued |= implemented | dirty;
-
-	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
-	     inode, mds, seq, truncate_size, truncate_seq);
-	queue_trunc = ceph_fill_file_size(inode, issued,
-					  truncate_seq, truncate_size, size);
-	spin_unlock(&inode->i_lock);
-
-	if (queue_trunc) {
-		/*
-		 * Take the inode reference that accompanies the work item
-		 * before the item is published: once queue_work() returns
-		 * the work may already be running.  If the item was
-		 * already pending, queue_work() returns 0 and the extra
-		 * reference is handed straight back.
-		 */
-		igrab(inode);
-		if (!queue_work(ceph_client(inode->i_sb)->trunc_wq,
-				&ci->i_vmtruncate_work))
-			iput(inode);
-	}
-}
-
-/*
- * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
- * different one.
- *
- * Every cap on the inode is scanned.  If none carries a higher mseq
- * than this message, the exporting mds, its mseq and the bits it had
- * issued are recorded on the inode.  The cap held from the exporting
- * mds is then removed.  Having no cap from that mds at all triggers a
- * warning.
- *
- * caller holds s_mutex
- */
-static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
-			      struct ceph_mds_session *session)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	unsigned mseq = le32_to_cpu(ex->migrate_seq);
-	int mds = session->s_mds;
-	struct ceph_cap *cap = NULL;
-	struct rb_node *node;
-	int remember = 1;
-
-	dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
-	     inode, ci, mds, mseq);
-
-	spin_lock(&inode->i_lock);
-
-	/* make sure we haven't seen a higher mseq */
-	node = rb_first(&ci->i_caps);
-	while (node) {
-		struct ceph_cap *t = rb_entry(node, struct ceph_cap, ci_node);
-
-		if (ceph_seq_cmp(t->mseq, mseq) > 0) {
-			dout(" higher mseq on cap from mds%d\n",
-			     t->session->s_mds);
-			remember = 0;
-		}
-		if (t->session->s_mds == mds)
-			cap = t;
-		node = rb_next(node);
-	}
-
-	/* nothing to migrate if we never held a cap from this mds */
-	if (WARN_ON(!cap))
-		goto out;
-
-	if (remember) {
-		/* make note */
-		ci->i_cap_exporting_mds = mds;
-		ci->i_cap_exporting_mseq = mseq;
-		ci->i_cap_exporting_issued = cap->issued;
-	}
-	__ceph_remove_cap(cap, NULL);
-
-out:
-	spin_unlock(&inode->i_lock);
-}
-
-/*
- * Handle cap IMPORT.
- *
- * If the inode still remembers an EXPORT whose mseq is older than this
- * message's, that record is cleared.  The snap trace is then applied
- * with snap_rwsem held for write; the lock is downgraded to read while
- * the cap is added as the auth cap and a flush is attempted.
- *
- * @snaptrace and @snaptrace_len delimit the encoded snap trace: the
- * range [snaptrace, snaptrace + snaptrace_len) is what gets handed to
- * ceph_update_snap_trace().
- *
- * caller holds s_mutex.
- */
-static void handle_cap_import(struct ceph_mds_client *mdsc,
-			      struct inode *inode, struct ceph_mds_caps *im,
-			      struct ceph_mds_session *session,
-			      void *snaptrace, int snaptrace_len)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	u64 cap_id = le64_to_cpu(im->cap_id);
-	u64 realmino = le64_to_cpu(im->realm);
-	unsigned mseq = le32_to_cpu(im->migrate_seq);
-	unsigned seq = le32_to_cpu(im->seq);
-	unsigned wanted = le32_to_cpu(im->wanted);
-	unsigned issued = le32_to_cpu(im->caps);
-	int mds = session->s_mds;
-	int stale_export;
-
-	/* is there a remembered export that this import supersedes? */
-	stale_export = ci->i_cap_exporting_mds >= 0 &&
-		ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0;
-
-	if (!stale_export) {
-		dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
-		     inode, ci, mds, mseq);
-	} else {
-		dout("handle_cap_import inode %p ci %p mds%d mseq %d"
-		     " - cleared exporting from mds%d\n",
-		     inode, ci, mds, mseq,
-		     ci->i_cap_exporting_mds);
-		ci->i_cap_exporting_issued = 0;
-		ci->i_cap_exporting_mseq = 0;
-		ci->i_cap_exporting_mds = -1;
-	}
-
-	/* write lock for the snap update, read lock for the rest */
-	down_write(&mdsc->snap_rwsem);
-	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
-			       false);
-	downgrade_write(&mdsc->snap_rwsem);
-
-	ceph_add_cap(inode, session, cap_id, -1,
-		     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
-		     NULL /* no caps context */);
-	try_flush_caps(inode, session, NULL);
-
-	up_read(&mdsc->snap_rwsem);
-}
-
-/*
- * Handle a caps message from the MDS.
- *
- * Identify the appropriate session, inode, and call the right handler
- * based on the cap op.
- *
- * A message whose front is shorter than struct ceph_mds_caps is
- * rejected as corrupt before any lock is taken.  Otherwise s_mutex is
- * held for the duration of the dispatch and the session's s_seq is
- * bumped.  A message for an inode we do not have is ignored.
- *
- * FLUSHSNAP_ACK, EXPORT and IMPORT are handled without first looking
- * up a cap.  GRANT/REVOKE, FLUSH_ACK and TRUNC require an existing cap
- * from the sending mds and are entered with i_lock held; each of those
- * handlers releases i_lock itself.
- */
-void ceph_handle_caps(struct ceph_mds_session *session,
-		      struct ceph_msg *msg)
-{
-	struct ceph_mds_client *mdsc = session->s_mdsc;
-	struct super_block *sb = mdsc->client->sb;
-	struct inode *inode;
-	struct ceph_cap *cap;
-	struct ceph_mds_caps *h;
-	int mds = le64_to_cpu(msg->hdr.src.name.num);
-	int op;
-	u32 seq;
-	struct ceph_vino vino;
-	int check_caps = 0;
-	int r;
-
-	dout("handle_caps from mds%d\n", mds);
-
-	/* decode */
-	if (msg->front.iov_len < sizeof(*h))
-		goto bad;
-	h = msg->front.iov_base;
-	op = le32_to_cpu(h->op);
-	vino.ino = le64_to_cpu(h->ino);
-	vino.snap = CEPH_NOSNAP;
-	seq = le32_to_cpu(h->seq);
-
-	mutex_lock(&session->s_mutex);
-	session->s_seq++;
-	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
-	     (unsigned)seq);
-
-	/* lookup ino */
-	inode = ceph_find_inode(sb, vino);
-	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
-	     vino.snap, inode);
-	if (!inode) {
-		dout(" i don't have ino %llx\n", vino.ino);
-		goto done;
-	}
-
-	/* these will work even if we don't have a cap yet */
-	switch (op) {
-	case CEPH_CAP_OP_FLUSHSNAP_ACK:
-		handle_cap_flushsnap_ack(inode, h, session);
-		goto done;
-
-	case CEPH_CAP_OP_EXPORT:
-		handle_cap_export(inode, h, session);
-		goto done;
-
-	case CEPH_CAP_OP_IMPORT:
-		/*
-		 * NOTE(review): msg->middle is handed to handle_cap_grant()
-		 * below as a struct ceph_buffer *, yet here it is passed as
-		 * the void * that handle_cap_import() treats as the first
-		 * byte of the encoded snap trace.  Confirm where the trace
-		 * bytes really live and that snap_trace_len is bounded by
-		 * that region.
-		 */
-		handle_cap_import(mdsc, inode, h, session,
-				  msg->middle,
-				  le32_to_cpu(h->snap_trace_len));
-		check_caps = 1; /* we may have sent a RELEASE to the old auth */
-		goto done;
-	}
-
-	/* the rest require a cap */
-	spin_lock(&inode->i_lock);
-	cap = __get_cap_for_mds(ceph_inode(inode), mds);
-	if (!cap) {
-		dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
-		     inode, ceph_ino(inode), ceph_snap(inode), mds);
-		spin_unlock(&inode->i_lock);
-		goto done;
-	}
-
-	/* note that each of these drops i_lock for us */
-	switch (op) {
-	case CEPH_CAP_OP_REVOKE:
-	case CEPH_CAP_OP_GRANT:
-		/* the return code selects which follow-up check is run */
-		r = handle_cap_grant(inode, h, session, cap, msg->middle);
-		if (r == 1)
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
-					session);
-		else if (r == 2)
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY,
-					session);
-		break;
-
-	case CEPH_CAP_OP_FLUSH_ACK:
-		handle_cap_flush_ack(inode, h, session, cap);
-		break;
-
-	case CEPH_CAP_OP_TRUNC:
-		handle_cap_trunc(inode, h, session);
-		break;
-
-	default:
-		/* no handler ran, so i_lock is still ours to release */
-		spin_unlock(&inode->i_lock);
-		pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
-		       ceph_cap_op_name(op));
-	}
-
-done:
-	mutex_unlock(&session->s_mutex);
-
-	/* the post-IMPORT check runs without s_mutex and without a session */
-	if (check_caps)
-		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
-	if (inode)
-		iput(inode);
-	return;
-
-bad:
-	pr_err("ceph_handle_caps: corrupt message\n");
-	return;
-}
-
-/*
- * Delayed work handler to process the head of the delayed cap release
- * list.
- *
- * Entries are taken from the front of cap_delay_list one at a time.
- * Processing stops at the first entry that neither has CEPH_I_FLUSH
- * set nor has reached its i_hold_caps_max deadline.  Each entry taken
- * is unlinked and passed to ceph_check_caps() with cap_delay_lock
- * released.
- *
- * @flushdirty: nonzero adds CHECK_CAPS_FLUSH to the flags used.
- */
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc, int flushdirty)
-{
-	struct ceph_inode_info *ci;
-	int flags = flushdirty ? CHECK_CAPS_NODELAY | CHECK_CAPS_FLUSH
-			       : CHECK_CAPS_NODELAY;
-
-	dout("check_delayed_caps\n");
-
-	spin_lock(&mdsc->cap_delay_lock);
-	while (!list_empty(&mdsc->cap_delay_list)) {
-		ci = list_first_entry(&mdsc->cap_delay_list,
-				      struct ceph_inode_info,
-				      i_cap_delay_list);
-
-		/* not flagged for flush and not yet due: stop here */
-		if (!(ci->i_ceph_flags & CEPH_I_FLUSH) &&
-		    time_before(jiffies, ci->i_hold_caps_max))
-			break;
-
-		list_del_init(&ci->i_cap_delay_list);
-
-		/* the cap check itself runs without the list lock */
-		spin_unlock(&mdsc->cap_delay_lock);
-		dout("check_delayed_caps on %p\n", &ci->vfs_inode);
-		ceph_check_caps(ci, flags, NULL);
-		spin_lock(&mdsc->cap_delay_lock);
-	}
-	spin_unlock(&mdsc->cap_delay_lock);
-}
-
-/*
- * Drop one open-file reference for the given file mode.
- *
- * The per-mode counter i_nr_by_mode[fmode] is decremented under i_lock
- * (it is a BUG for it to be zero already).  If it reaches zero and the
- * inode is not a snapshot inode (snap == CEPH_NOSNAP), ceph_check_caps()
- * is called so that caps may be released to the MDS (or their delayed
- * release scheduled).
- */
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
-{
-	struct inode *inode = &ci->vfs_inode;
-	int last;
-
-	spin_lock(&inode->i_lock);
-	dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
-	     ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
-	BUG_ON(ci->i_nr_by_mode[fmode] == 0);
-	ci->i_nr_by_mode[fmode]--;
-	last = (ci->i_nr_by_mode[fmode] == 0);
-	spin_unlock(&inode->i_lock);
-
-	if (!last || ci->i_vino.snap != CEPH_NOSNAP)
-		return;
-
-	ceph_check_caps(ci, 0, NULL);
-}
-
-/*
- * Helpers for embedding cap and dentry lease releases into mds
- * requests.
- *
- * ceph_encode_inode_release() writes one release record for @inode at
- * *p and advances *p past it.
- *
- * @p:      in/out cursor into the buffer being encoded
- * @inode:  inode whose cap (from @mds) is being released
- * @mds:    mds the request is going to
- * @drop:   cap bits to drop
- * @unless: the drop is skipped if any of these bits is issued
- * @force:  is used by dentry_release (below) to force inclusion of a
- *          record for the directory inode, even when there aren't any
- *          caps to drop.
- *
- * The drop happens only when at least one @drop bit is issued and no
- * @unless bit is.  In that case the bits are cleared from the cap's
- * issued and implemented masks, and, when CEPH_I_NODELAY is set on the
- * inode, mds_wanted is trimmed to what is currently wanted.
- *
- * Returns 1 if a record was written, 0 otherwise (no valid cap from
- * @mds, or nothing to drop and !@force).
- */
-int ceph_encode_inode_release(void **p, struct inode *inode,
-			      int mds, int drop, int unless, int force)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_cap *cap;
-	struct ceph_mds_request_release *rel = *p;
-	int dropping;
-	int ret = 0;
-
-	dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
-	     mds, ceph_cap_string(drop), ceph_cap_string(unless));
-
-	spin_lock(&inode->i_lock);
-	cap = __get_cap_for_mds(ci, mds);
-	if (!cap || !__cap_is_valid(cap))
-		goto out;
-
-	/* evaluated once; decides both whether and what to encode */
-	dropping = (cap->issued & drop) && (cap->issued & unless) == 0;
-
-	if (!dropping && !force) {
-		dout("encode_inode_release %p cap %p %s\n",
-		     inode, cap, ceph_cap_string(cap->issued));
-		goto out;
-	}
-
-	if (dropping) {
-		dout("encode_inode_release %p cap %p %s -> "
-		     "%s\n", inode, cap,
-		     ceph_cap_string(cap->issued),
-		     ceph_cap_string(cap->issued & ~drop));
-		cap->issued &= ~drop;
-		cap->implemented &= ~drop;
-		if (ci->i_ceph_flags & CEPH_I_NODELAY) {
-			int wanted = __ceph_caps_wanted(ci);
-
-			dout(" wanted %s -> %s (act %s)\n",
-			     ceph_cap_string(cap->mds_wanted),
-			     ceph_cap_string(cap->mds_wanted &
-					     ~wanted),
-			     ceph_cap_string(wanted));
-			cap->mds_wanted &= wanted;
-		}
-	} else {
-		dout("encode_inode_release %p cap %p %s"
-		     " (force)\n", inode, cap,
-		     ceph_cap_string(cap->issued));
-	}
-
-	/* the record reflects the cap state after any drop above */
-	rel->ino = cpu_to_le64(ceph_ino(inode));
-	rel->cap_id = cpu_to_le64(cap->cap_id);
-	rel->seq = cpu_to_le32(cap->seq);
-	rel->issue_seq = cpu_to_le32(cap->issue_seq);
-	rel->mseq = cpu_to_le32(cap->mseq);
-	rel->caps = cpu_to_le32(cap->issued);
-	rel->wanted = cpu_to_le32(cap->mds_wanted);
-	rel->dname_len = 0;
-	rel->dname_seq = 0;
-	*p += sizeof(*rel);
-	ret = 1;
-
-out:
-	spin_unlock(&inode->i_lock);
-	return ret;
-}
-
-/* true if @di records a lease session on @mds; call under d_lock */
-static inline int dentry_has_lease_from(struct ceph_dentry_info *di, int mds)
-{
-	return di->lease_session && di->lease_session->s_mds == mds;
-}
-
-/*
- * Encode a release record for @dentry's parent directory and, if the
- * dentry has a lease from @mds, append the dentry name and lease seq.
- *
- * The record itself is produced by ceph_encode_inode_release() on the
- * parent directory inode.  If that wrote a record and the dentry still
- * has a lease from @mds, the record's dname_len and dname_seq are
- * filled in and the name bytes are copied to *p, which is advanced
- * past them.
- *
- * Returns whatever ceph_encode_inode_release() returned.
- */
-int ceph_encode_dentry_release(void **p, struct dentry *dentry,
-			       int mds, int drop, int unless)
-{
-	struct ceph_dentry_info *di = ceph_dentry(dentry);
-	struct ceph_mds_request_release *rel = *p;
-	struct inode *dir = dentry->d_parent->d_inode;
-	int force;
-	int ret;
-
-	/*
-	 * force a record for the directory caps if we have a dentry lease.
-	 * this is racy (can't take i_lock and d_lock together), but it
-	 * doesn't have to be perfect; the mds will revoke anything we don't
-	 * release.
-	 */
-	spin_lock(&dentry->d_lock);
-	force = dentry_has_lease_from(di, mds);
-	spin_unlock(&dentry->d_lock);
-
-	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
-
-	/* the lease is re-checked: d_lock was dropped in between */
-	spin_lock(&dentry->d_lock);
-	if (ret && dentry_has_lease_from(di, mds)) {
-		dout("encode_dentry_release %p mds%d seq %d\n",
-		     dentry, mds, (int)di->lease_seq);
-		rel->dname_len = cpu_to_le32(dentry->d_name.len);
-		memcpy(*p, dentry->d_name.name, dentry->d_name.len);
-		*p += dentry->d_name.len;
-		rel->dname_seq = cpu_to_le32(di->lease_seq);
-	}
-	spin_unlock(&dentry->d_lock);
-
-	return ret;
-}
+++ /dev/null
-Ceph Distributed File System
-============================
-
-Ceph is a distributed network file system designed to provide good
-performance, reliability, and scalability.
-
-Basic features include:
-
- * POSIX semantics
- * Seamless scaling from 1 to many thousands of nodes
- * High availability and reliability. No single points of failure.
- * N-way replication of data across storage nodes
- * Fast recovery from node failures
- * Automatic rebalancing of data on node addition/removal
- * Easy deployment: most FS components are userspace daemons
-
-Also,
- * Flexible snapshots (on any directory)
- * Recursive accounting (nested files, directories, bytes)
-
-In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
-on symmetric access by all clients to shared block devices, Ceph
-separates data and metadata management into independent server
-clusters, similar to Lustre. Unlike Lustre, however, metadata and
-storage nodes run entirely as user space daemons. Storage nodes
-utilize btrfs to store data objects, leveraging its advanced features
-(checksumming, metadata replication, etc.). File data is striped
-across storage nodes in large chunks to distribute workload and
-facilitate high throughputs. When storage nodes fail, data is
-re-replicated in a distributed fashion by the storage nodes themselves
-(with some minimal coordination from a cluster monitor), making the
-system extremely efficient and scalable.
-
-Metadata servers effectively form a large, consistent, distributed
-in-memory cache above the file namespace that is extremely scalable,
-dynamically redistributes metadata in response to workload changes,
-and can tolerate arbitrary (well, non-Byzantine) node failures. The
-metadata server takes a somewhat unconventional approach to metadata
-storage to significantly improve performance for common workloads. In
-particular, inodes with only a single link are embedded in
-directories, allowing entire directories of dentries and inodes to be
-loaded into its cache with a single I/O operation. The contents of
-extremely large directories can be fragmented and managed by
-independent metadata servers, allowing scalable concurrent access.
-
-The system offers automatic data rebalancing/migration when scaling
-from a small cluster of just a few nodes to many hundreds, without
-requiring an administrator to carve the data set into static volumes or
-go through the tedious process of migrating data between servers.
-When the file system approaches full, new nodes can be easily added
-and things will "just work."
-
-Ceph includes a flexible snapshot mechanism that allows a user to create
-a snapshot on any subdirectory (and its nested contents) in the
-system. Snapshot creation and deletion are as simple as 'mkdir
-.snap/foo' and 'rmdir .snap/foo'.
-
-Ceph also provides some recursive accounting on directories for nested
-files and bytes. That is, a 'getfattr -d foo' on any directory in the
-system will reveal the total number of nested regular files and
-subdirectories, and a summation of all nested file sizes. This makes
-the identification of large disk space consumers relatively quick, as
-no 'du' or similar recursive scan of the file system is required.
-
-
-Mount Syntax
-============
-
-The basic mount syntax is:
-
- # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
-
-You only need to specify a single monitor, as the client will get the
-full list when it connects. (However, if the monitor you specify
-happens to be down, the mount won't succeed.) The port can be left
-off if the monitor is using the default. So if the monitor is at
-1.2.3.4,
-
- # mount -t ceph 1.2.3.4:/ /mnt/ceph
-
-is sufficient. If /sbin/mount.ceph is installed, a hostname can be
-used instead of an IP address.
-
-
-
-Mount Options
-=============
-
- ip=A.B.C.D[:N]
- Specify the IP and/or port the client should bind to locally.
- There is normally not much reason to do this. If the IP is not
- specified, the client's IP address is determined by looking at the
- address its connection to the monitor originates from.
-
- wsize=X
- Specify the maximum write size in bytes. By default there is no
- maximum. Ceph will normally size writes based on the file stripe
- size.
-
- rsize=X
- Specify the maximum readahead.
-
- mount_timeout=X
- Specify the timeout value for mount (in seconds), in the case
- of a non-responsive Ceph file system. The default is 30
- seconds.
-
- rbytes
- When stat() is called on a directory, set st_size to 'rbytes',
- the summation of file sizes over all files nested beneath that
- directory. This is the default.
-
- norbytes
- When stat() is called on a directory, set st_size to the
- number of entries in that directory.
-
- nocrc
- Disable CRC32C calculation for data writes. If set, the OSD
- must rely on TCP's error correction to detect data corruption
- in the data payload.
-
- noasyncreaddir
- Disable the client's use of its local cache to satisfy readdir
- requests. (This does not change correctness; the client uses
- cached metadata only when a lease or capability ensures it is
- valid.)
-
-
-More Information
-================
-
-For more information on Ceph, see the home page at
- http://ceph.newdream.net/
-
-The Linux kernel client source tree is available at
- git://ceph.newdream.net/linux-ceph-client.git
-
-and the source for the full system is at
- git://ceph.newdream.net/ceph.git
+++ /dev/null
-#ifndef _FS_CEPH_DEBUG_H
-#define _FS_CEPH_DEBUG_H
-
-/* format prefix for pr_*() messages: the module name followed by ": " */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-/*
- * dout() is the debug print macro.  Exactly one of the three
- * definitions below is selected at preprocessing time.
- */
-#if !defined(CONFIG_CEPH_FS_PRETTYDEBUG)
-
-/*
- * plain variant: just wrap pr_debug
- */
-# define dout(fmt, ...)	pr_debug(" " fmt, ##__VA_ARGS__)
-
-#elif defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
-
-/*
- * pretty variant: wrap pr_debug to include a filename:lineno prefix on
- * each line.  this incurs some overhead (kernel size and execution
- * time) due to the extra function call at each call site.
- */
-extern const char *ceph_file_part(const char *s, int len);
-# define dout(fmt, ...)						\
-	pr_debug(" %12.12s:%-4d : " fmt,				\
-		 ceph_file_part(__FILE__, sizeof(__FILE__)),		\
-		 __LINE__, ##__VA_ARGS__)
-
-#else
-
-/*
- * pretty variant requested, but neither DEBUG nor dynamic debug is
- * available: faux printk call just to see any compiler warnings.
- */
-# define dout(fmt, ...)	do {					\
-		if (0)						\
-			printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
-	} while (0)
-
-#endif
-
-#endif /* _FS_CEPH_DEBUG_H */
+++ /dev/null
-../include/ceph_frag.cc
\ No newline at end of file
+++ /dev/null
-../include/ceph_frag.h
\ No newline at end of file
+++ /dev/null
-../include/ceph_fs.cc
\ No newline at end of file
+++ /dev/null
-../include/ceph_fs.h
\ No newline at end of file
+++ /dev/null
-../include/ceph_strings.cc
\ No newline at end of file
+++ /dev/null
-../ceph_ver.h
\ No newline at end of file
+++ /dev/null
-../../crush/crush.c
\ No newline at end of file
+++ /dev/null
-../../crush/crush.h
\ No newline at end of file
+++ /dev/null
-../../crush/hash.h
\ No newline at end of file
+++ /dev/null
-../../crush/mapper.c
\ No newline at end of file
+++ /dev/null
-../../crush/mapper.h
\ No newline at end of file
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/module.h>
-#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-#include "super.h"
-#include "mds_client.h"
-
-/*
- * debugfs interface, rooted at /sys/kernel/debug/ceph.
- *
- * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
- *      .../osdmap      - current osdmap
- *      .../mdsmap      - current mdsmap
- *      .../monmap      - current monmap
- *      .../osdc        - active osd requests
- *      .../mdsc        - active mds requests
- *      .../monc        - mon client state
- *      .../dentry_lru  - dump contents of dentry lru
- *      .../caps        - expose cap (reservation) stats
- *
- * Each file is backed by one of the *_show() routines below.
- */
-
-/* dentry of the top-level "ceph" directory; set by ceph_debugfs_init() */
-static struct dentry *ceph_debugfs_dir;
-
-/*
- * Show the monitor map: the epoch on the first line, then one
- * tab-indented line per monitor with its entity name and address.
- * Prints nothing if the client has no monmap yet.  Always returns 0.
- */
-static int monmap_show(struct seq_file *s, void *p)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_mon_client *monc = &client->monc;
-	struct ceph_entity_inst *inst;
-	int i;
-
-	if (!monc->monmap)
-		return 0;
-
-	seq_printf(s, "epoch %d\n", monc->monmap->epoch);
-
-	for (i = 0; i < monc->monmap->num_mon; i++) {
-		inst = &monc->monmap->mon_inst[i];
-		seq_printf(s, "\t%s%lld\t%s\n",
-			   ENTITY_NAME(inst->name),
-			   pr_addr(&inst->addr.in_addr));
-	}
-	return 0;
-}
-
-/*
- * Show the mds map: epoch, root, session timeout and autoclose values,
- * then one line per mds slot (0 .. m_max_mds-1) with its address and
- * state name.  Prints nothing if the client has no mdsmap yet.  Always
- * returns 0.
- */
-static int mdsmap_show(struct seq_file *s, void *p)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	int mds;
-
-	if (!mdsc->mdsmap)
-		return 0;
-
-	seq_printf(s, "epoch %d\n", mdsc->mdsmap->m_epoch);
-	seq_printf(s, "root %d\n", mdsc->mdsmap->m_root);
-	seq_printf(s, "session_timeout %d\n",
-		   mdsc->mdsmap->m_session_timeout);
-	seq_printf(s, "session_autoclose %d\n",
-		   mdsc->mdsmap->m_session_autoclose);
-
-	for (mds = 0; mds < mdsc->mdsmap->m_max_mds; mds++) {
-		struct ceph_entity_addr *addr =
-			&mdsc->mdsmap->m_info[mds].addr;
-		int state = mdsc->mdsmap->m_info[mds].state;
-
-		seq_printf(s, "\tmds%d\t%s\t(%s)\n", mds,
-			   pr_addr(&addr->in_addr),
-			   ceph_mds_state_name(state));
-	}
-	return 0;
-}
-
-/*
- * Show the osd map: epoch, the NEARFULL/FULL flags that are set, one
- * line per pg pool with its pg counts and masks, then one line per osd
- * slot with its address, weight and state string.  The weight is
- * printed as (osd_weight * 100) >> 16 followed by '%'.  Prints nothing
- * if the client has no osdmap yet.  Always returns 0.
- */
-static int osdmap_show(struct seq_file *s, void *p)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_osd_client *osdc = &client->osdc;
-	const char *nearfull = "";
-	const char *full = "";
-	int i;
-
-	if (!osdc->osdmap)
-		return 0;
-
-	seq_printf(s, "epoch %d\n", osdc->osdmap->epoch);
-
-	if (osdc->osdmap->flags & CEPH_OSDMAP_NEARFULL)
-		nearfull = " NEARFULL";
-	if (osdc->osdmap->flags & CEPH_OSDMAP_FULL)
-		full = " FULL";
-	seq_printf(s, "flags%s%s\n", nearfull, full);
-
-	/* pools */
-	for (i = 0; i < osdc->osdmap->num_pools; i++) {
-		struct ceph_pg_pool_info *pool = &osdc->osdmap->pg_pool[i];
-
-		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-			   i, pool->v.pg_num, pool->pg_num_mask,
-			   pool->v.lpg_num, pool->lpg_num_mask);
-	}
-
-	/* osds */
-	for (i = 0; i < osdc->osdmap->max_osd; i++) {
-		struct ceph_entity_addr *addr = &osdc->osdmap->osd_addr[i];
-		int state = osdc->osdmap->osd_state[i];
-		char sb[64];	/* scratch space for the state string */
-
-		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
-			   i, pr_addr(&addr->in_addr),
-			   ((osdc->osdmap->osd_weight[i]*100) >> 16),
-			   ceph_osdmap_state_str(sb, sizeof(sb), state));
-	}
-	return 0;
-}
-
-/*
- * Show mon client state: which maps we have (with the stored value),
- * whether the next osdmap is wanted, and one line per pending statfs
- * request.  The whole dump runs under monc->mutex.  Always returns 0.
- */
-static int monc_show(struct seq_file *s, void *p)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_mon_client *monc = &client->monc;
-	struct ceph_mon_statfs_request *req;
-	u64 nexttid;
-
-	mutex_lock(&monc->mutex);
-
-	if (monc->have_mdsmap)
-		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
-	if (monc->have_osdmap)
-		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
-	if (monc->want_next_osdmap)
-		seq_printf(s, "want next osdmap\n");
-
-	/* walk the request tree one entry at a time, in tid order */
-	for (nexttid = 0; nexttid < monc->last_tid; nexttid = req->tid + 1) {
-		int got = radix_tree_gang_lookup(&monc->statfs_request_tree,
-						 (void **)&req, nexttid, 1);
-
-		if (got == 0)
-			break;
-		seq_printf(s, "%lld statfs\n", req->tid);
-	}
-
-	mutex_unlock(&monc->mutex);
-
-	return 0;
-}
-
-/*
- * Print " #<parent ino>/<name> (<path>)" for a dentry.  The name and
- * parent are read under d_lock; the path string is built beforehand
- * and freed afterwards.
- *
- * NOTE(review): this assumes ceph_mdsc_build_path() reports failure by
- * returning NULL (the result is tested with ?: and passed to kfree);
- * confirm it cannot return an ERR_PTR value.
- */
-static void mdsc_show_dentry(struct seq_file *s, struct dentry *dentry)
-{
-	int pathlen;
-	u64 pathbase;
-	char *path;
-
-	path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
-	spin_lock(&dentry->d_lock);
-	seq_printf(s, " #%llx/%.*s (%s)",
-		   ceph_ino(dentry->d_parent->d_inode),
-		   dentry->d_name.len,
-		   dentry->d_name.name,
-		   path ? path : "");
-	spin_unlock(&dentry->d_lock);
-	kfree(path);
-}
-
-/*
- * Show the active mds requests, one per line, under mdsc->mutex:
- * tid, target mds (or "(no request)"), op name, an "(unsafe)" marker,
- * then the primary target (inode, dentry or path1) and the secondary
- * one (old dentry or path2) where present.  Always returns 0.
- */
-static int mdsc_show(struct seq_file *s, void *p)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	struct ceph_mds_request *req;
-	u64 nexttid;
-
-	mutex_lock(&mdsc->mutex);
-
-	/* walk the request tree one entry at a time, in tid order */
-	for (nexttid = 0; nexttid < mdsc->last_tid; nexttid = req->r_tid + 1) {
-		int got = radix_tree_gang_lookup(&mdsc->request_tree,
-						 (void **)&req, nexttid, 1);
-
-		if (got == 0)
-			break;
-
-		if (req->r_request)
-			seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
-		else
-			seq_printf(s, "%lld\t(no request)\t", req->r_tid);
-
-		seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
-
-		if (req->r_got_unsafe)
-			seq_printf(s, "\t(unsafe)");
-		else
-			seq_printf(s, "\t");
-
-		/* primary target: inode, else dentry, else path1 */
-		if (req->r_inode)
-			seq_printf(s, " #%llx", ceph_ino(req->r_inode));
-		else if (req->r_dentry)
-			mdsc_show_dentry(s, req->r_dentry);
-		else if (req->r_path1)
-			seq_printf(s, " #%llx/%s", req->r_ino1.ino,
-				   req->r_path1);
-
-		/* secondary target: old dentry, else path2 */
-		if (req->r_old_dentry) {
-			mdsc_show_dentry(s, req->r_old_dentry);
-		} else if (req->r_path2) {
-			if (req->r_ino2.ino)
-				seq_printf(s, " #%llx/%s", req->r_ino2.ino,
-					   req->r_path2);
-			else
-				seq_printf(s, " %s", req->r_path2);
-		}
-
-		seq_printf(s, "\n");
-	}
-
-	mutex_unlock(&mdsc->mutex);
-
-	return 0;
-}
-
-/*
- * Show the active osd requests, one per line, under request_mutex:
- * tid, target osd (-1 if none), object name, reassert version (when
- * its epoch is nonzero) and the name of every op in the request.
- *
- * The ops are read from the request front starting right after the
- * request head; the object name is printed as object_len bytes
- * starting right after the num_ops ops.  Always returns 0.
- */
-static int osdc_show(struct seq_file *s, void *pp)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_osd_client *osdc = &client->osdc;
-	struct rb_node *node;
-
-	mutex_lock(&osdc->request_mutex);
-
-	node = rb_first(&osdc->requests);
-	while (node) {
-		struct ceph_osd_request *req =
-			rb_entry(node, struct ceph_osd_request, r_node);
-		struct ceph_osd_request_head *head;
-		struct ceph_osd_op *op;
-		int num_ops, olen, opcode;
-		int i;
-
-		seq_printf(s, "%lld\tosd%d\t", req->r_tid,
-			   req->r_osd ? req->r_osd->o_osd : -1);
-
-		head = req->r_request->front.iov_base;
-		num_ops = le16_to_cpu(head->num_ops);
-		olen = le32_to_cpu(head->object_len);
-		seq_printf(s, "%.*s", olen,
-			   (const char *)(head->ops + num_ops));
-
-		if (req->r_reassert_version.epoch)
-			seq_printf(s, "\t%u'%llu",
-			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
-			   le64_to_cpu(req->r_reassert_version.version));
-		else
-			seq_printf(s, "\t");
-
-		op = (void *)(head + 1);
-		for (i = 0; i < num_ops; i++, op++) {
-			opcode = le16_to_cpu(op->op);
-			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
-		}
-
-		seq_printf(s, "\n");
-
-		node = rb_next(node);
-	}
-
-	mutex_unlock(&osdc->request_mutex);
-	return 0;
-}
-
-/*
- * debugfs "caps" file: print the cap reservation counters.
- *
- * The client is taken from s->private, like every other show function
- * in this file.  The second argument is the seq_file iterator value,
- * not the client, so it must not be used as one.
- */
-static int caps_show(struct seq_file *s, void *p)
-{
-	struct ceph_client *client = s->private;
-	int total, avail, used, reserved;
-
-	ceph_reservation_status(client, &total, &avail, &used, &reserved);
-	seq_printf(s, "total\t\t%d\n"
-		   "avail\t\t%d\n"
-		   "used\t\t%d\n"
-		   "reserved\t%d\n",
-		   total, avail, used, reserved);
-	return 0;
-}
-
-/*
- * debugfs "dentry_lru" file: under dentry_lru_lock, print one line per
- * entry on mdsc->dentry_lru (dentry info pointer, dentry pointer, name).
- */
-static int dentry_lru_show(struct seq_file *s, void *ptr)
-{
-	struct ceph_client *client = s->private;
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	struct list_head *pos;
-
-	spin_lock(&mdsc->dentry_lru_lock);
-	list_for_each(pos, &mdsc->dentry_lru) {
-		struct ceph_dentry_info *di =
-			list_entry(pos, struct ceph_dentry_info, lru);
-		struct dentry *dn = di->dentry;
-
-		seq_printf(s, "%p %p\t%.*s\n",
-			   di, dn, dn->d_name.len, dn->d_name.name);
-	}
-	spin_unlock(&mdsc->dentry_lru_lock);
-
-	return 0;
-}
-
-/*
- * Generate name##_open() and name##_fops for a single-record seq_file
- * whose show callback is 'name'.
- *
- * inode->i_private is handed straight to single_open(), which stores it
- * in the seq_file's ->private on success.  The seq_file must not be
- * touched after a failed single_open(), because file->private_data does
- * not point at one in that case.
- */
-#define DEFINE_SHOW_FUNC(name)						\
-static int name##_open(struct inode *inode, struct file *file)		\
-{									\
-	return single_open(file, name, inode->i_private);		\
-}									\
-									\
-static const struct file_operations name##_fops = {			\
-	.open		= name##_open,					\
-	.read		= seq_read,					\
-	.llseek		= seq_lseek,					\
-	.release	= single_release,				\
-};
-
-/* one <name>_open() / <name>_fops pair per show function */
-DEFINE_SHOW_FUNC(monmap_show)
-DEFINE_SHOW_FUNC(mdsmap_show)
-DEFINE_SHOW_FUNC(osdmap_show)
-DEFINE_SHOW_FUNC(monc_show)
-DEFINE_SHOW_FUNC(mdsc_show)
-DEFINE_SHOW_FUNC(osdc_show)
-DEFINE_SHOW_FUNC(dentry_lru_show)
-DEFINE_SHOW_FUNC(caps_show)
-
-/*
- * Create the top-level "ceph" debugfs directory.
- * Returns 0 on success, -ENOMEM if the directory pointer came back NULL.
- */
-int __init ceph_debugfs_init(void)
-{
-	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
-	return ceph_debugfs_dir ? 0 : -ENOMEM;
-}
-
-/* Remove the top-level debugfs directory created by ceph_debugfs_init(). */
-void ceph_debugfs_cleanup(void)
-{
- debugfs_remove(ceph_debugfs_dir);
-}
-
-/*
- * Create the per-client debugfs directory ("<fsid>.client<id>") and the
- * files inside it.
- *
- * Returns 0 on success.  If any entry cannot be created, everything
- * created so far is torn down and -ENOMEM is returned.
- *
- * NOTE(review): the error path calls ceph_debugfs_client_cleanup() on
- * entries that were never created; this assumes those pointers start
- * out NULL and that debugfs_remove() accepts NULL -- confirm.
- */
-int ceph_debugfs_client_init(struct ceph_client *client)
-{
-	int ret = -ENOMEM;	/* every failure below is a failed create */
-	char name[80];
-
-	snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
-		 PR_FSID(&client->monc.monmap->fsid), client->whoami);
-
-	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
-	if (!client->debugfs_dir)
-		goto out;
-
-	client->monc.debugfs_file = debugfs_create_file("monc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &monc_show_fops);
-	if (!client->monc.debugfs_file)
-		goto out;
-
-	client->mdsc.debugfs_file = debugfs_create_file("mdsc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &mdsc_show_fops);
-	if (!client->mdsc.debugfs_file)
-		goto out;
-
-	client->osdc.debugfs_file = debugfs_create_file("osdc",
-						      0600,
-						      client->debugfs_dir,
-						      client,
-						      &osdc_show_fops);
-	if (!client->osdc.debugfs_file)
-		goto out;
-
-	client->debugfs_monmap = debugfs_create_file("monmap",
-					0600,
-					client->debugfs_dir,
-					client,
-					&monmap_show_fops);
-	if (!client->debugfs_monmap)
-		goto out;
-
-	client->debugfs_mdsmap = debugfs_create_file("mdsmap",
-					0600,
-					client->debugfs_dir,
-					client,
-					&mdsmap_show_fops);
-	if (!client->debugfs_mdsmap)
-		goto out;
-
-	client->debugfs_osdmap = debugfs_create_file("osdmap",
-					0600,
-					client->debugfs_dir,
-					client,
-					&osdmap_show_fops);
-	if (!client->debugfs_osdmap)
-		goto out;
-
-	client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
-					0600,
-					client->debugfs_dir,
-					client,
-					&dentry_lru_show_fops);
-	if (!client->debugfs_dentry_lru)
-		goto out;
-
-	client->debugfs_caps = debugfs_create_file("caps",
-						   0400,
-						   client->debugfs_dir,
-						   client,
-						   &caps_show_fops);
-	if (!client->debugfs_caps)
-		goto out;
-
-	return 0;
-
-out:
-	ceph_debugfs_client_cleanup(client);
-	return ret;
-}
-
-/*
- * Remove the per-client debugfs entries: the files first, then the
- * directory that contained them.
- */
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
-{
- debugfs_remove(client->debugfs_caps);
- debugfs_remove(client->debugfs_dentry_lru);
- debugfs_remove(client->debugfs_osdmap);
- debugfs_remove(client->debugfs_mdsmap);
- debugfs_remove(client->debugfs_monmap);
- debugfs_remove(client->osdc.debugfs_file);
- debugfs_remove(client->mdsc.debugfs_file);
- debugfs_remove(client->monc.debugfs_file);
- /* the containing directory goes last */
- debugfs_remove(client->debugfs_dir);
-}
-
+++ /dev/null
-#ifndef __CEPH_DECODE_H
-#define __CEPH_DECODE_H
-
-#include <asm/unaligned.h>
-
-/*
- * in all cases,
- * void **p pointer to position pointer
- * void *end pointer to end of buffer (last byte + 1)
- */
-
-/*
- * bounds check input: jump to the label 'bad' if advancing *p by n
- * bytes would move it past end.
- */
-#define ceph_decode_need(p, end, n, bad) \
- do { \
- if (unlikely(*(p) + (n) > (end))) \
- goto bad; \
- } while (0)
-
-/*
- * Fixed-width little-endian decoders: load an unaligned value from *p
- * into v and advance *p past it.  No bounds checking is done here.
- */
-#define ceph_decode_64(p, v)					\
-	do {							\
-		v = get_unaligned_le64(*(p));			\
-		*(p) += sizeof(u64);				\
-	} while (0)
-#define ceph_decode_32(p, v)					\
-	do {							\
-		v = get_unaligned_le32(*(p));			\
-		*(p) += sizeof(u32);				\
-	} while (0)
-#define ceph_decode_16(p, v)					\
-	do {							\
-		v = get_unaligned_le16(*(p));			\
-		*(p) += sizeof(u16);				\
-	} while (0)
-/* the macro argument is parenthesized, as in ceph_encode_8() */
-#define ceph_decode_8(p, v)					\
-	do {							\
-		v = *(u8 *)*(p);				\
-		(*(p))++;					\
-	} while (0)
-
-/* copy n raw bytes from *p into pv, then advance *p by n */
-#define ceph_decode_copy(p, pv, n)				\
-	do {							\
-		memcpy((pv), *(p), (n));			\
-		*(p) += (n);					\
-	} while (0)
-
-/*
- * bounds check too: each *_safe variant first runs ceph_decode_need()
- * for the size about to be consumed (jumping to 'bad' on failure) and
- * then invokes the matching unchecked decoder.
- */
-#define ceph_decode_64_safe(p, end, v, bad) \
- do { \
- ceph_decode_need(p, end, sizeof(u64), bad); \
- ceph_decode_64(p, v); \
- } while (0)
-#define ceph_decode_32_safe(p, end, v, bad) \
- do { \
- ceph_decode_need(p, end, sizeof(u32), bad); \
- ceph_decode_32(p, v); \
- } while (0)
-#define ceph_decode_16_safe(p, end, v, bad) \
- do { \
- ceph_decode_need(p, end, sizeof(u16), bad); \
- ceph_decode_16(p, v); \
- } while (0)
-
-#define ceph_decode_copy_safe(p, end, pv, n, bad) \
- do { \
- ceph_decode_need(p, end, n, bad); \
- ceph_decode_copy(p, pv, n); \
- } while (0)
-
-/*
- * struct ceph_timespec <-> struct timespec
- *
- * In both macros the first argument is the destination and the second
- * the source.  Both fields of the ceph_timespec side are converted as
- * 32-bit little-endian values.
- */
-#define ceph_decode_timespec(ts, tv) \
- do { \
- (ts)->tv_sec = le32_to_cpu((tv)->tv_sec); \
- (ts)->tv_nsec = le32_to_cpu((tv)->tv_nsec); \
- } while (0)
-#define ceph_encode_timespec(tv, ts) \
- do { \
- (tv)->tv_sec = cpu_to_le32((ts)->tv_sec); \
- (tv)->tv_nsec = cpu_to_le32((ts)->tv_nsec); \
- } while (0)
-
-
-/*
- * encoders
- *
- * Store v at *p as an unaligned little-endian value of the given width
- * and advance *p past it.  No bounds checking is done here.
- */
-#define ceph_encode_64(p, v)					\
-	do {							\
-		put_unaligned_le64(v, (__le64 *)*(p));		\
-		*(p) += sizeof(u64);				\
-	} while (0)
-#define ceph_encode_32(p, v)					\
-	do {							\
-		put_unaligned_le32(v, (__le32 *)*(p));		\
-		*(p) += sizeof(u32);				\
-	} while (0)
-#define ceph_encode_16(p, v)					\
-	do {							\
-		put_unaligned_le16(v, (__le16 *)*(p));		\
-		*(p) += sizeof(u16);				\
-	} while (0)
-#define ceph_encode_8(p, v)					\
-	do {							\
-		*(u8 *)*(p) = v;				\
-		(*(p))++;					\
-	} while (0)
-
-/*
- * filepath, string encoders
- */
-/*
- * Encode a filepath at *p as a 64-bit ino, a 32-bit length and then the
- * path bytes (no terminator), advancing *p.  A NULL path is encoded
- * with length 0.  BUGs if the encoding would run past end.
- */
-static inline void ceph_encode_filepath(void **p, void *end,
-					u64 ino, const char *path)
-{
-	u32 len = 0;
-
-	if (path)
-		len = strlen(path);
-
-	BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
-	ceph_encode_64(p, ino);
-	ceph_encode_32(p, len);
-	if (len != 0)
-		memcpy(*p, path, len);
-	*p += len;
-}
-
-/*
- * Encode a string at *p as a 32-bit length followed by len bytes of s,
- * advancing *p.  BUGs if the encoding would run past end.
- */
-static inline void ceph_encode_string(void **p, void *end,
-				      const char *s, u32 len)
-{
-	BUG_ON(*p + sizeof(len) + len > end);
-	ceph_encode_32(p, len);
-	if (len != 0)
-		memcpy(*p, s, len);
-	*p += len;
-}
-
-
-#endif
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/spinlock.h>
-#include <linux/fs_struct.h>
-#include <linux/namei.h>
-#include <linux/sched.h>
-
-#include "super.h"
-
-/*
- * Directory operations: readdir, lookup, create, link, unlink,
- * rename, etc.
- */
-
-/*
- * Ceph MDS operations are specified in terms of a base ino and
- * relative path. Thus, the client can specify an operation on a
- * specific inode (e.g., a getattr due to fstat(2)), or as a path
- * relative to, say, the root directory.
- *
- * Normally, we limit ourselves to strict inode ops (no path component)
- * or dentry operations (a single path component relative to an ino). The
- * exception to this is open_root_dentry(), which will open the mount
- * point by name.
- */
-
-/*
- * Operation tables for ceph directories and dentries, declared here so
- * the code below can reference them (e.g. ceph_init_dentry() installs
- * &ceph_dentry_ops).
- * NOTE(review): presumably initialized further down in this file -- confirm.
- */
-const struct inode_operations ceph_dir_iops;
-const struct file_operations ceph_dir_fops;
-struct dentry_operations ceph_dentry_ops;
-
-/*
- * Initialize ceph dentry state.
- *
- * Picks the dentry_operations matching the parent's snap context,
- * allocates a ceph_dentry_info and attaches it as d_fsdata, and adds
- * the dentry to the dentry LRU.  Does nothing if d_fsdata is already
- * set.
- *
- * Returns 0 on success (including when another thread attached the
- * state first) or -ENOMEM if the allocation fails.
- */
-int ceph_init_dentry(struct dentry *dentry)
-{
-	struct ceph_dentry_info *di;
-
-	if (dentry->d_fsdata)
-		return 0;
-
-	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
-		dentry->d_op = &ceph_dentry_ops;
-	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
-		dentry->d_op = &ceph_snapdir_dentry_ops;
-	else
-		dentry->d_op = &ceph_snap_dentry_ops;
-
-	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
-	if (!di)
-		return -ENOMEM;          /* oh well */
-
-	spin_lock(&dentry->d_lock);
-	if (dentry->d_fsdata) {
-		/* lost a race: release our unused allocation */
-		kmem_cache_free(ceph_dentry_cachep, di);
-		goto out_unlock;
-	}
-	di->dentry = dentry;
-	di->lease_session = NULL;
-	dentry->d_fsdata = di;
-	dentry->d_time = jiffies;
-	ceph_dentry_lru_add(dentry);
-out_unlock:
-	spin_unlock(&dentry->d_lock);
-	return 0;
-}
-
-
-
-/*
- * for readdir, we encode the directory frag and offset within that
- * frag into f_pos.
- */
-/* frag: the bits of the file position above the low 32 */
-static unsigned fpos_frag(loff_t p)
-{
-	return (unsigned)(p >> 32);
-}
-/* offset within the frag: the low 32 bits of the file position */
-static unsigned fpos_off(loff_t p)
-{
-	return (unsigned)(p & 0xffffffff);
-}
-
-/*
- * When possible, we try to satisfy a readdir by peeking at the
- * dcache. We make this work by carefully ordering dentries on
- * d_u.d_child when we initially get results back from the MDS, and
- * falling back to a "normal" sync readdir if any dentries in the dir
- * are dropped.
- *
- * I_COMPLETE indicates we have all dentries in the dir. It is
- * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
- * the MDS if/when the directory is modified).
- *
- * Locking: inode->i_lock is expected to be held on entry (ceph_readdir
- * takes it before calling). It is dropped around filldir() and dput()
- * and is held again when this function returns. dcache_lock is taken
- * and released internally.
- *
- * Returns 0 when the end of the child list is reached (fi->at_end is
- * set), a negative filldir() result if filldir() stopped us, or
- * -EAGAIN if I_COMPLETE was lost and the caller should fall back to
- * asking the MDS.
- */
-static int __dcache_readdir(struct file *filp,
- void *dirent, filldir_t filldir)
-{
- struct inode *inode = filp->f_dentry->d_inode;
- struct ceph_file_info *fi = filp->private_data;
- struct dentry *parent = filp->f_dentry;
- struct inode *dir = parent->d_inode;
- struct list_head *p;
- struct dentry *dentry, *last;
- struct ceph_dentry_info *di;
- int err = 0;
-
- /* claim ref on last dentry we returned */
- last = fi->dentry;
- fi->dentry = NULL;
-
- dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
- last);
-
- spin_lock(&dcache_lock);
-
- /* start at beginning? */
- if (filp->f_pos == 2 || (last &&
- filp->f_pos < ceph_dentry(last)->offset)) {
- if (list_empty(&parent->d_subdirs))
- goto out_unlock;
- /* the child list is walked backwards, starting from its tail */
- p = parent->d_subdirs.prev;
- dout(" initial p %p/%p\n", p->prev, p->next);
- } else {
- /* resume just after the last dentry we returned */
- p = last->d_u.d_child.prev;
- }
-
-more:
- dentry = list_entry(p, struct dentry, d_u.d_child);
- di = ceph_dentry(dentry);
- /* skip unhashed or negative dentries and anything before f_pos */
- while (1) {
- dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
- parent->d_subdirs.prev, parent->d_subdirs.next);
- if (p == &parent->d_subdirs) {
- fi->at_end = 1;
- goto out_unlock;
- }
- if (!d_unhashed(dentry) && dentry->d_inode &&
- filp->f_pos <= di->offset)
- break;
- dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
- dentry->d_name.len, dentry->d_name.name, di->offset,
- filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
- !dentry->d_inode ? " null" : "");
- p = p->prev;
- dentry = list_entry(p, struct dentry, d_u.d_child);
- di = ceph_dentry(dentry);
- }
-
- /* pin the dentry, then drop both locks before calling filldir */
- atomic_inc(&dentry->d_count);
- spin_unlock(&dcache_lock);
- spin_unlock(&inode->i_lock);
-
- dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
- dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- filp->f_pos = di->offset;
- err = filldir(dirent, dentry->d_name.name,
- dentry->d_name.len, di->offset,
- dentry->d_inode->i_ino,
- dentry->d_inode->i_mode >> 12);
-
- /*
- * NOTE(review): when filldir() fails, the reference taken on
- * 'dentry' above does not appear to be dropped anywhere in this
- * function -- confirm whether that is intended.
- */
- if (last) {
- if (err < 0) {
- /* remember our position */
- fi->dentry = last;
- fi->next_offset = di->offset;
- } else {
- dput(last);
- }
- last = NULL;
- }
-
- spin_lock(&inode->i_lock);
- spin_lock(&dcache_lock);
-
- if (err < 0)
- goto out_unlock;
-
- last = dentry;
-
- p = p->prev;
- filp->f_pos++;
-
- /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
- if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
- goto more;
- dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
- err = -EAGAIN;
-
-out_unlock:
- spin_unlock(&dcache_lock);
-
- /* i_lock is dropped around the final dput() and retaken for the caller */
- if (last) {
- spin_unlock(&inode->i_lock);
- dput(last);
- spin_lock(&inode->i_lock);
- }
-
- return err;
-}
-
-/*
- * Remember the name of the last dentry we read so that a later readdir
- * can continue from the same lexicographical point, whatever changes
- * the directory sees on the server in the meantime.
- *
- * Any previously stored name is freed.  Returns 0 on success or -ENOMEM
- * (leaving fi->last_name NULL) if the copy cannot be allocated.
- */
-static int note_last_dentry(struct ceph_file_info *fi, const char *name,
-			    int len)
-{
-	char *copy;
-
-	kfree(fi->last_name);
-	copy = kmalloc(len+1, GFP_NOFS);
-	fi->last_name = copy;
-	if (copy == NULL)
-		return -ENOMEM;
-
-	memcpy(copy, name, len);
-	copy[len] = 0;
-	dout("note_last_dentry '%s'\n", fi->last_name);
-	return 0;
-}
-
-/*
- * readdir for ceph directories.
- *
- * Emits "." and ".." at f_pos 0 and 1. From f_pos 2 on it first tries
- * to answer from the dcache via __dcache_readdir() (only if the dir is
- * flagged I_COMPLETE, FILE_SHARED is issued and the NOASYNCREADDIR
- * option is off). Otherwise it fetches the directory from the MDS one
- * frag at a time, in chunks of at most max_readdir entries, and feeds
- * the buffered reply to filldir.
- *
- * f_pos encodes (frag, offset-within-frag); see fpos_frag()/fpos_off().
- *
- * Returns 0 when done or when filldir stops us, or a negative error.
- */
-static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
- struct ceph_file_info *fi = filp->private_data;
- struct inode *inode = filp->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_client *client = ceph_inode_to_client(inode);
- struct ceph_mds_client *mdsc = &client->mdsc;
- unsigned frag = fpos_frag(filp->f_pos);
- int off = fpos_off(filp->f_pos);
- int err;
- u32 ftype;
- struct ceph_mds_reply_info_parsed *rinfo;
- const int max_entries = client->mount_args.max_readdir;
-
- dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
- if (fi->at_end)
- return 0;
-
- /* always start with . and .. */
- if (filp->f_pos == 0) {
- /* note dir version at start of readdir so we can tell
- * if any dentries get dropped */
- fi->dir_release_count = ci->i_release_count;
-
- dout("readdir off 0 -> '.'\n");
- if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
- inode->i_ino, inode->i_mode >> 12) < 0)
- return 0;
- filp->f_pos = 1;
- off = 1;
- }
- if (filp->f_pos == 1) {
- dout("readdir off 1 -> '..'\n");
- if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
- filp->f_dentry->d_parent->d_inode->i_ino,
- inode->i_mode >> 12) < 0)
- return 0;
- filp->f_pos = 2;
- off = 2;
- }
-
- /* can we use the dcache? */
- spin_lock(&inode->i_lock);
- if ((filp->f_pos == 2 || fi->dentry) &&
- !ceph_test_opt(client, NOASYNCREADDIR) &&
- (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
- __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
- err = __dcache_readdir(filp, dirent, filldir);
- /* anything but -EAGAIN is final; -EAGAIN falls through to the MDS */
- if (err != -EAGAIN) {
- spin_unlock(&inode->i_lock);
- return err;
- }
- }
- spin_unlock(&inode->i_lock);
- /* turn a saved dcache position into a last_name for the MDS request */
- if (fi->dentry) {
- err = note_last_dentry(fi, fi->dentry->d_name.name,
- fi->dentry->d_name.len);
- if (err)
- return err;
- dput(fi->dentry);
- fi->dentry = NULL;
- }
-
- /* proceed with a normal readdir */
-
-more:
- /* do we have the correct frag content buffered? */
- if (fi->frag != frag || fi->last_readdir == NULL) {
- struct ceph_mds_request *req;
- int op = ceph_snap(inode) == CEPH_SNAPDIR ?
- CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
-
- /* discard old result, if any */
- if (fi->last_readdir)
- ceph_mdsc_put_request(fi->last_readdir);
-
- /* requery frag tree, as the frag topology may have changed */
- frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
-
- dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
- ceph_vinop(inode), frag, fi->last_name);
- req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
- req->r_inode = igrab(inode);
- req->r_dentry = dget(filp->f_dentry);
- /* hints to request -> mds selection code */
- req->r_direct_mode = USE_AUTH_MDS;
- req->r_direct_hash = ceph_frag_value(frag);
- req->r_direct_is_hash = true;
- /*
- * NOTE(review): the kstrdup() result is not checked here;
- * fi->last_name may also be NULL at this point -- confirm how
- * an allocation failure should be told apart from "no name".
- */
- req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
- req->r_readdir_offset = fi->next_offset;
- req->r_args.readdir.frag = cpu_to_le32(frag);
- req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
- req->r_num_caps = max_entries;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- if (err < 0) {
- ceph_mdsc_put_request(req);
- return err;
- }
- dout("readdir got and parsed readdir result=%d"
- " on frag %x, end=%d, complete=%d\n", err, frag,
- (int)req->r_reply_info.dir_end,
- (int)req->r_reply_info.dir_complete);
-
- if (!req->r_did_prepopulate) {
- dout("readdir !did_prepopulate");
- fi->dir_release_count--; /* preclude I_COMPLETE */
- }
-
- /* note next offset and last dentry name */
- fi->offset = fi->next_offset;
- fi->last_readdir = req;
-
- if (req->r_reply_info.dir_end) {
- /* this frag is exhausted: no continuation name */
- kfree(fi->last_name);
- fi->last_name = NULL;
- fi->next_offset = 0;
- } else {
- rinfo = &req->r_reply_info;
- err = note_last_dentry(fi,
- rinfo->dir_dname[rinfo->dir_nr-1],
- rinfo->dir_dname_len[rinfo->dir_nr-1]);
- if (err)
- return err;
- fi->next_offset += rinfo->dir_nr;
- }
- }
-
- /* hand the buffered entries from position 'off' onward to filldir */
- rinfo = &fi->last_readdir->r_reply_info;
- dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
- rinfo->dir_nr, off, fi->offset);
- while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
- u64 pos = ceph_make_fpos(frag, off);
- struct ceph_mds_reply_inode *in =
- rinfo->dir_in[off - fi->offset].in;
- dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
- off, off - fi->offset, rinfo->dir_nr, pos,
- rinfo->dir_dname_len[off - fi->offset],
- rinfo->dir_dname[off - fi->offset], in);
- BUG_ON(!in);
- ftype = le32_to_cpu(in->mode) >> 12;
- if (filldir(dirent,
- rinfo->dir_dname[off - fi->offset],
- rinfo->dir_dname_len[off - fi->offset],
- pos,
- le64_to_cpu(in->ino),
- ftype) < 0) {
- dout("filldir stopping us...\n");
- return 0;
- }
- off++;
- filp->f_pos = pos + 1;
- }
-
- /* a saved last_name means this frag has more chunks: fetch the next */
- if (fi->last_name) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
- goto more;
- }
-
- /* more frags? */
- if (!ceph_frag_is_rightmost(frag)) {
- frag = ceph_frag_next(frag);
- off = 0;
- filp->f_pos = ceph_make_fpos(frag, off);
- dout("readdir next frag is %x\n", frag);
- goto more;
- }
- fi->at_end = 1;
-
- /*
- * if dir_release_count still matches the dir, no dentries
- * were released during the whole readdir, and we should have
- * the complete dir contents in our cache.
- */
- spin_lock(&inode->i_lock);
- if (ci->i_release_count == fi->dir_release_count) {
- dout(" marking %p complete\n", inode);
- ci->i_ceph_flags |= CEPH_I_COMPLETE;
- ci->i_max_offset = filp->f_pos;
- }
- spin_unlock(&inode->i_lock);
-
- dout("readdir %p filp %p done.\n", inode, filp);
- return 0;
-}
-
-/*
- * Throw away all buffered readdir state for an open directory: the
- * cached MDS reply, the continuation name, the saved dcache position
- * and the at_end flag.  next_offset restarts at 2.
- */
-static void reset_readdir(struct ceph_file_info *fi)
-{
-	if (fi->last_readdir) {
-		ceph_mdsc_put_request(fi->last_readdir);
-		fi->last_readdir = NULL;
-	}
-	kfree(fi->last_name);
-	/* later code kfree()s and reads last_name again: don't leave it dangling */
-	fi->last_name = NULL;
-	fi->next_offset = 2;  /* compensate for . and .. */
-	if (fi->dentry) {
-		dput(fi->dentry);
-		fi->dentry = NULL;
-	}
-	fi->at_end = 0;
-}
-
-/*
- * llseek for ceph directories.
- *
- * Computes the new position for SEEK_END / SEEK_CUR (any other origin
- * uses 'offset' as given), stores it in f_pos if it is within
- * [0, s_maxbytes], and drops buffered readdir state when that state
- * can no longer be used to continue from the new position.
- *
- * Returns the new position, or -EINVAL if it is out of range.
- */
-static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
-{
-	struct ceph_file_info *fi = file->private_data;
-	struct inode *inode = file->f_mapping->host;
-	loff_t old_offset;
-	loff_t retval;
-
-	mutex_lock(&inode->i_mutex);
-	/*
-	 * Remember where we were before the seek.  The comparisons below
-	 * are against the previous position, not against the 'offset'
-	 * argument (which for SEEK_SET is the new position itself).
-	 */
-	old_offset = file->f_pos;
-	switch (origin) {
-	case SEEK_END:
-		offset += inode->i_size + 2;   /* FIXME */
-		break;
-	case SEEK_CUR:
-		offset += file->f_pos;
-		break;
-	}
-	retval = -EINVAL;
-	if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
-		if (offset != file->f_pos) {
-			file->f_pos = offset;
-			file->f_version = 0;
-			fi->at_end = 0;
-		}
-		retval = offset;
-
-		/*
-		 * discard buffered readdir content on seekdir(0), or
-		 * seek to new frag, or seek prior to current chunk.
-		 */
-		if (offset == 0 ||
-		    fpos_frag(offset) != fpos_frag(old_offset) ||
-		    fpos_off(offset) < fi->offset) {
-			dout("dir_llseek dropping %p content\n", file);
-			reset_readdir(fi);
-		}
-
-		/* bump dir_release_count if we did a forward seek */
-		if (offset > old_offset)
-			fi->dir_release_count--;
-	}
-	mutex_unlock(&inode->i_mutex);
-	return retval;
-}
-
-/*
- * Post-process the result of a lookup/open request.
- *
- * If the request ended up on a different dentry than the one the VFS
- * handed us (req->r_dentry != dentry), return a new reference to that
- * dentry instead.  Returns NULL when the original dentry was used and
- * an ERR_PTR on failure.
- *
- * -ENOENT is handled specially:
- *  - a lookup of the snapdir name outside the root dir is linked to the
- *    parent's snapdir inode;
- *  - an -ENOENT reply without a trace (the MDS may choose not to send
- *    one, e.g. if it does not care to issue a lease on the negative
- *    dentry) makes us hash a negative dentry ourselves, or drop the
- *    dentry and report -ENOENT if it already had an inode.
- */
-struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
-				  struct dentry *dentry, int err)
-{
-	struct ceph_client *client = ceph_client(dentry->d_sb);
-	struct inode *parent = dentry->d_parent->d_inode;
-
-	/* .snap dir? */
-	if (err == -ENOENT &&
-	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
-	    strcmp(dentry->d_name.name, client->mount_args.snapdir_name) == 0) {
-		struct inode *snapdir = ceph_get_snapdir(parent);
-
-		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
-		     dentry, dentry->d_name.len, dentry->d_name.name, snapdir);
-		d_add(dentry, snapdir);
-		err = 0;
-	}
-
-	if (err == -ENOENT) {
-		/* no trace? */
-		err = 0;
-		if (!req->r_reply_info.head->is_dentry) {
-			dout("ENOENT and no trace, dentry %p inode %p\n",
-			     dentry, dentry->d_inode);
-			if (!dentry->d_inode) {
-				d_add(dentry, NULL);
-			} else {
-				d_drop(dentry);
-				err = -ENOENT;
-			}
-		}
-	}
-
-	if (err)
-		return ERR_PTR(err);
-	if (dentry != req->r_dentry)
-		return dget(req->r_dentry);   /* we got spliced */
-	return NULL;
-}
-
-/*
- * Look up a single dir entry. If there is a lookup intent, inform
- * the MDS so that it gets our 'caps wanted' value in a single op.
- *
- * Returns NULL if the supplied dentry was used, a different dentry if
- * the result was spliced (see ceph_finish_lookup), or an ERR_PTR.
- * A negative result can be concluded locally, without asking the MDS,
- * when the directory is flagged I_COMPLETE and FILE_SHARED is issued.
- */
-static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
-{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
- struct ceph_mds_request *req;
- int op;
- int err;
-
- dout("lookup %p dentry %p '%.*s'\n",
- dir, dentry, dentry->d_name.len, dentry->d_name.name);
-
- if (dentry->d_name.len > NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
-
- err = ceph_init_dentry(dentry);
- if (err < 0)
- return ERR_PTR(err);
-
- /* open (but not create!) intent? */
- if (nd &&
- (nd->flags & LOOKUP_OPEN) &&
- (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
- !(nd->intent.open.flags & O_CREAT)) {
- int mode = nd->intent.open.create_mode & ~current->fs->umask;
- return ceph_lookup_open(dir, dentry, nd, mode, 1);
- }
-
- /* can we conclude ENOENT locally? */
- if (dentry->d_inode == NULL) {
- struct ceph_inode_info *ci = ceph_inode(dir);
- struct ceph_dentry_info *di = ceph_dentry(dentry);
-
- spin_lock(&dir->i_lock);
- dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
- /*
- * The strncmp() is bounded by the dentry name length, so a
- * name that is merely a prefix of the snapdir name also
- * compares equal and is sent to the MDS instead.
- */
- if (strncmp(dentry->d_name.name,
- client->mount_args.snapdir_name,
- dentry->d_name.len) &&
- (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
- (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
- di->offset = ci->i_max_offset++;
- spin_unlock(&dir->i_lock);
- dout(" dir %p complete, -ENOENT\n", dir);
- d_add(dentry, NULL);
- di->lease_shared_gen = ci->i_shared_gen;
- return NULL;
- }
- spin_unlock(&dir->i_lock);
- }
-
- /* ask the MDS; under a snapdir this is a snapshot lookup */
- op = ceph_snap(dir) == CEPH_SNAPDIR ?
- CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
- req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
- if (IS_ERR(req))
- return ERR_PTR(PTR_ERR(req));
- req->r_dentry = dget(dentry);
- req->r_num_caps = 2;
- /* we only need inode linkage */
- req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
- req->r_locked_dir = dir;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- dentry = ceph_finish_lookup(req, dentry, err);
- ceph_mdsc_put_request(req); /* will dput(dentry) */
- dout("lookup result=%p\n", dentry);
- return dentry;
-}
-
-/*
- * A create succeeded but the MDS sent no trace back: follow up with a
- * lookup, because the VFS expects the provided dentry to be linked up.
- *
- * Returns 0 if the lookup used our dentry or was spliced onto another
- * one, otherwise the error carried by the lookup result.
- */
-int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
-{
-	struct dentry *result = ceph_lookup(dir, dentry, NULL);
-
-	if (!result || IS_ERR(result))
-		return PTR_ERR(result);
-
-	/*
-	 * We created the item, then did a lookup, and found
-	 * it was already linked to another inode we already
-	 * had in our cache (and thus got spliced).  Link our
-	 * dentry to that inode, but don't hash it, just in
-	 * case the VFS wants to dereference it.
-	 */
-	BUG_ON(!result->d_inode);
-	d_instantiate(dentry, result->d_inode);
-	return 0;
-}
-
-/*
- * mknod: send a MKNOD request for 'dentry' in 'dir' to the MDS.
- * Refused with -EROFS inside snapshots.  The dentry is dropped on any
- * failure.  If the reply carries no trace, a follow-up lookup links the
- * dentry.
- */
-static int ceph_mknod(struct inode *dir, struct dentry *dentry,
-		      int mode, dev_t rdev)
-{
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	struct ceph_mds_request *req;
-	int ret;
-
-	if (ceph_snap(dir) != CEPH_NOSNAP)
-		return -EROFS;
-
-	dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
-	     dir, dentry, mode, rdev);
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
-	if (IS_ERR(req)) {
-		d_drop(dentry);
-		return PTR_ERR(req);
-	}
-
-	req->r_dentry = dget(dentry);
-	req->r_num_caps = 2;
-	req->r_locked_dir = dir;
-	req->r_args.mknod.mode = cpu_to_le32(mode);
-	req->r_args.mknod.rdev = cpu_to_le32(rdev);
-	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
-	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
-
-	ret = ceph_mdsc_do_request(mdsc, dir, req);
-	if (ret == 0 && !req->r_reply_info.head->is_dentry)
-		ret = ceph_handle_notrace_create(dir, dentry);
-	ceph_mdsc_put_request(req);
-
-	if (ret != 0)
-		d_drop(dentry);
-	return ret;
-}
-
-/*
- * create: with a nameidata (open intent) the work is delegated to
- * ceph_lookup_open(); without one we fall back to a mknod of a regular
- * file.  Refused with -EROFS inside snapshots.
- */
-static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
-		       struct nameidata *nd)
-{
-	dout("create in dir %p dentry %p name '%.*s'\n",
-	     dir, dentry, dentry->d_name.len, dentry->d_name.name);
-
-	if (ceph_snap(dir) != CEPH_NOSNAP)
-		return -EROFS;
-
-	/* fall back to mknod */
-	if (!nd)
-		return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
-
-	BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
-	dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
-	/* hrm, what should i do here if we get aliased? */
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-	return 0;
-}
-
-/*
- * symlink: send a SYMLINK request for 'dentry' in 'dir', with the link
- * target 'dest' carried in r_path2.  Refused with -EROFS inside
- * snapshots.  The dentry is dropped on any failure.
- *
- * Returns 0 on success, -ENOMEM if the request or the copy of the
- * target cannot be allocated, or the error from the MDS request.
- */
-static int ceph_symlink(struct inode *dir, struct dentry *dentry,
-			const char *dest)
-{
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	struct ceph_mds_request *req;
-	int err;
-
-	if (ceph_snap(dir) != CEPH_NOSNAP)
-		return -EROFS;
-
-	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
-	if (IS_ERR(req)) {
-		d_drop(dentry);
-		return PTR_ERR(req);
-	}
-	req->r_dentry = dget(dentry);
-	req->r_num_caps = 2;
-	req->r_path2 = kstrdup(dest, GFP_NOFS);
-	if (!req->r_path2) {
-		/* don't send a symlink request that has lost its target */
-		err = -ENOMEM;
-		goto out_put;
-	}
-	req->r_locked_dir = dir;
-	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
-	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
-	err = ceph_mdsc_do_request(mdsc, dir, req);
-	if (!err && !req->r_reply_info.head->is_dentry)
-		err = ceph_handle_notrace_create(dir, dentry);
-out_put:
-	ceph_mdsc_put_request(req);
-	if (err)
-		d_drop(dentry);
-	return err;
-}
-
-/*
- * mkdir: a MKDIR request in a live directory, or a MKSNAP request when
- * 'dir' is a snapdir.  Anywhere else inside a snapshot it fails with
- * -EROFS.  The dentry is dropped on a negative result.
- */
-static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	struct ceph_mds_request *req;
-	int ret;
-	int op;
-
-	if (ceph_snap(dir) == CEPH_SNAPDIR) {
-		/* mkdir .snap/foo is a MKSNAP */
-		op = CEPH_MDS_OP_MKSNAP;
-		dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
-		     dentry->d_name.len, dentry->d_name.name, dentry);
-	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
-		dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
-		op = CEPH_MDS_OP_MKDIR;
-	} else {
-		ret = -EROFS;
-		goto done;
-	}
-
-	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
-	if (IS_ERR(req)) {
-		ret = PTR_ERR(req);
-		goto done;
-	}
-
-	req->r_dentry = dget(dentry);
-	req->r_num_caps = 2;
-	req->r_locked_dir = dir;
-	req->r_args.mkdir.mode = cpu_to_le32(mode);
-	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
-	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
-
-	ret = ceph_mdsc_do_request(mdsc, dir, req);
-	if (ret == 0 && !req->r_reply_info.head->is_dentry)
-		ret = ceph_handle_notrace_create(dir, dentry);
-	ceph_mdsc_put_request(req);
-done:
-	if (ret < 0)
-		d_drop(dentry);
-	return ret;
-}
-
-/*
- * link: send a LINK request making 'dentry' in 'dir' another name for
- * old_dentry's inode.  Refused with -EROFS inside snapshots.  On
- * failure the new dentry is dropped; on success without a trace we
- * instantiate it ourselves with a new reference to the old inode.
- */
-static int ceph_link(struct dentry *old_dentry, struct inode *dir,
-		     struct dentry *dentry)
-{
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	struct ceph_mds_request *req;
-	int ret;
-
-	if (ceph_snap(dir) != CEPH_NOSNAP)
-		return -EROFS;
-
-	dout("link in dir %p old_dentry %p dentry %p\n", dir,
-	     old_dentry, dentry);
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
-	if (IS_ERR(req)) {
-		d_drop(dentry);
-		return PTR_ERR(req);
-	}
-
-	req->r_dentry = dget(dentry);
-	req->r_num_caps = 2;
-	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
-	req->r_locked_dir = dir;
-	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
-	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
-
-	ret = ceph_mdsc_do_request(mdsc, dir, req);
-	if (ret == 0) {
-		if (!req->r_reply_info.head->is_dentry)
-			d_instantiate(dentry, igrab(old_dentry->d_inode));
-	} else {
-		d_drop(dentry);
-	}
-	ceph_mdsc_put_request(req);
-	return ret;
-}
-
-/*
- * Work out which caps to release on an inode that is about to lose a
- * link.  LINK_SHARED and LINK_EXCL are always dropped.  If this looks
- * like the last link (i_nlink == 1), every cap that is neither wanted
- * nor PIN is dropped as well and the inode is flagged CEPH_I_NODELAY.
- *
- * Returns the mask of caps to drop.
- */
-static int drop_caps_for_unlink(struct inode *inode)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int mask = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
-
-	spin_lock(&inode->i_lock);
-	if (inode->i_nlink == 1) {
-		mask |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
-		ci->i_ceph_flags |= CEPH_I_NODELAY;
-	}
-	spin_unlock(&inode->i_lock);
-
-	return mask;
-}
-
-/*
- * rmdir and unlink differ only by the metadata op code.  Inside a
- * snapdir the operation is an RMSNAP; anywhere else inside a snapshot
- * it fails with -EROFS.
- */
-static int ceph_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	struct inode *inode = dentry->d_inode;
-	struct ceph_mds_request *req;
-	int ret;
-	int op;
-
-	if (ceph_snap(dir) == CEPH_SNAPDIR) {
-		/* rmdir .snap/foo is RMSNAP */
-		dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
-		     dentry->d_name.name, dentry);
-		op = CEPH_MDS_OP_RMSNAP;
-	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
-		dout("unlink/rmdir dir %p dn %p inode %p\n",
-		     dir, dentry, inode);
-		if ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR)
-			op = CEPH_MDS_OP_RMDIR;
-		else
-			op = CEPH_MDS_OP_UNLINK;
-	} else {
-		return -EROFS;
-	}
-
-	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
-	req->r_dentry = dget(dentry);
-	req->r_num_caps = 2;
-	req->r_locked_dir = dir;
-	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
-	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
-	req->r_inode_drop = drop_caps_for_unlink(inode);
-
-	ret = ceph_mdsc_do_request(mdsc, dir, req);
-	/* no trace in the reply: remove the dentry ourselves */
-	if (ret == 0 && !req->r_reply_info.head->is_dentry)
-		d_delete(dentry);
-	ceph_mdsc_put_request(req);
-	return ret;
-}
-
-/*
- * rename: send a RENAME request moving old_dentry (in old_dir) to
- * new_dentry (in new_dir).
- *
- * Returns -EXDEV if the two directories are in different snap
- * contexts, -EROFS if either is inside a snapshot, otherwise the
- * result of the MDS request.
- */
-static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
-{
- struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
- struct ceph_mds_request *req;
- int err;
-
- if (ceph_snap(old_dir) != ceph_snap(new_dir))
- return -EXDEV;
- if (ceph_snap(old_dir) != CEPH_NOSNAP ||
- ceph_snap(new_dir) != CEPH_NOSNAP)
- return -EROFS;
- dout("rename dir %p dentry %p to dir %p dentry %p\n",
- old_dir, old_dentry, new_dir, new_dentry);
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
- /* r_dentry is the destination, r_old_dentry the source */
- req->r_dentry = dget(new_dentry);
- req->r_num_caps = 2;
- req->r_old_dentry = dget(old_dentry);
- req->r_locked_dir = new_dir;
- req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
- req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
- req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- /* release LINK_RDCACHE on source inode (mds will lock it) */
- req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
- /* an existing target is about to lose a link, as in unlink */
- if (new_dentry->d_inode)
- req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
- err = ceph_mdsc_do_request(mdsc, old_dir, req);
- if (!err && !req->r_reply_info.head->is_dentry) {
- /*
- * Normally d_move() is done by fill_trace (called by
- * do_request, above). If there is no trace, we need
- * to do it here.
- */
- d_move(old_dentry, new_dentry);
- }
- ceph_mdsc_put_request(req);
- return err;
-}
-
-
-/*
- * Check if dentry lease is valid. If not, delete the lease. Try to
- * renew if the lease is more than half up.
- *
- * Returns 1 if the lease is valid, 0 otherwise. The renew message, if
- * any, is sent after d_lock has been released.
- */
-static int dentry_lease_is_valid(struct dentry *dentry)
-{
- struct ceph_dentry_info *di;
- struct ceph_mds_session *s;
- int valid = 0;
- u32 gen;
- unsigned long ttl;
- struct ceph_mds_session *session = NULL;
- struct inode *dir = NULL;
- u32 seq = 0;
-
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- if (di && di->lease_session) {
- s = di->lease_session;
- /* snapshot the session's cap generation and ttl under its lock */
- spin_lock(&s->s_cap_lock);
- gen = s->s_cap_gen;
- ttl = s->s_cap_ttl;
- spin_unlock(&s->s_cap_lock);
-
- /* valid only if the generation matches and neither deadline has passed */
- if (di->lease_gen == gen &&
- time_before(jiffies, dentry->d_time) &&
- time_before(jiffies, ttl)) {
- valid = 1;
- if (di->lease_renew_after &&
- time_after(jiffies, di->lease_renew_after)) {
- /* we should renew */
- dir = dentry->d_parent->d_inode;
- session = ceph_get_mds_session(s);
- seq = di->lease_seq;
- /* cleared so the next check does not trigger another renew */
- di->lease_renew_after = 0;
- di->lease_renew_from = jiffies;
- }
- } else {
- __ceph_mdsc_drop_dentry_lease(dentry);
- }
- }
- spin_unlock(&dentry->d_lock);
-
- if (session) {
- ceph_mdsc_lease_send_msg(session, dir, dentry,
- CEPH_MDS_LEASE_RENEW, seq);
- ceph_put_mds_session(session);
- }
- dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
- return valid;
-}
-
-/*
- * Check if directory-wide content lease/cap is valid.
- *
- * Returns the result of __ceph_caps_issued_mask() for
- * CEPH_CAP_FILE_SHARED on @dir when the dentry was recorded under the
- * directory's current i_shared_gen, and 0 otherwise.  A dentry without
- * a ceph_dentry_info has no generation to compare and is reported as
- * not valid.
- */
-static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
-{
-	struct ceph_inode_info *ci = ceph_inode(dir);
-	struct ceph_dentry_info *di = ceph_dentry(dentry);
-	int valid = 0;
-
-	/*
-	 * The other users of ceph_dentry() in this file tolerate a NULL
-	 * result; do the same here instead of dereferencing it.
-	 */
-	if (!di)
-		return 0;
-
-	spin_lock(&dir->i_lock);
-	if (ci->i_shared_gen == di->lease_shared_gen)
-		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
-	spin_unlock(&dir->i_lock);
-	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
-	     dir, (unsigned)ci->i_shared_gen, dentry,
-	     (unsigned)di->lease_shared_gen, valid);
-	return valid;
-}
-
-/*
- * Check if cached dentry can be trusted.
- *
- * Returns 1 (after touching the dentry in the private LRU) when the
- * dentry may be used, or 0 after d_drop()ing it.
- */
-static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
-	struct inode *dir = dentry->d_parent->d_inode;
-	int trusted = 0;
-
-	dout("d_revalidate %p '%.*s' inode %p\n", dentry,
-	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-
-	if (ceph_snap(dir) != CEPH_NOSNAP) {
-		/* always trust cached snapped dentries, snapdir dentry */
-		dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
-		     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-		trusted = 1;
-	} else if (dentry->d_inode &&
-		   ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
-		trusted = 1;
-	} else if (dentry_lease_is_valid(dentry)) {
-		trusted = 1;
-	} else if (dir_lease_is_valid(dir, dentry)) {
-		trusted = 1;
-	}
-
-	if (!trusted) {
-		dout("d_revalidate %p invalid\n", dentry);
-		d_drop(dentry);
-		return 0;
-	}
-
-	ceph_dentry_lru_touch(dentry);
-	return 1;
-}
-
-/*
- * When a dentry is released, clear the dir I_COMPLETE if it was part
- * of the current dir gen, then drop the dentry from the private LRU,
- * put its lease session (if any) and free its ceph_dentry_info.
- *
- * A dentry that never had a ceph_dentry_info attached has nothing to
- * undo; previously that case dereferenced the NULL pointer whenever the
- * parent had an inode.
- */
-static void ceph_dentry_release(struct dentry *dentry)
-{
-	struct ceph_dentry_info *di = ceph_dentry(dentry);
-	struct inode *parent_inode = dentry->d_parent->d_inode;
-
-	if (!di)
-		return;
-
-	if (parent_inode) {
-		struct ceph_inode_info *ci = ceph_inode(parent_inode);
-
-		spin_lock(&parent_inode->i_lock);
-		if (ci->i_shared_gen == di->lease_shared_gen) {
-			dout(" clearing %p complete (d_release)\n",
-			     parent_inode);
-			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-			ci->i_release_count++;
-		}
-		spin_unlock(&parent_inode->i_lock);
-	}
-
-	ceph_dentry_lru_del(dentry);
-	if (di->lease_session)
-		ceph_put_mds_session(di->lease_session);
-	kmem_cache_free(ceph_dentry_cachep, di);
-	dentry->d_fsdata = NULL;
-}
-
-/*
- * d_revalidate for snapdir dentries: unconditionally reports the dentry
- * as valid.
- *
- * Eventually, we'll want to revalidate snapped metadata
- * too... probably...
- */
-static int ceph_snapdir_d_revalidate(struct dentry *dentry,
-				     struct nameidata *nd)
-{
-	return 1;
-}
-
-
-
-/*
- * read() on a dir.  This weird interface hack only works if mounted
- * with '-o dirstat'.
- *
- * The first read on a file formats the directory's entry, recursive
- * entry, recursive byte and rctime counters into a text buffer kept in
- * the ceph_file_info; reads are then served from that buffer at *ppos.
- *
- * Returns the number of bytes copied, 0 at end of buffer or for a
- * zero-length request, -EISDIR without the DIRSTAT option, -ENOMEM if
- * the buffer cannot be allocated, -EFAULT if nothing could be copied.
- */
-static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
-			     loff_t *ppos)
-{
-	struct ceph_file_info *cf = file->private_data;
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int left;
-
-	if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
-		return -EISDIR;
-
-	if (!cf->dir_info) {
-		/* eight fixed-width lines; far below the 1024 bytes reserved */
-		cf->dir_info = kmalloc(1024, GFP_NOFS);
-		if (!cf->dir_info)
-			return -ENOMEM;
-		cf->dir_info_len =
-			sprintf(cf->dir_info,
-				"entries: %20lld\n"
-				" files: %20lld\n"
-				" subdirs: %20lld\n"
-				"rentries: %20lld\n"
-				" rfiles: %20lld\n"
-				" rsubdirs: %20lld\n"
-				"rbytes: %20lld\n"
-				"rctime: %10ld.%09ld\n",
-				ci->i_files + ci->i_subdirs,
-				ci->i_files,
-				ci->i_subdirs,
-				ci->i_rfiles + ci->i_rsubdirs,
-				ci->i_rfiles,
-				ci->i_rsubdirs,
-				ci->i_rbytes,
-				(long)ci->i_rctime.tv_sec,
-				(long)ci->i_rctime.tv_nsec);
-	}
-
-	if (*ppos >= cf->dir_info_len)
-		return 0;
-	size = min_t(unsigned, size, cf->dir_info_len-*ppos);
-	/*
-	 * Nothing requested: without this, copy_to_user() returns 0 ==
-	 * size and the check below would report -EFAULT.
-	 */
-	if (!size)
-		return 0;
-	left = copy_to_user(buf, cf->dir_info + *ppos, size);
-	if (left == size)
-		return -EFAULT;
-	*ppos += (size - left);
-	return size - left;
-}
-
-/*
- * an fsync() on a dir will wait for any uncommitted directory
- * operations to commit.
- *
- * Returns 0 once the waits complete (or if i_unsafe_dirops is empty),
- * or -EIO if a request with r_timeout set times out.
- */
-static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
-			  int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct list_head *head = &ci->i_unsafe_dirops;
-	struct ceph_mds_request *req;
-	u64 last_tid;
-	int ret = 0;
-
-	dout("dir_fsync %p\n", inode);
-	spin_lock(&ci->i_unsafe_lock);
-	if (list_empty(head))
-		goto out;
-
-	/* the tid of the list's last entry bounds the loop below */
-	req = list_entry(head->prev,
-			 struct ceph_mds_request, r_unsafe_dir_item);
-	last_tid = req->r_tid;
-
-	do {
-		/* hold a ref so req stays valid while the lock is dropped */
-		ceph_mdsc_get_request(req);
-		spin_unlock(&ci->i_unsafe_lock);
-		dout("dir_fsync %p wait on tid %llu (until %llu)\n",
-		     inode, req->r_tid, last_tid);
-		if (req->r_timeout) {
-			ret = wait_for_completion_timeout(
-				&req->r_safe_completion, req->r_timeout);
-			if (ret > 0)
-				ret = 0;
-			else if (ret == 0)
-				ret = -EIO; /* timed out */
-		} else {
-			wait_for_completion(&req->r_safe_completion);
-		}
-		spin_lock(&ci->i_unsafe_lock);
-		ceph_mdsc_put_request(req);
-
-		if (ret || list_empty(head))
-			break;
-		/* continue with whatever is now first on the list */
-		req = list_entry(head->next,
-				 struct ceph_mds_request, r_unsafe_dir_item);
-	} while (req->r_tid < last_tid);
-out:
-	spin_unlock(&ci->i_unsafe_lock);
-	return ret;
-}
-
-/*
- * We maintain a private dentry LRU.
- *
- * FIXME: this needs to be changed to a per-mds lru to be useful.
- */
-
-/*
- * Append the dentry to the tail of the mds client's dentry_lru and bump
- * num_dentry.  Does nothing for a dentry without a ceph_dentry_info.
- */
-void ceph_dentry_lru_add(struct dentry *dn)
-{
-	struct ceph_dentry_info *info = ceph_dentry(dn);
-	struct ceph_mds_client *mdsc;
-
-	dout("dentry_lru_add %p %p\t%.*s\n",
-	     info, dn, dn->d_name.len, dn->d_name.name);
-	if (!info)
-		return;
-
-	mdsc = &ceph_client(dn->d_sb)->mdsc;
-	spin_lock(&mdsc->dentry_lru_lock);
-	list_add_tail(&info->lru, &mdsc->dentry_lru);
-	mdsc->num_dentry++;
-	spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-/*
- * Move the dentry to the tail of the mds client's dentry_lru.  Does
- * nothing for a dentry without a ceph_dentry_info.
- */
-void ceph_dentry_lru_touch(struct dentry *dn)
-{
-	struct ceph_dentry_info *info = ceph_dentry(dn);
-	struct ceph_mds_client *mdsc;
-
-	dout("dentry_lru_touch %p %p\t%.*s\n",
-	     info, dn, dn->d_name.len, dn->d_name.name);
-	if (!info)
-		return;
-
-	mdsc = &ceph_client(dn->d_sb)->mdsc;
-	spin_lock(&mdsc->dentry_lru_lock);
-	list_move_tail(&info->lru, &mdsc->dentry_lru);
-	spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-/*
- * Unlink the dentry from the mds client's dentry_lru and decrement
- * num_dentry.  Does nothing for a dentry without a ceph_dentry_info.
- */
-void ceph_dentry_lru_del(struct dentry *dn)
-{
-	struct ceph_dentry_info *info = ceph_dentry(dn);
-	struct ceph_mds_client *mdsc;
-
-	dout("dentry_lru_del %p %p\t%.*s\n",
-	     info, dn, dn->d_name.len, dn->d_name.name);
-	if (!info)
-		return;
-
-	mdsc = &ceph_client(dn->d_sb)->mdsc;
-	spin_lock(&mdsc->dentry_lru_lock);
-	list_del_init(&info->lru);
-	mdsc->num_dentry--;
-	spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-/* File operations installed on Ceph directories. */
-const struct file_operations ceph_dir_fops = {
-	.read = ceph_read_dir,
-	.readdir = ceph_readdir,
-	.llseek = ceph_dir_llseek,
-	.open = ceph_open,
-	.release = ceph_release,
-	.unlocked_ioctl = ceph_ioctl,
-	.fsync = ceph_dir_fsync,
-};
-
-/* Inode operations installed on Ceph directories. */
-const struct inode_operations ceph_dir_iops = {
-	.lookup = ceph_lookup,
-	.permission = ceph_permission,
-	.getattr = ceph_getattr,
-	.setattr = ceph_setattr,
-	.setxattr = ceph_setxattr,
-	.getxattr = ceph_getxattr,
-	.listxattr = ceph_listxattr,
-	.removexattr = ceph_removexattr,
-	.mknod = ceph_mknod,
-	.symlink = ceph_symlink,
-	.mkdir = ceph_mkdir,
-	.link = ceph_link,
-	.unlink = ceph_unlink,
-	/* rmdir shares the unlink implementation */
-	.rmdir = ceph_unlink,
-	.rename = ceph_rename,
-	.create = ceph_create,
-};
-
-/* Dentry operations: lease-based revalidation plus per-dentry cleanup. */
-struct dentry_operations ceph_dentry_ops = {
-	.d_revalidate = ceph_d_revalidate,
-	.d_release = ceph_dentry_release,
-};
-
-/* Dentry operations using the always-valid snapdir revalidate. */
-struct dentry_operations ceph_snapdir_dentry_ops = {
-	.d_revalidate = ceph_snapdir_d_revalidate,
-};
-
-/* Dentry operations with no callbacks set. */
-struct dentry_operations ceph_snap_dentry_ops = {
-};
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/exportfs.h>
-#include <asm/unaligned.h>
-
-#include "super.h"
-
-/*
- * NFS export support
- *
- * NFS re-export of a ceph mount is, at present, only semireliable.
- * The basic issue is that the Ceph architecture doesn't lend itself
- * well to generating filehandles that will remain valid forever.
- *
- * So, we do our best. If you're lucky, your inode will be in the
- * client's cache. If it's not, and you have a connectable fh, then
- * the MDS server may be able to find it for you. Otherwise, you get
- * ESTALE.
- *
- * There are ways to make this more reliable, but in the non-connectable
- * fh case, we won't ever work perfectly, and in the connectable case,
- * some changes are needed on the MDS side to work better.
- */
-
-/*
- * Basic fh: just the inode number.  Packed, because the struct is
- * overlaid directly on the raw handle buffer.
- */
-struct ceph_nfs_fh {
-	u64 ino;
-} __attribute__ ((packed));
-
-/*
- * Larger 'connectable' fh that includes parent ino and name hash.
- * Use this whenever possible, as it works more reliably.
- * Packed for the same reason as ceph_nfs_fh.
- */
-struct ceph_nfs_confh {
-	u64 ino, parent_ino;
-	u32 parent_name_hash;
-} __attribute__ ((packed));
-
-/*
- * Encode an NFS file handle for @dentry into @rawfh.
- *
- * @max_len holds the capacity of @rawfh on entry and the amount used
- * on return.  The exportfs layer counts both in 32-bit words (hence the
- * u32 buffer), so the struct sizes are converted from bytes before they
- * are compared or stored.
- *
- * Returns 2 for a connectable handle (struct ceph_nfs_confh), 1 for a
- * basic one (struct ceph_nfs_fh), -EINVAL for snapshot inodes, and
- * -ENOSPC when the buffer cannot hold the handle that is required.
- */
-static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
-			  int connectable)
-{
-	struct ceph_nfs_fh *fh = (void *)rawfh;
-	struct ceph_nfs_confh *cfh = (void *)rawfh;
-	struct dentry *parent = dentry->d_parent;
-	struct inode *inode = dentry->d_inode;
-	const int fh_words = sizeof(*fh) / sizeof(u32);
-	const int cfh_words = sizeof(*cfh) / sizeof(u32);
-	int type;
-
-	/* don't re-export snaps */
-	if (ceph_snap(inode) != CEPH_NOSNAP)
-		return -EINVAL;
-
-	if (*max_len >= cfh_words) {
-		dout("encode_fh %p connectable\n", dentry);
-		cfh->ino = ceph_ino(dentry->d_inode);
-		cfh->parent_ino = ceph_ino(parent->d_inode);
-		cfh->parent_name_hash = parent->d_name.hash;
-		*max_len = cfh_words;
-		type = 2;
-	} else if (*max_len >= fh_words) {
-		/* room for the basic handle only */
-		if (connectable)
-			return -ENOSPC;
-		dout("encode_fh %p\n", dentry);
-		fh->ino = ceph_ino(dentry->d_inode);
-		*max_len = fh_words;
-		type = 1;
-	} else {
-		return -ENOSPC;
-	}
-	return type;
-}
-
-/*
- * convert regular fh to dentry
- *
- * Only inodes that ceph_find_inode() can locate are resolved;
- * anything else yields ERR_PTR(-ESTALE).  Allocation or dentry-init
- * failures are returned as ERR_PTR too.
- *
- * FIXME: we should try harder by querying the mds for the ino.
- */
-static struct dentry *__fh_to_dentry(struct super_block *sb,
-				     struct ceph_nfs_fh *fh)
-{
-	struct inode *inode;
-	struct dentry *dentry;
-	struct ceph_vino vino;
-	int err;
-
-	dout("__fh_to_dentry %llx\n", fh->ino);
-	vino.ino = fh->ino;
-	vino.snap = CEPH_NOSNAP;
-	inode = ceph_find_inode(sb, vino);
-	if (!inode)
-		return ERR_PTR(-ESTALE);
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
-	/*
-	 * d_obtain_alias() reports failure with ERR_PTR, never NULL, and
-	 * has already dropped the inode reference when it does.
-	 */
-	dentry = d_obtain_alias(inode);
-	if (IS_ERR(dentry)) {
-		pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
-		       fh->ino, inode);
-		return dentry;
-	}
-#else
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
-		       fh->ino, inode);
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-#endif
-
-	err = ceph_init_dentry(dentry);
-	if (err < 0) {
-		/* the dentry owns the inode reference now; drop the dentry */
-		dput(dentry);
-		return ERR_PTR(err);
-	}
-	dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
-	return dentry;
-}
-
-/*
- * convert connectable fh to dentry
- *
- * If the inode is not found locally, a CEPH_MDS_OP_LOOKUPHASH request
- * carrying the inode, the parent inode and the name hash (as a decimal
- * string in r_path2) is sent to any MDS and the local lookup is
- * retried.  Returns the dentry or an ERR_PTR: the request error or
- * -ESTALE if the inode is still missing, -ENOMEM on allocation failure.
- */
-static struct dentry *__cfh_to_dentry(struct super_block *sb,
-				      struct ceph_nfs_confh *cfh)
-{
-	struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
-	struct inode *inode;
-	struct dentry *dentry;
-	struct ceph_vino vino;
-	int err;
-
-	dout("__cfh_to_dentry %llx (%llx/%x)\n",
-	     cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
-
-	vino.ino = cfh->ino;
-	vino.snap = CEPH_NOSNAP;
-	inode = ceph_find_inode(sb, vino);
-	if (!inode) {
-		struct ceph_mds_request *req;
-
-		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
-					       USE_ANY_MDS);
-		if (IS_ERR(req))
-			return ERR_PTR(PTR_ERR(req));
-
-		req->r_ino1 = vino;
-		req->r_ino2.ino = cfh->parent_ino;
-		req->r_ino2.snap = CEPH_NOSNAP;
-		req->r_path2 = kmalloc(16, GFP_NOFS);
-		if (!req->r_path2) {
-			/* snprintf() below must not see a NULL buffer */
-			ceph_mdsc_put_request(req);
-			return ERR_PTR(-ENOMEM);
-		}
-		snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
-		req->r_num_caps = 1;
-		err = ceph_mdsc_do_request(mdsc, NULL, req);
-		ceph_mdsc_put_request(req);
-		inode = ceph_find_inode(sb, vino);
-		if (!inode)
-			return ERR_PTR(err ? err : -ESTALE);
-	}
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
-	/*
-	 * d_obtain_alias() reports failure with ERR_PTR, never NULL, and
-	 * has already dropped the inode reference when it does.
-	 */
-	dentry = d_obtain_alias(inode);
-	if (IS_ERR(dentry)) {
-		pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
-		       cfh->ino, inode);
-		return dentry;
-	}
-#else
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
-		       cfh->ino, inode);
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-#endif
-	err = ceph_init_dentry(dentry);
-	if (err < 0) {
-		/* the dentry owns the inode reference now; drop the dentry */
-		dput(dentry);
-		return ERR_PTR(err);
-	}
-	dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
-	return dentry;
-}
-
-/*
- * exportfs fh_to_dentry hook: type 1 handles are decoded as
- * struct ceph_nfs_fh, every other type as struct ceph_nfs_confh.
- *
- * @fh_len counts 32-bit words of fid->raw; a handle shorter than the
- * struct about to be read is rejected with -ESTALE instead of reading
- * past the data that was supplied.
- */
-static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
-					int fh_len, int fh_type)
-{
-	if (fh_type == 1) {
-		if (fh_len < (int)(sizeof(struct ceph_nfs_fh) / sizeof(u32)))
-			return ERR_PTR(-ESTALE);
-		return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
-	}
-
-	if (fh_len < (int)(sizeof(struct ceph_nfs_confh) / sizeof(u32)))
-		return ERR_PTR(-ESTALE);
-	return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
-}
-
-/*
- * get parent, if possible.
- *
- * Only connectable handles carry a parent inode number; type 1 handles
- * and handles too short to hold a struct ceph_nfs_confh yield -ESTALE,
- * as does a parent inode that ceph_find_inode() cannot locate.
- *
- * The lookup uses cfh->parent_ino: looking up cfh->ino would hand back
- * the object itself rather than its parent.
- *
- * FIXME: we could do better by querying the mds to discover the
- * parent.
- */
-static struct dentry *ceph_fh_to_parent(struct super_block *sb,
-					struct fid *fid,
-					int fh_len, int fh_type)
-{
-	struct ceph_nfs_confh *cfh = (void *)fid->raw;
-	struct ceph_vino vino;
-	struct inode *inode;
-	struct dentry *dentry;
-	int err;
-
-	if (fh_type == 1)
-		return ERR_PTR(-ESTALE);
-	/* fh_len counts 32-bit words of fid->raw */
-	if (fh_len < (int)(sizeof(*cfh) / sizeof(u32)))
-		return ERR_PTR(-ESTALE);
-
-	pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
-		 cfh->parent_name_hash);
-
-	vino.ino = cfh->parent_ino;
-	vino.snap = CEPH_NOSNAP;
-	inode = ceph_find_inode(sb, vino);
-	if (!inode)
-		return ERR_PTR(-ESTALE);
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
-	/*
-	 * d_obtain_alias() reports failure with ERR_PTR, never NULL, and
-	 * has already dropped the inode reference when it does.
-	 */
-	dentry = d_obtain_alias(inode);
-	if (IS_ERR(dentry)) {
-		pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
-		       cfh->parent_ino, inode);
-		return dentry;
-	}
-#else
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
-		       cfh->parent_ino, inode);
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-#endif
-	err = ceph_init_dentry(dentry);
-	if (err < 0) {
-		/* the dentry owns the inode reference now; drop the dentry */
-		dput(dentry);
-		return ERR_PTR(err);
-	}
-	dout("fh_to_parent %llx %p dentry %p\n", cfh->parent_ino, inode,
-	     dentry);
-	return dentry;
-}
-
-/* Export operations wiring the file-handle encode/decode helpers above. */
-const struct export_operations ceph_export_ops = {
-	.encode_fh = ceph_encode_fh,
-	.fh_to_dentry = ceph_fh_to_dentry,
-	.fh_to_parent = ceph_fh_to_parent,
-};
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/sched.h>
-#include <linux/file.h>
-#include <linux/namei.h>
-#include <linux/writeback.h>
-
-#include "super.h"
-#include "mds_client.h"
-
-/*
- * Ceph file operations
- *
- * Implement basic open/close functionality, and implement
- * read/write.
- *
- * We implement three modes of file I/O:
- * - buffered uses the generic_file_aio_{read,write} helpers
- *
- * - synchronous is used when there is multi-client read/write
- * sharing, avoids the page cache, and synchronously waits for an
- * ack from the OSD.
- *
- * - direct io takes the variant of the sync path that references
- * user pages directly.
- *
- * fsync() flushes and waits on dirty pages, but just queues metadata
- * for writeback: since the MDS can recover size and mtime there is no
- * need to wait for MDS acknowledgement.
- */
-
-
-/*
- * Prepare an open request.  Preallocate ceph_cap to avoid an
- * inopportune ENOMEM later.
- *
- * O_CREAT selects CEPH_MDS_OP_CREATE, otherwise CEPH_MDS_OP_OPEN.  Any
- * of O_WRONLY, O_RDWR, O_CREAT or O_TRUNC directs the request to the
- * auth MDS.  Returns the request, or the ERR_PTR from
- * ceph_mdsc_create_request().
- */
-static struct ceph_mds_request *
-prepare_open_request(struct super_block *sb, int flags, int create_mode)
-{
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
-	struct ceph_mds_request *req;
-	int op, want_auth;
-
-	if (flags & O_CREAT)
-		op = CEPH_MDS_OP_CREATE;
-	else
-		op = CEPH_MDS_OP_OPEN;
-
-	if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
-		want_auth = USE_AUTH_MDS;
-	else
-		want_auth = USE_ANY_MDS;
-
-	req = ceph_mdsc_create_request(mdsc, op, want_auth);
-	if (IS_ERR(req))
-		return req;
-
-	req->r_fmode = ceph_flags_to_mode(flags);
-	req->r_args.open.flags = cpu_to_le32(flags);
-	req->r_args.open.mode = cpu_to_le32(create_mode);
-	req->r_args.open.preferred = -1;
-	return req;
-}
-
-/*
- * initialize private struct file data.
- * if we fail, clean up by dropping fmode reference on the ceph_inode
- *
- * Regular files and directories get a zeroed ceph_file_info stored in
- * file->private_data.  Symlinks and special files keep no private data,
- * so their fmode reference is dropped right away; special files are
- * then handed to the open method of their own i_fop.
- *
- * Returns 0, -ENOMEM, or whatever that open method returns.
- */
-static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
-{
-	const int fmt = inode->i_mode & S_IFMT;
-	struct ceph_file_info *cf;
-
-	if (fmt == S_IFREG || fmt == S_IFDIR) {
-		dout("init_file %p %p 0%o (regular)\n", inode, file,
-		     inode->i_mode);
-		cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
-		if (!cf) {
-			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
-			return -ENOMEM;
-		}
-		cf->fmode = fmode;
-		cf->next_offset = 2;
-		file->private_data = cf;
-		BUG_ON(inode->i_fop->release != ceph_release);
-		return 0;
-	}
-
-	if (fmt == S_IFLNK) {
-		dout("init_file %p %p 0%o (symlink)\n", inode, file,
-		     inode->i_mode);
-		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
-		return 0;
-	}
-
-	dout("init_file %p %p 0%o (special)\n", inode, file,
-	     inode->i_mode);
-	/*
-	 * we need to drop the open ref now, since we don't
-	 * have .release set to ceph_release.
-	 */
-	ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
-	BUG_ON(inode->i_fop->release == ceph_release);
-
-	/* call the proper open fop */
-	return inode->i_fop->open(inode, file);
-}
-
-/*
- * If the filp already has private_data, that means the file was
- * already opened by intent during lookup, and we do nothing.
- *
- * If we already have the requisite capabilities, we can satisfy
- * the open request locally (no need to request new caps from the
- * MDS).  We do, however, need to inform the MDS (asynchronously)
- * if our wanted caps set expands.
- *
- * Returns 0 on success, -EROFS for a write open of a snapshot inode,
- * or the error from the MDS request / ceph_init_file().
- */
-int ceph_open(struct inode *inode, struct file *file)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	struct ceph_mds_request *req;
-	struct ceph_file_info *cf = file->private_data;
-	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
-	int err;
-	int flags, fmode, wanted;
-
-	if (cf) {
-		dout("open file %p is already opened\n", file);
-		return 0;
-	}
-
-	/* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
-	flags = file->f_flags & ~(O_CREAT|O_EXCL);
-	if (S_ISDIR(inode->i_mode))
-		flags = O_DIRECTORY; /* mds likes to know */
-
-	dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
-	     ceph_vinop(inode), file, flags, file->f_flags);
-	fmode = ceph_flags_to_mode(flags);
-	wanted = ceph_caps_for_mode(fmode);
-
-	/* snapped files are read-only */
-	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
-		return -EROFS;
-
-	/* trivially open snapdir */
-	if (ceph_snap(inode) == CEPH_SNAPDIR) {
-		spin_lock(&inode->i_lock);
-		__ceph_get_fmode(ci, fmode);
-		spin_unlock(&inode->i_lock);
-		return ceph_init_file(inode, file, fmode);
-	}
-
-	/*
-	 * No need to block if we have any caps.  Update wanted set
-	 * asynchronously.
-	 */
-	spin_lock(&inode->i_lock);
-	if (__ceph_is_any_real_caps(ci)) {
-		int mds_wanted = __ceph_caps_mds_wanted(ci);
-		int issued = __ceph_caps_issued(ci, NULL);
-
-		dout("open %p fmode %d want %s issued %s using existing\n",
-		     inode, fmode, ceph_cap_string(wanted),
-		     ceph_cap_string(issued));
-		__ceph_get_fmode(ci, fmode);
-		spin_unlock(&inode->i_lock);
-
-		/* adjust wanted? */
-		if ((issued & wanted) != wanted &&
-		    (mds_wanted & wanted) != wanted &&
-		    ceph_snap(inode) != CEPH_SNAPDIR)
-			ceph_check_caps(ci, 0, NULL);
-
-		return ceph_init_file(inode, file, fmode);
-	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
-		   (ci->i_snap_caps & wanted) == wanted) {
-		/* snapshot inode whose i_snap_caps already cover the open */
-		__ceph_get_fmode(ci, fmode);
-		spin_unlock(&inode->i_lock);
-		return ceph_init_file(inode, file, fmode);
-	}
-	spin_unlock(&inode->i_lock);
-
-	/* nothing usable locally: ask the MDS */
-	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
-	req = prepare_open_request(inode->i_sb, flags, 0);
-	if (IS_ERR(req)) {
-		err = PTR_ERR(req);
-		goto out;
-	}
-	req->r_inode = igrab(inode);
-	req->r_num_caps = 1;
-	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-	if (!err)
-		err = ceph_init_file(inode, file, req->r_fmode);
-	ceph_mdsc_put_request(req);
-	dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
-out:
-	return err;
-}
-
-
-/*
- * Do a lookup + open with a single request.
- *
- * If this succeeds, but some subsequent check in the vfs
- * may_open() fails, the struct *file gets cleaned up (i.e.
- * ceph_release gets called).  So fear not!
- */
-/*
- * flags
- *  path_lookup_open   -> LOOKUP_OPEN
- *  path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
- *
- * Returns the dentry produced by ceph_finish_lookup(), or an ERR_PTR
- * if the request could not be created.
- * NOTE(review): errors from ceph_handle_notrace_create() and
- * ceph_init_file() are stored in err but never returned -- confirm
- * that callers do not need them.
- */
-struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
-				struct nameidata *nd, int mode,
-				int locked_dir)
-{
-	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	struct file *file = nd->intent.open.file;
-	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
-	struct ceph_mds_request *req;
-	int err;
-	int flags = nd->intent.open.flags - 1; /* silly vfs! */
-
-	dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
-	     dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
-
-	/* do the open */
-	req = prepare_open_request(dir->i_sb, flags, mode);
-	if (IS_ERR(req))
-		return ERR_PTR(PTR_ERR(req));
-	req->r_dentry = dget(dentry);
-	req->r_num_caps = 2;
-	if (flags & O_CREAT) {
-		req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
-		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
-	}
-	req->r_locked_dir = dir; /* caller holds dir->i_mutex */
-	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-	dentry = ceph_finish_lookup(req, dentry, err);
-	/* a create reply without a dentry trace is handled separately */
-	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
-		err = ceph_handle_notrace_create(dir, dentry);
-	if (!err)
-		err = ceph_init_file(req->r_dentry->d_inode, file,
-				     req->r_fmode);
-	ceph_mdsc_put_request(req);
-	dout("ceph_lookup_open result=%p\n", dentry);
-	return dentry;
-}
-
-/*
- * Release method for regular files and directories: drop the fmode
- * reference taken at open and free everything hanging off the
- * ceph_file_info (cached readdir request, last name, dirstat buffer,
- * dentry ref), then the ceph_file_info itself.  Always returns 0.
- */
-int ceph_release(struct inode *inode, struct file *file)
-{
-	struct ceph_file_info *info = file->private_data;
-
-	dout("release inode %p file %p\n", inode, file);
-
-	ceph_put_fmode(ceph_inode(inode), info->fmode);
-	if (info->last_readdir)
-		ceph_mdsc_put_request(info->last_readdir);
-	kfree(info->last_name);
-	kfree(info->dir_info);
-	dput(info->dentry);
-	kmem_cache_free(ceph_file_cachep, info);
-
-	return 0;
-}
-
-/*
- * build a vector of user pages
- *
- * Pins @num_pages pages starting at user address @data with
- * get_user_pages() and returns them in a kmalloc'ed array; release
- * with put_page_vector().  @off and @len are not used.
- *
- * Returns ERR_PTR(-ENOMEM) if the array cannot be allocated, the
- * get_user_pages() error if that call fails, or ERR_PTR(-EFAULT) if
- * fewer pages than requested were pinned (those are released again, so
- * callers never see a partly filled vector).
- */
-static struct page **get_direct_page_vector(const char __user *data,
-					    int num_pages,
-					    loff_t off, size_t len)
-{
-	struct page **pages;
-	int rc;
-
-	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	/*
-	 * NOTE(review): the pages are requested without write access,
-	 * yet ceph_sync_read() uses this vector as the destination of
-	 * striped_read() -- confirm whether that path needs write=1.
-	 */
-	down_read(&current->mm->mmap_sem);
-	rc = get_user_pages(current, current->mm, (unsigned long)data,
-			    num_pages, 0, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-	if (rc < num_pages)
-		goto fail;
-	return pages;
-
-fail:
-	if (rc >= 0) {
-		/* short pin: undo it rather than hand back unset entries */
-		while (rc > 0)
-			put_page(pages[--rc]);
-		rc = -EFAULT;
-	}
-	kfree(pages);
-	return ERR_PTR(rc);
-}
-
-/*
- * Drop one reference on each of the first @num_pages entries with
- * put_page(), then free the array itself.
- */
-static void put_page_vector(struct page **pages, int num_pages)
-{
-	int idx = 0;
-
-	while (idx < num_pages) {
-		put_page(pages[idx]);
-		idx++;
-	}
-	kfree(pages);
-}
-
-/*
- * Free each of the first @num_pages entries as an order-0 page, then
- * free the array itself.
- */
-void ceph_release_page_vector(struct page **pages, int num_pages)
-{
-	int idx = 0;
-
-	while (idx < num_pages) {
-		__free_pages(pages[idx], 0);
-		idx++;
-	}
-	kfree(pages);
-}
-
-/*
- * allocate a vector of new pages
- *
- * Returns an array of @num_pages freshly allocated pages, or
- * ERR_PTR(-ENOMEM); pages obtained before a failure are released.
- */
-static struct page **alloc_page_vector(int num_pages)
-{
-	struct page **vec;
-	int filled = 0;
-
-	vec = kmalloc(sizeof(*vec) * num_pages, GFP_NOFS);
-	if (vec == NULL)
-		return ERR_PTR(-ENOMEM);
-
-	while (filled < num_pages) {
-		struct page *pg = alloc_page(GFP_NOFS);
-
-		if (!pg) {
-			/* frees the pages stored so far and the array */
-			ceph_release_page_vector(vec, filled);
-			return ERR_PTR(-ENOMEM);
-		}
-		vec[filled++] = pg;
-	}
-	return vec;
-}
-
-/*
- * copy user data into a page vector
- *
- * The first byte lands at offset (@off modulo the page size) in
- * pages[0]; each later page is filled from its start.  The in-page
- * offset is always tracked and the page index advances only when a
- * page has been filled, so a partial copy_from_user() resumes at the
- * right spot.  (Previously the index never advanced, so every chunk
- * was written into pages[0].)
- *
- * Returns @len, or -EFAULT if a chunk could not be copied at all.
- */
-static int copy_user_to_page_vector(struct page **pages,
-				    const char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, PAGE_CACHE_SIZE-po, left);
-		bad = copy_from_user(page_address(pages[i]) + po, data, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		po += l - bad;
-		if (po == PAGE_CACHE_SIZE) {
-			/* page full: continue at the start of the next one */
-			po = 0;
-			i++;
-		}
-	}
-	return len;
-}
-
-/*
- * copy user data from a page vector into a user pointer
- *
- * The first byte is taken from offset (@off modulo the page size) in
- * pages[0]; each later page is read from its start.  The page index
- * advances only once a page has been consumed, so a partial
- * copy_to_user() continues with the rest of the same page instead of
- * skipping to the next one.
- *
- * Returns @len, or -EFAULT if a chunk could not be copied at all.
- */
-static int copy_page_vector_to_user(struct page **pages, char __user *data,
-				    loff_t off, size_t len)
-{
-	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
-	int left = len;
-	int l, bad;
-
-	while (left > 0) {
-		l = min_t(int, left, PAGE_CACHE_SIZE-po);
-		bad = copy_to_user(data, page_address(pages[i]) + po, l);
-		if (bad == l)
-			return -EFAULT;
-		data += l - bad;
-		left -= l - bad;
-		po += l - bad;
-		if (po == PAGE_CACHE_SIZE) {
-			/* page consumed: continue at the start of the next */
-			po = 0;
-			i++;
-		}
-	}
-	return len;
-}
-
-/*
- * Zero an extent within a page vector.  Offset is relative to the
- * start of the first page.
- *
- * Exactly @len bytes starting at @off are cleared: a leading partial
- * page (never past the end of the extent), whole pages, then a
- * trailing partial page.  Extents shorter than a page are handled
- * like any other; a non-positive @len clears nothing.
- */
-static void zero_page_vector_range(int off, int len, struct page **pages)
-{
-	int i = off >> PAGE_CACHE_SHIFT;
-
-	dout("zero_page_vector_page %u~%u\n", off, len);
-	if (len <= 0)
-		return;
-
-	/* from here on, off is the offset inside pages[i] */
-	off &= ~PAGE_CACHE_MASK;
-
-	/* leading partial page? */
-	if (off) {
-		int end = min_t(int, PAGE_CACHE_SIZE, off + len);
-
-		dout("zeroing %d %p head from %d\n", i, pages[i], off);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
-		zero_user_segment(pages[i], off, end);
-#else
-		zero_user_page(pages[i], off, end - off, KM_USER0);
-#endif
-		/* the head consumes part of the extent */
-		len -= end - off;
-		i++;
-	}
-	while (len >= (int)PAGE_CACHE_SIZE) {
-		dout("zeroing %d %p\n", i, pages[i]);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
-		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
-#else
-		zero_user_page(pages[i], 0, PAGE_CACHE_SIZE, KM_USER0);
-#endif
-		len -= PAGE_CACHE_SIZE;
-		i++;
-	}
-	/* trailing partial page? */
-	if (len) {
-		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
-		zero_user_segment(pages[i], 0, len);
-#else
-		zero_user_page(pages[i], 0, len, KM_USER0);
-#endif
-	}
-}
-
-
-/*
- * Read a range of bytes striped over one or more objects.  Iterate over
- * objects we stripe over.  (That's not atomic, but good enough for now.)
- *
- * If we get a short result from the OSD, check against i_size; we need to
- * only return a short read to the caller if we hit EOF.
- *
- * @pages / @num_pages receive the data; the first byte goes at the
- * in-page offset of @off.  Gaps inside the range and a short tail that
- * lies within i_size are zero-filled.
- *
- * Returns the number of bytes covered from @off, or a negative error.
- */
-static int striped_read(struct inode *inode,
-			u64 off, u64 len,
-			struct page **pages, int num_pages)
-{
-	struct ceph_client *client = ceph_inode_to_client(inode);
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	u64 pos, this_len;
-	/*
-	 * first byte's offset in page: mask with ~PAGE_CACHE_MASK to
-	 * keep the low bits (~PAGE_CACHE_SIZE cleared a single bit only)
-	 */
-	int page_off = off & ~PAGE_CACHE_MASK;
-	int left, pages_left;
-	int read;
-	struct page **page_pos;
-	int ret;
-	bool hit_stripe, was_short;
-
-	/*
-	 * we may need to do multiple reads.  not atomic, unfortunately.
-	 */
-	pos = off;
-	left = len;
-	page_pos = pages;
-	pages_left = num_pages;
-	read = 0;
-
-more:
-	this_len = left;
-	ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
-				  &ci->i_layout, pos, &this_len,
-				  ci->i_truncate_seq,
-				  ci->i_truncate_size,
-				  page_pos, pages_left);
-	/* this_len comes back smaller when the request was clipped */
-	hit_stripe = this_len < left;
-	was_short = ret >= 0 && ret < this_len;
-	if (ret == -ENOENT)
-		ret = 0;
-	dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
-	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
-
-	if (ret > 0) {
-		int didpages =
-			((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
-
-		if (read < pos - off) {
-			dout(" zero gap %llu to %llu\n", off + read, pos);
-			zero_page_vector_range(page_off + read,
-					       pos - off - read, pages);
-		}
-		pos += ret;
-		read = pos - off;
-		left -= ret;
-		page_pos += didpages;
-		pages_left -= didpages;
-
-		/* hit stripe? */
-		if (left && hit_stripe)
-			goto more;
-	}
-
-	if (was_short) {
-		/* was original extent fully inside i_size? */
-		if (pos + left <= inode->i_size) {
-			dout("zero tail\n");
-			zero_page_vector_range(page_off + read, len - read,
-					       pages);
-			goto out;
-		}
-
-		/* check i_size */
-		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
-		if (ret < 0)
-			goto out;
-
-		/* hit EOF? */
-		if (pos >= inode->i_size)
-			goto out;
-
-		goto more;
-	}
-
-out:
-	if (ret >= 0)
-		ret = read;
-	dout("striped_read returns %d\n", ret);
-	return ret;
-}
-
-/*
- * Completely synchronous read and write methods.  Direct from __user
- * buffer to osd, or directly to user pages (if O_DIRECT).
- *
- * If the read spans object boundary, just do multiple reads.
- *
- * Returns the number of bytes read (advancing *poff by that amount)
- * or a negative error.  With O_DIRECT, a failure to write back the
- * page cache before reading is returned to the caller.
- */
-static ssize_t ceph_sync_read(struct file *file, char __user *data,
-			      unsigned len, loff_t *poff)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct page **pages;
-	u64 off = *poff;
-	int num_pages = calc_pages_for(off, len);
-	int ret;
-
-	dout("sync_read on file %p %llu~%u %s\n", file, off, len,
-	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
-
-	if (file->f_flags & O_DIRECT)
-		pages = get_direct_page_vector(data, num_pages, off, len);
-	else
-		pages = alloc_page_vector(num_pages);
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
-
-	if (file->f_flags & O_DIRECT) {
-		/*
-		 * flush any page cache pages in this range.  this
-		 * will make concurrent normal and O_DIRECT io slow,
-		 * but it will at least behave sensibly when they are
-		 * in sequence.
-		 */
-		ret = filemap_write_and_wait(inode->i_mapping);
-		if (ret < 0)
-			goto done;
-	}
-
-	ret = striped_read(inode, off, len, pages, num_pages);
-
-	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
-		ret = copy_page_vector_to_user(pages, data, off, ret);
-	if (ret >= 0)
-		*poff = off + ret;
-
-done:
-	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
-	else
-		ceph_release_page_vector(pages, num_pages);
-	dout("sync_read result %d\n", ret);
-	return ret;
-}
-
-/*
- * Write commit callback, called if we requested both an ACK and
- * ONDISK commit reply from the OSD.
- *
- * Unlinks the request from its r_unsafe_item list under
- * i_unsafe_lock and drops the CEPH_CAP_FILE_WR reference.
- */
-static void sync_write_commit(struct ceph_osd_request *req,
-			      struct ceph_msg *msg)
-{
-	struct ceph_inode_info *owner = ceph_inode(req->r_inode);
-
-	dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
-
-	spin_lock(&owner->i_unsafe_lock);
-	list_del_init(&req->r_unsafe_item);
-	spin_unlock(&owner->i_unsafe_lock);
-
-	ceph_put_cap_refs(owner, CEPH_CAP_FILE_WR);
-}
-
-/*
- * Synchronous write, straight from __user pointer or user pages (if
- * O_DIRECT).
- *
- * If write spans object boundary, just do multiple writes.  (For a
- * correct atomic write, we should e.g. take write locks on all
- * objects, rollback on failure, etc.)
- *
- * Each pass of the loop writes the part of the remaining range that
- * ceph_osdc_new_request() accepted (it may shrink len) and then moves
- * the file position and the user pointer forward by that amount.
- *
- * Returns the number of bytes written (and updates *offset), -EROFS
- * for snapshot inodes, or the first error encountered.
- */
-static ssize_t ceph_sync_write(struct file *file, const char __user *data,
-			       size_t left, loff_t *offset)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
-	struct ceph_osd_request *req;
-	struct page **pages;
-	int num_pages;
-	long long unsigned pos;
-	u64 len;
-	int written = 0;
-	int flags;
-	int do_sync = 0;
-	int check_caps = 0;
-	int ret;
-	struct timespec mtime = CURRENT_TIME;
-
-	if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
-		return -EROFS;
-
-	dout("sync_write on file %p %lld~%u %s\n", file, *offset,
-	     (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
-
-	if (file->f_flags & O_APPEND)
-		pos = i_size_read(inode);
-	else
-		pos = *offset;
-
-	flags = CEPH_OSD_FLAG_ORDERSNAP |
-		CEPH_OSD_FLAG_ONDISK |
-		CEPH_OSD_FLAG_WRITE;
-	if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
-		flags |= CEPH_OSD_FLAG_ACK;
-	else
-		do_sync = 1;
-
-	/*
-	 * we may need to do multiple writes here if we span an object
-	 * boundary.  this isn't atomic, unfortunately.  :(
-	 */
-more:
-	len = left;
-	req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
-				    ceph_vino(inode), pos, &len,
-				    CEPH_OSD_OP_WRITE, flags,
-				    ci->i_snap_realm->cached_context,
-				    do_sync,
-				    ci->i_truncate_seq, ci->i_truncate_size,
-				    &mtime, false, 2);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
-	num_pages = calc_pages_for(pos, len);
-
-	if (file->f_flags & O_DIRECT) {
-		pages = get_direct_page_vector(data, num_pages, pos, len);
-		if (IS_ERR(pages)) {
-			ret = PTR_ERR(pages);
-			goto out;
-		}
-
-		/*
-		 * throw out any page cache pages in this range.  this
-		 * may block.
-		 *
-		 * The end offset is inclusive and must be the last byte
-		 * of a page, so round the last written byte up to its
-		 * page end.
-		 */
-		if (len)
-			truncate_inode_pages_range(inode->i_mapping, pos,
-				(pos + len - 1) | (PAGE_CACHE_SIZE - 1));
-	} else {
-		pages = alloc_page_vector(num_pages);
-		if (IS_ERR(pages)) {
-			ret = PTR_ERR(pages);
-			goto out;
-		}
-		ret = copy_user_to_page_vector(pages, data, pos, len);
-		if (ret < 0) {
-			ceph_release_page_vector(pages, num_pages);
-			goto out;
-		}
-
-		if ((file->f_flags & O_SYNC) == 0) {
-			/* get a second commit callback */
-			req->r_safe_callback = sync_write_commit;
-			req->r_own_pages = 1;
-		}
-	}
-	req->r_pages = pages;
-	req->r_num_pages = num_pages;
-	req->r_inode = inode;
-
-	ret = ceph_osdc_start_request(&client->osdc, req, false);
-	if (!ret) {
-		if (req->r_safe_callback) {
-			/*
-			 * Add to inode unsafe list only after we
-			 * start_request so that a tid has been assigned.
-			 *
-			 * The request is the new entry and the inode's
-			 * list is the head; the reverse order would
-			 * splice the head into the request's own list
-			 * and orphan any entries already queued.
-			 * NOTE(review): tail insertion keeps submission
-			 * order -- confirm that is what the walkers of
-			 * i_unsafe_writes expect.
-			 */
-			spin_lock(&ci->i_unsafe_lock);
-			list_add_tail(&req->r_unsafe_item,
-				      &ci->i_unsafe_writes);
-			spin_unlock(&ci->i_unsafe_lock);
-			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
-		}
-		ret = ceph_osdc_wait_request(&client->osdc, req);
-	}
-
-	if (file->f_flags & O_DIRECT)
-		put_page_vector(pages, num_pages);
-	else if (file->f_flags & O_SYNC)
-		ceph_release_page_vector(pages, num_pages);
-
-out:
-	ceph_osdc_put_request(req);
-	if (ret == 0) {
-		pos += len;
-		written += len;
-		left -= len;
-		/* the next object must get the next part of the buffer */
-		data += len;
-		if (left)
-			goto more;
-
-		ret = written;
-		*offset = pos;
-		if (pos > i_size_read(inode))
-			check_caps = ceph_inode_set_size(inode, pos);
-		if (check_caps)
-			ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
-					NULL);
-	}
-	return ret;
-}
-
-/*
- * Wrap generic_file_aio_read with checks for cap bits on the inode.
- * Atomically grab references, so that those bits are not released
- * back to the MDS mid-read.
- *
- * Hmm, the sync read case isn't actually async... should it be?
- */
-static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- struct file *filp = iocb->ki_filp;
- loff_t *ppos = &iocb->ki_pos;
- size_t len = iov->iov_len;
- struct inode *inode = filp->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- ssize_t ret;
- int got = 0;
-
- dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
- inode, ceph_vinop(inode), pos, (unsigned)len, inode);
- __ceph_do_pending_vmtruncate(inode);
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
- &got, -1);
- if (ret < 0)
- goto out;
- dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)len,
- ceph_cap_string(got));
-
- if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
- (iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS))
- /* hmm, this isn't really async... */
- ret = ceph_sync_read(filp, iov->iov_base, len, ppos);
- else
- ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
-
-out:
- dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
- inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
- ceph_put_cap_refs(ci, got);
- return ret;
-}
-
-/*
- * Take cap references to avoid releasing caps to MDS mid-write.
- *
- * If we are synchronous, and write with an old snap context, the OSD
- * may return EOLDSNAPC. In that case, retry the write.. _after_
- * dropping our cap refs and allowing the pending snap to logically
- * complete _before_ this write occurs.
- *
- * If we are near ENOSPC, write synchronously.
- */
-static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
- loff_t endoff = pos + iov->iov_len;
- int got = 0;
- int ret;
-
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
-retry_snap:
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
- return -ENOSPC;
- __ceph_do_pending_vmtruncate(inode);
- dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- inode->i_size);
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
- &got, endoff);
- if (ret < 0)
- goto out;
-
- dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- ceph_cap_string(got));
-
- if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
- (iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
- ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
- &iocb->ki_pos);
- } else {
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
- if ((ret >= 0 || ret == -EIOCBQUEUED) &&
- ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
- || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL)))
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 32)
- ret = vfs_fsync_range(file, file->f_path.dentry,
- pos, pos + ret - 1, 1);
-#else
- ret = sync_page_range(inode, &inode->i_data, pos, ret);
-#endif
- }
- if (ret >= 0) {
- spin_lock(&inode->i_lock);
- __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
- spin_unlock(&inode->i_lock);
- }
-
-out:
- dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- ceph_cap_string(got));
- ceph_put_cap_refs(ci, got);
-
- if (ret == -EOLDSNAPC) {
- dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
- goto retry_snap;
- }
-
- return ret;
-}
-
-/*
- * llseek. be sure to verify file size on SEEK_END.
- */
-static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
-{
- struct inode *inode = file->f_mapping->host;
- int ret;
-
- mutex_lock(&inode->i_mutex);
- __ceph_do_pending_vmtruncate(inode);
- switch (origin) {
- case SEEK_END:
- ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
- if (ret < 0) {
- offset = ret;
- goto out;
- }
- offset += inode->i_size;
- break;
- case SEEK_CUR:
- /*
- * Here we special-case the lseek(fd, 0, SEEK_CUR)
- * position-querying operation. Avoid rewriting the "same"
- * f_pos value back to the file because a concurrent read(),
- * write() or lseek() might have altered it
- */
- if (offset == 0) {
- offset = file->f_pos;
- goto out;
- }
- offset += file->f_pos;
- break;
- }
-
- if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
- offset = -EINVAL;
- goto out;
- }
-
- /* Special lock needed here? */
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
-
-out:
- mutex_unlock(&inode->i_mutex);
- return offset;
-}
-
-const struct file_operations ceph_file_fops = {
- .open = ceph_open,
- .release = ceph_release,
- .llseek = ceph_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = ceph_aio_read,
- .aio_write = ceph_aio_write,
- .mmap = ceph_mmap,
- .fsync = ceph_fsync,
- .splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
- .unlocked_ioctl = ceph_ioctl,
- .compat_ioctl = ceph_ioctl,
-};
-
+++ /dev/null
-#!/bin/sh
-
-# run me from the root of a _linux_ git tree, and pass ceph tree root.
-cephtree=$1
-echo ceph tree at $cephtree.
-target=$2
-echo target is $target
-test -d .git || exit 0
-test -e include/linux/mm.h || exit 0
-test -e $cephtree/src/kernel/super.h || exit 0
-
-# copy into the tree
-mkdir -p $target/ceph
-mkdir $target/ceph/crush
-cp $cephtree/src/kernel/Makefile $target/ceph
-cp $cephtree/src/kernel/Kconfig $target/ceph
-cp $cephtree/src/kernel/*.[ch] $target/ceph
-cp $cephtree/src/kernel/crush/*.[ch] $target/ceph/crush
-cp $cephtree/src/kernel/ceph.txt Documentation/filesystems
-
-# build the patch sequence
-git branch -D series_start
-git branch series_start
-
-# fs/staging
-#git cherry-pick 5556036065d8b04b2f7dd439fbf0d710e295cd44
-
-git add Documentation/filesystems/ceph.txt
-git commit -s -F - <<EOF
-ceph: documentation
-
-Mount options, syntax.
-
-EOF
-
-git add $target/ceph/ceph_fs.h
-git add $target/ceph/ceph_fs.c
-git add $target/ceph/msgr.h
-git add $target/ceph/rados.h
-git add $target/ceph/ceph_strings.c
-git commit -s -F - <<EOF
-ceph: on-wire types
-
-These headers describe the types used to exchange messages between the
-Ceph client and various servers. All types are little-endian and
-packed. These headers are shared between the kernel and userspace, so
-all types are in terms of e.g. __u32.
-
-Additionally, we define a few magic values to identify the current
-version of the protocol(s) in use, so that discrepancies to be
-detected on mount.
-
-EOF
-
-git add $target/ceph/types.h
-git add $target/ceph/super.h
-git add $target/ceph/ceph_ver.h
-git add $target/ceph/ceph_debug.h
-git add $target/ceph/ceph_frag.h
-git add $target/ceph/ceph_frag.c
-git commit -s -F - <<EOF
-ceph: client types
-
-We first define constants, types, and prototypes for the kernel client
-proper.
-
-A few subsystems are defined separately later: the MDS, OSD, and
-monitor clients, and the messaging layer.
-
-EOF
-
-git add $target/ceph/buffer.h
-git add $target/ceph/buffer.c
-git commit -s -F - <<EOF
-ceph: ref counted buffer
-
-struct ceph_buffer is a simple ref-counted buffer. We transparently
-choose between kmalloc for small buffers and vmalloc for large ones.
-
-This is currently used only for allocating memory for xattr data.
-
-EOF
-
-git add $target/ceph/super.c
-git commit -s -F - <<EOF
-ceph: super.c
-
-Mount option parsing, client setup and teardown, and a few odds and
-ends (e.g., statfs).
-
-EOF
-
-
-git add $target/ceph/inode.c
-git add $target/ceph/xattr.c
-git commit -s -F - <<EOF
-ceph: inode operations
-
-Inode cache and inode operations. We also include routines to
-incorporate metadata structures returned by the MDS into the client
-cache, and some helpers to deal with file capabilities and metadata
-leases. The bulk of that work is done by fill_inode() and
-fill_trace().
-
-EOF
-
-git add $target/ceph/dir.c
-git commit -s -F - <<EOF
-ceph: directory operations
-
-Directory operations, including lookup, are defined here. We take
-advantage of lookup intents when possible. For the most part, we just
-need to build the proper requests for the metadata server(s) and
-pass things off to the mds_client.
-
-The results of most operations are normally incorporated into the
-client's cache when the reply is parsed by ceph_fill_trace().
-However, if the MDS replies without a trace (e.g., when retrying an
-update after an MDS failure recovery), some operation-specific cleanup
-may be needed.
-
-We can validate cached dentries in two ways. A per-dentry lease may
-be issued by the MDS, or a per-directory cap may be issued that acts
-as a lease on the entire directory. In the latter case, a 'gen' value
-is used to determine which dentries belong to the currently leased
-directory contents.
-
-We normally prepopulate the dcache and icache with readdir results.
-This makes subsequent lookups and getattrs avoid any server
-interaction. It also lets us satisfy readdir operation by peeking at
-the dcache IFF we hold the per-directory cap/lease, previously
-performed a readdir, and haven't dropped any of the resulting
-dentries.
-
-EOF
-
-git add $target/ceph/file.c
-git commit -s -F - <<EOF
-ceph: file operations
-
-File open and close operations, and read and write methods that ensure
-we have obtained the proper capabilities from the MDS cluster before
-performing IO on a file. We take references on held capabilities for
-the duration of the read/write to avoid prematurely releasing them
-back to the MDS.
-
-We implement two main paths for read and write: one that is buffered
-(and uses generic_aio_{read,write}), and one that is fully synchronous
-and blocking (operating either on a __user pointer or, if O_DIRECT,
-directly on user pages).
-
-EOF
-
-git add $target/ceph/addr.c
-git commit -s -F - <<EOF
-ceph: address space operations
-
-The ceph address space methods are concerned primarily with managing
-the dirty page accounting in the inode, which (among other things)
-must keep track of which snapshot context each page was dirtied in,
-and ensure that dirty data is written out to the OSDs in snapshort
-order.
-
-A writepage() on a page that is not currently writeable due to
-snapshot writeback ordering constraints is ignored (it was presumably
-called from kswapd).
-
-EOF
-
-git add $target/ceph/mds_client.h
-git add $target/ceph/mds_client.c
-git add $target/ceph/mdsmap.h
-git add $target/ceph/mdsmap.c
-git commit -s -F - <<EOF
-ceph: MDS client
-
-The MDS (metadata server) client is responsible for submitting
-requests to the MDS cluster and parsing the response. We decide which
-MDS to submit each request to based on cached information about the
-current partition of the directory hierarchy across the cluster. A
-stateful session is opened with each MDS before we submit requests to
-it, and a mutex is used to control the ordering of messages within
-each session.
-
-An MDS request may generate two responses. The first indicates the
-operation was a success and returns any result. A second reply is
-sent when the operation commits to disk. Note that locking on the MDS
-ensures that the results of updates are visible only to the updating
-client before the operation commits. Requests are linked to the
-containing directory so that an fsync will wait for them to commit.
-
-If an MDS fails and/or recovers, we resubmit requests as needed. We
-also reconnect existing capabilities to a recovering MDS to
-reestablish that shared session state. Old dentry leases are
-invalidated.
-
-EOF
-
-git add $target/ceph/osd_client.h
-git add $target/ceph/osd_client.c
-git add $target/ceph/osdmap.h
-git add $target/ceph/osdmap.c
-git commit -s -F - <<EOF
-ceph: OSD client
-
-The OSD client is responsible for reading and writing data from/to the
-object storage pool. This includes determining where objects are
-stored in the cluster, and ensuring that requests are retried or
-redirected in the event of a node failure or data migration.
-
-If an OSD does not respond before a timeout expires, keepalive
-messages are sent across the lossless, ordered communications channel
-to ensure that any break in the TCP is discovered. If the session
-does reset, a reconnection is attempted and affected requests are
-resent (by the message transport layer).
-
-EOF
-
-git add $target/ceph/crush/crush.h
-git add $target/ceph/crush/crush.c
-git add $target/ceph/crush/mapper.h
-git add $target/ceph/crush/mapper.c
-git add $target/ceph/crush/hash.h
-git commit -s -F - <<EOF
-ceph: CRUSH mapping algorithm
-
-CRUSH is a pseudorandom data distribution function designed to map
-inputs onto a dynamic hierarchy of devices, while minimizing the
-extent to which inputs are remapped when the devices are added or
-removed. It includes some features that are specifically useful for
-storage, most notably the ability to map each input onto a set of N
-devices that are separated across administrator-defined failure
-domains. CRUSH is used to distribute data across the cluster of Ceph
-storage nodes.
-
-More information about CRUSH can be found in this paper:
-
- http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
-
-EOF
-
-git add $target/ceph/mon_client.h
-git add $target/ceph/mon_client.c
-git commit -s -F - <<EOF
-ceph: monitor client
-
-The monitor cluster is responsible for managing cluster membership
-and state. The monitor client handles what minimal interaction
-the Ceph client has with it: checking for updated versions of the
-MDS and OSD maps, getting statfs() information, and unmounting.
-
-EOF
-
-git add $target/ceph/caps.c
-git commit -s -F - <<EOF
-ceph: capability management
-
-The Ceph metadata servers control client access to inode metadata and
-file data by issuing capabilities, granting clients permission to read
-and/or write both inode field and file data to OSDs (storage nodes).
-Each capability consists of a set of bits indicating which operations
-are allowed.
-
-If the client holds a *_SHARED cap, the client has a coherent value
-that can be safely read from the cached inode.
-
-In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the client
-is allowed to change inode attributes (e.g., file size, mtime), note
-its dirty state in the ceph_cap, and asynchronously flush that
-metadata change to the MDS.
-
-In the event of a conflicting operation (perhaps by another client),
-the MDS will revoke the conflicting client capabilities.
-
-In order for a client to cache an inode, it must hold a capability
-with at least one MDS server. When inodes are released, release
-notifications are batched and periodically sent en masse to the MDS
-cluster to release server state.
-
-EOF
-
-git add $target/ceph/snap.c
-git commit -s -F - <<EOF
-ceph: snapshot management
-
-Ceph snapshots rely on client cooperation in determining which
-operations apply to which snapshots, and appropriately flushing
-snapshotted data and metadata back to the OSD and MDS clusters.
-Because snapshots apply to subtrees of the file hierarchy and can be
-created at any time, there is a fair bit of bookkeeping required to
-make this work.
-
-Portions of the hierarchy that belong to the same set of snapshots
-are described by a single 'snap realm.' A 'snap context' describes
-the set of snapshots that exist for a given file or directory.
-
-EOF
-
-git add $target/ceph/decode.h
-git add $target/ceph/messenger.h
-git add $target/ceph/messenger.c
-git commit -s -F - <<EOF
-ceph: messenger library
-
-A generic message passing library is used to communicate with all
-other components in the Ceph file system. The messenger library
-provides ordered, reliable delivery of messages between two nodes in
-the system.
-
-This implementation is based on TCP.
-
-EOF
-
-git add $target/ceph/msgpool.h
-git add $target/ceph/msgpool.c
-git commit -s -F - <<EOF
-ceph: message pools
-
-The msgpool is a basic mempool_t-like structure to preallocate
-messages we expect to receive over the wire. This ensures we have the
-necessary memory preallocated to process replies to requests, or to
-process unsolicited messages from various servers.
-
-EOF
-
-git add $target/ceph/export.c
-git commit -s -F - <<EOF
-ceph: nfs re-export support
-
-Basic NFS re-export support is included. This mostly works. However,
-Ceph's MDS design precludes the ability to generate a (small)
-filehandle that will be valid forever, so this is of limited utility.
-
-EOF
-
-git apply $cephtree/src/kernel/ioctl-number.patch
-git add Documentation/ioctl/ioctl-number.txt
-git add $target/ceph/ioctl.h
-git add $target/ceph/ioctl.c
-git commit -s -F - <<EOF
-ceph: ioctls
-
-A few Ceph ioctls for getting and setting file layout (striping)
-parameters, and learning the identity and network address of the OSD a
-given region of a file is stored on.
-
-EOF
-
-git add $target/ceph/debugfs.c
-git commit -s -F - <<EOF
-ceph: debugfs
-
-Basic state information is available via /sys/kernel/debug/ceph,
-including instances of the client, fsids, current monitor, mds and osd
-maps, outstanding server requests, and hooks to adjust debug levels.
-
-EOF
-
-#git apply $cephtree/src/kernel/kbuild.staging.patch
-git apply $cephtree/src/kernel/kbuild.patch
-git add $target/ceph/Makefile
-git add $target/ceph/Kconfig
-git add $target/Kconfig
-git add $target/Makefile
-git apply $cephtree/src/kernel/maintainers.patch
-git add MAINTAINERS
-git commit -s -F - <<EOF
-ceph: Kconfig, Makefile
-
-Kconfig options and Makefile.
-
-EOF
-
-
-# build the patch files
-mkdir out
-rm out/*
-git-format-patch -s -o out -n series_start..HEAD
-
-cp 0000 out/0000
-echo --- >> out/0000
-git diff --stat series_start >> out/0000
\ No newline at end of file
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/smp_lock.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/namei.h>
-#include <linux/writeback.h>
-#include <linux/vmalloc.h>
-
-#include "super.h"
-#include "decode.h"
-
-/*
- * Ceph inode operations
- *
- * Implement basic inode helpers (get, alloc) and inode ops (getattr,
- * setattr, etc.), xattr helpers, and helpers for assimilating
- * metadata returned by the MDS into our cache.
- *
- * Also define helpers for doing asynchronous writeback, invalidation,
- * and truncation for the benefit of those who can't afford to block
- * (typically because they are in the message handler path).
- */
-
-static const struct inode_operations ceph_symlink_iops;
-
-static void ceph_inode_invalidate_pages(struct work_struct *work);
-
-/*
- * find or create an inode, given the ceph ino number
- */
-struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
-{
- struct inode *inode;
- ino_t t = ceph_vino_to_ino(vino);
-
- inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
- if (inode == NULL)
- return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
- dout("get_inode created new inode %p %llx.%llx ino %llx\n",
- inode, ceph_vinop(inode), (u64)inode->i_ino);
- unlock_new_inode(inode);
- }
-
- dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
- vino.snap, inode);
- return inode;
-}
-
-/*
- * get/constuct snapdir inode for a given directory
- */
-struct inode *ceph_get_snapdir(struct inode *parent)
-{
- struct ceph_vino vino = {
- .ino = ceph_ino(parent),
- .snap = CEPH_SNAPDIR,
- };
- struct inode *inode = ceph_get_inode(parent->i_sb, vino);
-
- BUG_ON(!S_ISDIR(parent->i_mode));
- if (IS_ERR(inode))
- return ERR_PTR(PTR_ERR(inode));
- inode->i_mode = parent->i_mode;
- inode->i_uid = parent->i_uid;
- inode->i_gid = parent->i_gid;
- inode->i_op = &ceph_dir_iops;
- inode->i_fop = &ceph_dir_fops;
- ceph_inode(inode)->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
- return inode;
-}
-
-const struct inode_operations ceph_file_iops = {
- .permission = ceph_permission,
- .setattr = ceph_setattr,
- .getattr = ceph_getattr,
- .setxattr = ceph_setxattr,
- .getxattr = ceph_getxattr,
- .listxattr = ceph_listxattr,
- .removexattr = ceph_removexattr,
-};
-
-
-/*
- * We use a 'frag tree' to keep track of the MDS's directory fragments
- * for a given inode (usually there is just a single fragment). We
- * need to know when a child frag is delegated to a new MDS, or when
- * it is flagged as replicated, so we can direct our requests
- * accordingly.
- */
-
-/*
- * find/create a frag in the tree
- */
-static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
- u32 f)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct ceph_inode_frag *frag;
- int c;
-
- p = &ci->i_fragtree.rb_node;
- while (*p) {
- parent = *p;
- frag = rb_entry(parent, struct ceph_inode_frag, node);
- c = ceph_frag_compare(f, frag->frag);
- if (c < 0)
- p = &(*p)->rb_left;
- else if (c > 0)
- p = &(*p)->rb_right;
- else
- return frag;
- }
-
- frag = kmalloc(sizeof(*frag), GFP_NOFS);
- if (!frag) {
- pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
- "frag %x\n", &ci->vfs_inode,
- ceph_vinop(&ci->vfs_inode), f);
- return ERR_PTR(-ENOMEM);
- }
- frag->frag = f;
- frag->split_by = 0;
- frag->mds = -1;
- frag->ndist = 0;
-
- rb_link_node(&frag->node, parent, p);
- rb_insert_color(&frag->node, &ci->i_fragtree);
-
- dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->vfs_inode), f);
- return frag;
-}
-
-/*
- * find a specific frag @f
- */
-struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
-{
- struct rb_node *n = ci->i_fragtree.rb_node;
-
- while (n) {
- struct ceph_inode_frag *frag =
- rb_entry(n, struct ceph_inode_frag, node);
- int c = ceph_frag_compare(f, frag->frag);
- if (c < 0)
- n = n->rb_left;
- else if (c > 0)
- n = n->rb_right;
- else
- return frag;
- }
- return NULL;
-}
-
-/*
- * Choose frag containing the given value @v. If @pfrag is
- * specified, copy the frag delegation info to the caller if
- * it is present.
- */
-u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
- struct ceph_inode_frag *pfrag,
- int *found)
-{
- u32 t = ceph_frag_make(0, 0);
- struct ceph_inode_frag *frag;
- unsigned nway, i;
- u32 n;
-
- if (found)
- *found = 0;
-
- mutex_lock(&ci->i_fragtree_mutex);
- while (1) {
- WARN_ON(!ceph_frag_contains_value(t, v));
- frag = __ceph_find_frag(ci, t);
- if (!frag)
- break; /* t is a leaf */
- if (frag->split_by == 0) {
- if (pfrag)
- memcpy(pfrag, frag, sizeof(*pfrag));
- if (found)
- *found = 1;
- break;
- }
-
- /* choose child */
- nway = 1 << frag->split_by;
- dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
- frag->split_by, nway);
- for (i = 0; i < nway; i++) {
- n = ceph_frag_make_child(t, frag->split_by, i);
- if (ceph_frag_contains_value(n, v)) {
- t = n;
- break;
- }
- }
- BUG_ON(i == nway);
- }
- dout("choose_frag(%x) = %x\n", v, t);
-
- mutex_unlock(&ci->i_fragtree_mutex);
- return t;
-}
-
-/*
- * Process dirfrag (delegation) info from the mds. Include leaf
- * fragment in tree ONLY if ndist > 0. Otherwise, only
- * branches/splits are included in i_fragtree)
- */
-static int ceph_fill_dirfrag(struct inode *inode,
- struct ceph_mds_reply_dirfrag *dirinfo)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_inode_frag *frag;
- u32 id = le32_to_cpu(dirinfo->frag);
- int mds = le32_to_cpu(dirinfo->auth);
- int ndist = le32_to_cpu(dirinfo->ndist);
- int i;
- int err = 0;
-
- mutex_lock(&ci->i_fragtree_mutex);
- if (ndist == 0) {
- /* no delegation info needed. */
- frag = __ceph_find_frag(ci, id);
- if (!frag)
- goto out;
- if (frag->split_by == 0) {
- /* tree leaf, remove */
- dout("fill_dirfrag removed %llx.%llx frag %x"
- " (no ref)\n", ceph_vinop(inode), id);
- rb_erase(&frag->node, &ci->i_fragtree);
- kfree(frag);
- } else {
- /* tree branch, keep and clear */
- dout("fill_dirfrag cleared %llx.%llx frag %x"
- " referral\n", ceph_vinop(inode), id);
- frag->mds = -1;
- frag->ndist = 0;
- }
- goto out;
- }
-
-
- /* find/add this frag to store mds delegation info */
- frag = __get_or_create_frag(ci, id);
- if (IS_ERR(frag)) {
- /* this is not the end of the world; we can continue
- with bad/inaccurate delegation info */
- pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
- ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
- err = -ENOMEM;
- goto out;
- }
-
- frag->mds = mds;
- frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
- for (i = 0; i < frag->ndist; i++)
- frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
- dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
- ceph_vinop(inode), frag->frag, frag->ndist);
-
-out:
- mutex_unlock(&ci->i_fragtree_mutex);
- return err;
-}
-
-
-/*
- * initialize a newly allocated inode.
- */
-struct inode *ceph_alloc_inode(struct super_block *sb)
-{
- struct ceph_inode_info *ci;
- int i;
-
- ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
- if (!ci)
- return NULL;
-
- dout("alloc_inode %p\n", &ci->vfs_inode);
-
- ci->i_version = 0;
- ci->i_time_warp_seq = 0;
- ci->i_ceph_flags = 0;
- ci->i_release_count = 0;
- ci->i_symlink = NULL;
-
- ci->i_fragtree = RB_ROOT;
- mutex_init(&ci->i_fragtree_mutex);
-
- ci->i_xattrs.blob = NULL;
- ci->i_xattrs.prealloc_blob = NULL;
- ci->i_xattrs.dirty = false;
- ci->i_xattrs.index = RB_ROOT;
- ci->i_xattrs.count = 0;
- ci->i_xattrs.names_size = 0;
- ci->i_xattrs.vals_size = 0;
- ci->i_xattrs.version = 0;
- ci->i_xattrs.index_version = 0;
-
- ci->i_caps = RB_ROOT;
- ci->i_auth_cap = NULL;
- ci->i_dirty_caps = 0;
- ci->i_flushing_caps = 0;
- INIT_LIST_HEAD(&ci->i_dirty_item);
- INIT_LIST_HEAD(&ci->i_flushing_item);
- ci->i_cap_flush_seq = 0;
- ci->i_cap_flush_last_tid = 0;
- memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
- init_waitqueue_head(&ci->i_cap_wq);
- ci->i_hold_caps_min = 0;
- ci->i_hold_caps_max = 0;
- INIT_LIST_HEAD(&ci->i_cap_delay_list);
- ci->i_cap_exporting_mds = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_issued = 0;
- INIT_LIST_HEAD(&ci->i_cap_snaps);
- ci->i_head_snapc = NULL;
- ci->i_snap_caps = 0;
-
- for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
- ci->i_nr_by_mode[i] = 0;
-
- ci->i_truncate_seq = 0;
- ci->i_truncate_size = 0;
- ci->i_truncate_pending = 0;
-
- ci->i_max_size = 0;
- ci->i_reported_size = 0;
- ci->i_wanted_max_size = 0;
- ci->i_requested_max_size = 0;
-
- ci->i_pin_ref = 0;
- ci->i_rd_ref = 0;
- ci->i_rdcache_ref = 0;
- ci->i_wr_ref = 0;
- ci->i_wrbuffer_ref = 0;
- ci->i_wrbuffer_ref_head = 0;
- ci->i_shared_gen = 0;
- ci->i_rdcache_gen = 0;
- ci->i_rdcache_revoking = 0;
-
- INIT_LIST_HEAD(&ci->i_unsafe_writes);
- INIT_LIST_HEAD(&ci->i_unsafe_dirops);
- spin_lock_init(&ci->i_unsafe_lock);
-
- ci->i_snap_realm = NULL;
- INIT_LIST_HEAD(&ci->i_snap_realm_item);
- INIT_LIST_HEAD(&ci->i_snap_flush_item);
-
- INIT_WORK(&ci->i_wb_work, ceph_inode_writeback);
- INIT_WORK(&ci->i_pg_inv_work, ceph_inode_invalidate_pages);
-
- INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
-
- return &ci->vfs_inode;
-}
-
-void ceph_destroy_inode(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_inode_frag *frag;
- struct rb_node *n;
-
- dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
-
- ceph_queue_caps_release(inode);
-
- kfree(ci->i_symlink);
- while ((n = rb_first(&ci->i_fragtree)) != NULL) {
- frag = rb_entry(n, struct ceph_inode_frag, node);
- rb_erase(n, &ci->i_fragtree);
- kfree(frag);
- }
-
- __ceph_destroy_xattrs(ci);
- ceph_buffer_put(ci->i_xattrs.blob);
- ceph_buffer_put(ci->i_xattrs.prealloc_blob);
-
- kmem_cache_free(ceph_inode_cachep, ci);
-}
-
-
-/*
- * Helpers to fill in size, ctime, mtime, and atime. We have to be
- * careful because either the client or MDS may have more up to date
- * info, depending on which capabilities are held, and whether
- * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
- * and size are monotonically increasing, except when utimes() or
- * truncate() increments the corresponding _seq values.)
- */
-int ceph_fill_file_size(struct inode *inode, int issued,
- u32 truncate_seq, u64 truncate_size, u64 size)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int queue_trunc = 0;
-
- if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
- (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
- dout("size %lld -> %llu\n", inode->i_size, size);
- inode->i_size = size;
- inode->i_blocks = (size + (1<<9) - 1) >> 9;
- ci->i_reported_size = size;
- if (truncate_seq != ci->i_truncate_seq) {
- dout("truncate_seq %u -> %u\n",
- ci->i_truncate_seq, truncate_seq);
- ci->i_truncate_seq = truncate_seq;
- if (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
- CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
- CEPH_CAP_FILE_EXCL)) {
- ci->i_truncate_pending++;
- queue_trunc = 1;
- }
- }
- }
- if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
- ci->i_truncate_size != truncate_size) {
- dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
- truncate_size);
- ci->i_truncate_size = truncate_size;
- }
- return queue_trunc;
-}
-
-/*
- * Fold ctime/mtime/atime values received from the MDS into the inode.
- *
- * Without any of the FILE excl/wr/buffer bits in @issued the MDS values
- * are taken as-is (provided time_warp_seq did not go backwards).  With
- * one of those bits set, ctime only moves forward, and mtime/atime are
- * replaced on a newer time_warp_seq or advanced to the max on an equal
- * one.  An older time_warp_seq is ignored when FILE_EXCL is set and
- * reported via dout() otherwise.
- */
-void ceph_fill_file_time(struct inode *inode, int issued,
-			 u64 time_warp_seq, struct timespec *ctime,
-			 struct timespec *mtime, struct timespec *atime)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int seq_went_back = 0;
-	int may_write = issued & (CEPH_CAP_FILE_EXCL |
-				  CEPH_CAP_FILE_WR |
-				  CEPH_CAP_FILE_BUFFER);
-
-	if (!may_write) {
-		/* we have no write caps; whatever the MDS says is true */
-		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
-			inode->i_ctime = *ctime;
-			inode->i_mtime = *mtime;
-			inode->i_atime = *atime;
-			ci->i_time_warp_seq = time_warp_seq;
-		} else {
-			seq_went_back = 1;
-		}
-	} else {
-		if (timespec_compare(ctime, &inode->i_ctime) > 0) {
-			dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
-			     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
-			     ctime->tv_sec, ctime->tv_nsec);
-			inode->i_ctime = *ctime;
-		}
-
-		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
-			/* the MDS did a utimes() */
-			dout("mtime %ld.%09ld -> %ld.%09ld tw %d -> %d\n",
-			     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
-			     mtime->tv_sec, mtime->tv_nsec,
-			     ci->i_time_warp_seq, (int)time_warp_seq);
-
-			inode->i_mtime = *mtime;
-			inode->i_atime = *atime;
-			ci->i_time_warp_seq = time_warp_seq;
-		} else if (time_warp_seq == ci->i_time_warp_seq) {
-			/* nobody did utimes(); take the max */
-			if (timespec_compare(mtime, &inode->i_mtime) > 0) {
-				dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
-				     inode->i_mtime.tv_sec,
-				     inode->i_mtime.tv_nsec,
-				     mtime->tv_sec, mtime->tv_nsec);
-				inode->i_mtime = *mtime;
-			}
-			if (timespec_compare(atime, &inode->i_atime) > 0) {
-				dout("atime %ld.%09ld -> %ld.%09ld inc\n",
-				     inode->i_atime.tv_sec,
-				     inode->i_atime.tv_nsec,
-				     atime->tv_sec, atime->tv_nsec);
-				inode->i_atime = *atime;
-			}
-		} else if (!(issued & CEPH_CAP_FILE_EXCL)) {
-			/*
-			 * older seq without EXCL is unexpected; with EXCL
-			 * we did a utimes() ourselves and ignore mds values
-			 */
-			seq_went_back = 1;
-		}
-	}
-
-	if (seq_went_back) /* time_warp_seq shouldn't go backwards */
-		dout("%p mds time_warp_seq %llu < %u\n",
-		     inode, time_warp_seq, ci->i_time_warp_seq);
-}
-
-/*
- * Populate an inode based on info from mds.  May be called on new or
- * existing inodes.
- *
- * @inode:            inode to update
- * @iinfo:            parsed reply inode (fixed part plus xattr/symlink data)
- * @dirinfo:          optional dirfrag info, handed to ceph_fill_dirfrag()
- * @session:          mds session, handed to ceph_add_cap()
- * @ttl_from:         not used by this function
- * @cap_fmode:        file mode passed to ceph_add_cap(); for snapped inodes
- *                    it is only applied when >= 0
- * @caps_reservation: handed to ceph_add_cap()
- *
- * Takes and releases inode->i_lock and ci->i_fragtree_mutex internally.
- *
- * Returns 0 on success, or -ENOMEM if the symlink target could not be
- * allocated.
- */
-static int fill_inode(struct inode *inode,
-		      struct ceph_mds_reply_info_in *iinfo,
-		      struct ceph_mds_reply_dirfrag *dirinfo,
-		      struct ceph_mds_session *session,
-		      unsigned long ttl_from, int cap_fmode,
-		      struct ceph_cap_reservation *caps_reservation)
-{
-	struct ceph_mds_reply_inode *info = iinfo->in;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int i;
-	int issued, implemented;
-	struct timespec mtime, atime, ctime;
-	u32 nsplits;
-	struct ceph_buffer *xattr_blob = NULL;
-	int err = 0;
-	int queue_trunc = 0;
-
-	dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
-	     inode, ceph_vinop(inode), le64_to_cpu(info->version),
-	     ci->i_version);
-
-	/*
-	 * prealloc xattr data, if it looks like we'll need it.  only
-	 * if len > 4 (meaning there are actually xattrs; the first 4
-	 * bytes are the xattr count).  done before taking i_lock.
-	 */
-	if (iinfo->xattr_len > 4) {
-		xattr_blob = ceph_buffer_new_alloc(iinfo->xattr_len, GFP_NOFS);
-		if (!xattr_blob)
-			pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
-			       iinfo->xattr_len);
-	}
-
-	spin_lock(&inode->i_lock);
-
-	/*
-	 * provided version will be odd if inode value is projected,
-	 * even if stable.  skip the update if we have a newer info
-	 * (e.g., due to inode info racing form multiple MDSs), or if
-	 * we are getting projected (unstable) inode info.
-	 */
-	if (le64_to_cpu(info->version) > 0 &&
-	    (ci->i_version & ~1) > le64_to_cpu(info->version))
-		goto no_change;
-
-	issued = __ceph_caps_issued(ci, &implemented);
-	issued |= implemented | __ceph_caps_dirty(ci);
-
-	/* update inode */
-	ci->i_version = le64_to_cpu(info->version);
-	inode->i_version++;
-	inode->i_rdev = le32_to_cpu(info->rdev);
-
-	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
-		inode->i_mode = le32_to_cpu(info->mode);
-		inode->i_uid = le32_to_cpu(info->uid);
-		inode->i_gid = le32_to_cpu(info->gid);
-		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
-		     inode->i_uid, inode->i_gid);
-	}
-
-	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
-		inode->i_nlink = le32_to_cpu(info->nlink);
-
-	/* be careful with mtime, atime, size */
-	ceph_decode_timespec(&atime, &info->atime);
-	ceph_decode_timespec(&mtime, &info->mtime);
-	ceph_decode_timespec(&ctime, &info->ctime);
-	queue_trunc = ceph_fill_file_size(inode, issued,
-					  le32_to_cpu(info->truncate_seq),
-					  le64_to_cpu(info->truncate_size),
-					  S_ISDIR(inode->i_mode) ?
-					  ci->i_rbytes :
-					  le64_to_cpu(info->size));
-	ceph_fill_file_time(inode, issued,
-			    le32_to_cpu(info->time_warp_seq),
-			    &ctime, &mtime, &atime);
-
-	ci->i_max_size = le64_to_cpu(info->max_size);
-	ci->i_layout = info->layout;
-	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
-
-	/* xattrs */
-	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
-	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
-	    le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
-		if (ci->i_xattrs.blob)
-			ceph_buffer_put(ci->i_xattrs.blob);
-		ci->i_xattrs.blob = xattr_blob;
-		if (xattr_blob)
-			memcpy(ci->i_xattrs.blob->vec.iov_base,
-			       iinfo->xattr_data, iinfo->xattr_len);
-		ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
-		/*
-		 * the inode now owns our reference; forget the local
-		 * pointer so the cleanup at 'out' does not drop it.
-		 */
-		xattr_blob = NULL;
-	}
-
-	inode->i_mapping->a_ops = &ceph_aops;
-	inode->i_mapping->backing_dev_info =
-		&ceph_client(inode->i_sb)->backing_dev_info;
-
-	switch (inode->i_mode & S_IFMT) {
-	case S_IFIFO:
-	case S_IFBLK:
-	case S_IFCHR:
-	case S_IFSOCK:
-		init_special_inode(inode, inode->i_mode, inode->i_rdev);
-		inode->i_op = &ceph_file_iops;
-		break;
-	case S_IFREG:
-		inode->i_op = &ceph_file_iops;
-		inode->i_fop = &ceph_file_fops;
-		break;
-	case S_IFLNK:
-		inode->i_op = &ceph_symlink_iops;
-		if (!ci->i_symlink) {
-			int symlen = iinfo->symlink_len;
-			char *sym;
-
-			BUG_ON(symlen != inode->i_size);
-			/* drop i_lock around the allocation */
-			spin_unlock(&inode->i_lock);
-
-			err = -ENOMEM;
-			sym = kmalloc(symlen+1, GFP_NOFS);
-			if (!sym)
-				goto out;
-			memcpy(sym, iinfo->symlink, symlen);
-			sym[symlen] = 0;
-
-			spin_lock(&inode->i_lock);
-			if (!ci->i_symlink)
-				ci->i_symlink = sym;
-			else
-				kfree(sym); /* lost a race */
-		}
-		break;
-	case S_IFDIR:
-		inode->i_op = &ceph_dir_iops;
-		inode->i_fop = &ceph_dir_fops;
-
-		ci->i_files = le64_to_cpu(info->files);
-		ci->i_subdirs = le64_to_cpu(info->subdirs);
-		ci->i_rbytes = le64_to_cpu(info->rbytes);
-		ci->i_rfiles = le64_to_cpu(info->rfiles);
-		ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
-		ceph_decode_timespec(&ci->i_rctime, &info->rctime);
-
-		/* set dir completion flag? */
-		if (ci->i_files == 0 && ci->i_subdirs == 0 &&
-		    ceph_snap(inode) == CEPH_NOSNAP &&
-		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
-			dout(" marking %p complete (empty)\n", inode);
-			ci->i_ceph_flags |= CEPH_I_COMPLETE;
-			ci->i_max_offset = 2;
-		}
-
-		/* it may be better to set st_size in getattr instead? */
-		if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
-			inode->i_size = ci->i_rbytes;
-		break;
-	default:
-		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
-		       ceph_vinop(inode), inode->i_mode);
-	}
-
-no_change:
-	spin_unlock(&inode->i_lock);
-
-	/*
-	 * queue truncate if we saw i_size decrease.  the inode reference
-	 * taken here is dropped by the work function.
-	 */
-	if (queue_trunc)
-		if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
-			       &ci->i_vmtruncate_work))
-			igrab(inode);
-
-	/* populate frag tree */
-	/* FIXME: move me up, if/when version reflects fragtree changes */
-	nsplits = le32_to_cpu(info->fragtree.nsplits);
-	mutex_lock(&ci->i_fragtree_mutex);
-	for (i = 0; i < nsplits; i++) {
-		u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
-		struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
-
-		if (IS_ERR(frag))
-			continue;
-		frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
-		dout(" frag %x split by %d\n", frag->frag, frag->split_by);
-	}
-	mutex_unlock(&ci->i_fragtree_mutex);
-
-	/* were we issued a capability? */
-	if (info->cap.caps) {
-		if (ceph_snap(inode) == CEPH_NOSNAP) {
-			ceph_add_cap(inode, session,
-				     le64_to_cpu(info->cap.cap_id),
-				     cap_fmode,
-				     le32_to_cpu(info->cap.caps),
-				     le32_to_cpu(info->cap.wanted),
-				     le32_to_cpu(info->cap.seq),
-				     le32_to_cpu(info->cap.mseq),
-				     le64_to_cpu(info->cap.realm),
-				     info->cap.flags,
-				     caps_reservation);
-		} else {
-			spin_lock(&inode->i_lock);
-			dout(" %p got snap_caps %s\n", inode,
-			     ceph_cap_string(le32_to_cpu(info->cap.caps)));
-			ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
-			if (cap_fmode >= 0)
-				__ceph_get_fmode(ci, cap_fmode);
-			spin_unlock(&inode->i_lock);
-		}
-	}
-
-	/* update delegation info? */
-	if (dirinfo)
-		ceph_fill_dirfrag(inode, dirinfo);
-
-	err = 0;
-
-out:
-	/* only drop the preallocated blob if it was not handed to the inode */
-	if (xattr_blob)
-		ceph_buffer_put(xattr_blob);
-	return err;
-}
-
-/*
- * Record a dentry lease received from the mds.
- *
- * Only dentries using ceph_dentry_ops are tracked.  The dentry's
- * lease_shared_gen is always synced to the parent directory's
- * i_shared_gen; the lease itself is only recorded when the reply carries
- * a non-zero mask, we do not already hold a later-expiring lease for the
- * same cap generation, and the dentry is not leased from another session.
- *
- * caller should hold session s_mutex.
- */
-static void update_dentry_lease(struct dentry *dentry,
-				struct ceph_mds_reply_lease *lease,
-				struct ceph_mds_session *session,
-				unsigned long from_time)
-{
-	struct ceph_dentry_info *di = ceph_dentry(dentry);
-	long unsigned duration = le32_to_cpu(lease->duration_ms);
-	/* expiry, and the halfway point after which we renew, in jiffies */
-	long unsigned expires = from_time + (duration * HZ) / 1000;
-	long unsigned renew_at = from_time + (duration * HZ / 2) / 1000;
-	struct inode *parent_inode;
-
-	/* only track leases on regular dentries */
-	if (dentry->d_op != &ceph_dentry_ops)
-		return;
-
-	spin_lock(&dentry->d_lock);
-	dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
-	     dentry, le16_to_cpu(lease->mask), duration, expires);
-
-	/* make lease_rdcache_gen match directory */
-	parent_inode = dentry->d_parent->d_inode;
-	di->lease_shared_gen = ceph_inode(parent_inode)->i_shared_gen;
-
-	if (lease->mask != 0 &&
-	    /* skip if we already have a newer lease */
-	    !(di->lease_gen == session->s_cap_gen &&
-	      time_before(expires, dentry->d_time)) &&
-	    /* skip if the existing lease belongs to another session */
-	    !(di->lease_session && di->lease_session != session)) {
-		ceph_dentry_lru_touch(dentry);
-
-		if (!di->lease_session)
-			di->lease_session = ceph_get_mds_session(session);
-		di->lease_gen = session->s_cap_gen;
-		di->lease_seq = le32_to_cpu(lease->seq);
-		di->lease_renew_after = renew_at;
-		di->lease_renew_from = 0;
-		dentry->d_time = expires;
-	}
-
-	spin_unlock(&dentry->d_lock);
-}
-
-/*
- * splice a dentry to an inode.
- * caller must hold directory i_mutex for this to be safe.
- *
- * Returns the dentry now attached to @in: either @dn itself, or an
- * existing alias returned by d_materialise_unique() (in which case our
- * reference on @dn is dropped), or an ERR_PTR on failure.
- *
- * we will only rehash the resulting dentry if @prehash is NULL or
- * true; @prehash will be set to false (for the benefit of
- * the caller) if we fail.
- */
-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
-				    bool *prehash)
-{
-	struct dentry *alias;
-
-	/* dn must be unhashed */
-	if (!d_unhashed(dn))
-		d_drop(dn);
-
-	alias = d_materialise_unique(dn, in);
-	if (IS_ERR(alias)) {
-		pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
-		       dn, in, ceph_vinop(in));
-		if (prehash)
-			*prehash = false; /* don't rehash on error */
-		return alias;             /* carries the error */
-	}
-
-	if (alias) {
-		dout("dn %p (%d) spliced with %p (%d) "
-		     "inode %p ino %llx.%llx\n",
-		     dn, atomic_read(&dn->d_count),
-		     alias, atomic_read(&alias->d_count),
-		     alias->d_inode, ceph_vinop(alias->d_inode));
-		dput(dn);
-		dn = alias;
-	} else {
-		BUG_ON(!ceph_dentry(dn));
-
-		dout("dn %p attached to %p ino %llx.%llx\n",
-		     dn, dn->d_inode, ceph_vinop(dn->d_inode));
-	}
-
-	if ((!prehash || *prehash) && d_unhashed(dn))
-		d_rehash(dn);
-	return dn;
-}
-
-/*
- * Incorporate results into the local cache.  This is either just
- * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
- * after a lookup).
- *
- * A reply may contain
- *         a directory inode along with a dentry.
- *  and/or a target inode
- *
- * @sb:      superblock used to look up / create inodes
- * @req:     request whose parsed reply (r_reply_info) is applied; its
- *           r_dentry and r_target_inode may be updated
- * @session: mds session the reply arrived on
- *
- * Returns 0 on success or a negative errno.
- *
- * Called with snap_rwsem (read).
- */
-int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
-		    struct ceph_mds_session *session)
-{
-	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
-	struct inode *in = NULL;
-	struct ceph_mds_reply_inode *ininfo;
-	struct ceph_vino vino;
-	int i = 0;
-	int err = 0;
-
-	dout("fill_trace %p is_dentry %d is_target %d\n", req,
-	     rinfo->head->is_dentry, rinfo->head->is_target);
-
-#if 0
-	/*
-	 * Debugging hook:
-	 *
-	 * If we resend completed ops to a recovering mds, we get no
-	 * trace.  Since that is very rare, pretend this is the case
-	 * to ensure the 'no trace' handlers in the callers behave.
-	 *
-	 * Fill in inodes unconditionally to avoid breaking cap
-	 * invariants.
-	 */
-	if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
-		pr_info("fill_trace faking empty trace on %lld %s\n",
-			req->r_tid, ceph_mds_op_name(rinfo->head->op));
-		if (rinfo->head->is_dentry) {
-			rinfo->head->is_dentry = 0;
-			err = fill_inode(req->r_locked_dir,
-					 &rinfo->diri, rinfo->dirfrag,
-					 session, req->r_request_started, -1);
-		}
-		if (rinfo->head->is_target) {
-			rinfo->head->is_target = 0;
-			ininfo = rinfo->targeti.in;
-			vino.ino = le64_to_cpu(ininfo->ino);
-			vino.snap = le64_to_cpu(ininfo->snapid);
-			in = ceph_get_inode(sb, vino);
-			err = fill_inode(in, &rinfo->targeti, NULL,
-					 session, req->r_request_started,
-					 req->r_fmode);
-			iput(in);
-		}
-	}
-#endif
-
-	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
-		dout("fill_trace reply is empty!\n");
-		if (rinfo->head->result == 0 && req->r_locked_dir) {
-			struct ceph_inode_info *ci =
-				ceph_inode(req->r_locked_dir);
-			dout(" clearing %p complete (empty trace)\n",
-			     req->r_locked_dir);
-			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-			ci->i_release_count++;
-		}
-		return 0;
-	}
-
-	if (rinfo->head->is_dentry) {
-		/*
-		 * lookup link rename   : null -> possibly existing inode
-		 * mknod symlink mkdir  : null -> new inode
-		 * unlink               : linked -> null
-		 */
-		struct inode *dir = req->r_locked_dir;
-		struct dentry *dn = req->r_dentry;
-		bool have_dir_cap, have_lease;
-
-		BUG_ON(!dn);
-		BUG_ON(!dir);
-		BUG_ON(dn->d_parent->d_inode != dir);
-		BUG_ON(ceph_ino(dir) !=
-		       le64_to_cpu(rinfo->diri.in->ino));
-		BUG_ON(ceph_snap(dir) !=
-		       le64_to_cpu(rinfo->diri.in->snapid));
-
-		err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
-				 session, req->r_request_started, -1,
-				 &req->r_caps_reservation);
-		if (err < 0)
-			return err;
-
-		/* do we have a lease on the whole dir? */
-		have_dir_cap =
-			(le32_to_cpu(rinfo->diri.in->cap.caps) &
-			 CEPH_CAP_FILE_SHARED);
-
-		/* do we have a dn lease? */
-		have_lease = have_dir_cap ||
-			(le16_to_cpu(rinfo->dlease->mask) &
-			 CEPH_LOCK_DN);
-
-		if (!have_lease)
-			dout("fill_trace  no dentry lease or dir cap\n");
-
-		/* rename? */
-		if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
-			dout(" src %p '%.*s' dst %p '%.*s'\n",
-			     req->r_old_dentry,
-			     req->r_old_dentry->d_name.len,
-			     req->r_old_dentry->d_name.name,
-			     dn, dn->d_name.len, dn->d_name.name);
-			dout("fill_trace doing d_move %p -> %p\n",
-			     req->r_old_dentry, dn);
-			d_move(req->r_old_dentry, dn);
-			dout(" src %p '%.*s' dst %p '%.*s'\n",
-			     req->r_old_dentry,
-			     req->r_old_dentry->d_name.len,
-			     req->r_old_dentry->d_name.name,
-			     dn, dn->d_name.len, dn->d_name.name);
-			/* take overwritten dentry's readdir offset */
-			ceph_dentry(req->r_old_dentry)->offset =
-				ceph_dentry(dn)->offset;
-			dn = req->r_old_dentry;  /* use old_dentry */
-			in = dn->d_inode;
-		}
-
-		/* null dentry? */
-		if (!rinfo->head->is_target) {
-			dout("fill_trace null dentry\n");
-			if (dn->d_inode) {
-				dout("d_delete %p\n", dn);
-				d_delete(dn);
-			} else {
-				dout("d_instantiate %p NULL\n", dn);
-				d_instantiate(dn, NULL);
-				if (have_lease && d_unhashed(dn))
-					d_rehash(dn);
-				update_dentry_lease(dn, rinfo->dlease,
-						    session,
-						    req->r_request_started);
-			}
-			goto done;
-		}
-
-		/* attach proper inode */
-		ininfo = rinfo->targeti.in;
-		vino.ino = le64_to_cpu(ininfo->ino);
-		vino.snap = le64_to_cpu(ininfo->snapid);
-		/*
-		 * always start from the dentry's current inode: outside the
-		 * rename path 'in' would otherwise still be NULL when it is
-		 * compared against vino and passed to igrab() below.
-		 */
-		in = dn->d_inode;
-		if (!in) {
-			in = ceph_get_inode(sb, vino);
-			if (IS_ERR(in)) {
-				pr_err("fill_trace bad get_inode "
-				       "%llx.%llx\n", vino.ino, vino.snap);
-				err = PTR_ERR(in);
-				d_delete(dn);
-				goto done;
-			}
-			dn = splice_dentry(dn, in, &have_lease);
-			if (IS_ERR(dn)) {
-				err = PTR_ERR(dn);
-				goto done;
-			}
-			req->r_dentry = dn;  /* may have spliced */
-			igrab(in);
-		} else if (ceph_ino(in) == vino.ino &&
-			   ceph_snap(in) == vino.snap) {
-			igrab(in);
-		} else {
-			dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
-			     dn, in, ceph_ino(in), ceph_snap(in),
-			     vino.ino, vino.snap);
-			have_lease = false;
-			in = NULL;
-		}
-
-		if (have_lease)
-			update_dentry_lease(dn, rinfo->dlease, session,
-					    req->r_request_started);
-		dout(" final dn %p\n", dn);
-		i++;
-	} else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
-		   req->r_op == CEPH_MDS_OP_MKSNAP) {
-		struct dentry *dn = req->r_dentry;
-
-		/* fill out a snapdir LOOKUPSNAP dentry */
-		BUG_ON(!dn);
-		BUG_ON(!req->r_locked_dir);
-		BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
-		ininfo = rinfo->targeti.in;
-		vino.ino = le64_to_cpu(ininfo->ino);
-		vino.snap = le64_to_cpu(ininfo->snapid);
-		in = ceph_get_inode(sb, vino);
-		if (IS_ERR(in)) {
-			pr_err("fill_inode get_inode badness %llx.%llx\n",
-			       vino.ino, vino.snap);
-			err = PTR_ERR(in);
-			d_delete(dn);
-			goto done;
-		}
-		dout(" linking snapped dir %p to dn %p\n", in, dn);
-		dn = splice_dentry(dn, in, NULL);
-		if (IS_ERR(dn)) {
-			err = PTR_ERR(dn);
-			goto done;
-		}
-		req->r_dentry = dn;  /* may have spliced */
-		igrab(in);
-		rinfo->head->is_dentry = 1;  /* fool notrace handlers */
-	}
-
-	if (rinfo->head->is_target) {
-		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
-
-		/* reuse the inode found above if it is the reply's target */
-		if (in == NULL || ceph_ino(in) != vino.ino ||
-		    ceph_snap(in) != vino.snap) {
-			in = ceph_get_inode(sb, vino);
-			if (IS_ERR(in)) {
-				err = PTR_ERR(in);
-				goto done;
-			}
-		}
-		req->r_target_inode = in;
-
-		/* only pass the open mode along if the request succeeded */
-		err = fill_inode(in,
-				 &rinfo->targeti, NULL,
-				 session, req->r_request_started,
-				 (le32_to_cpu(rinfo->head->result) == 0) ?
-				 req->r_fmode : -1,
-				 &req->r_caps_reservation);
-		if (err < 0) {
-			pr_err("fill_inode badness %p %llx.%llx\n",
-			       in, ceph_vinop(in));
-			goto done;
-		}
-	}
-
-done:
-	dout("fill_trace done err=%d\n", err);
-	return err;
-}
-
-/*
- * Prepopulate our cache with readdir results, leases, etc.
- *
- * For each entry in the parsed readdir reply, find or create the dentry
- * under the request's directory (or its snapdir for LSSNAP), attach the
- * inode, fill it from the reply and record the dentry lease.
- *
- * A failure to fill or splice one entry is logged/skipped and the loop
- * continues; allocation or inode lookup failures abort the loop.
- *
- * Returns 0 on success or a negative errno.
- */
-int ceph_readdir_prepopulate(struct ceph_mds_request *req,
-			     struct ceph_mds_session *session)
-{
-	struct dentry *parent = req->r_dentry;
-	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
-	struct qstr dname;
-	struct dentry *dn;
-	struct inode *in;
-	int err = 0, i;
-	struct inode *snapdir = NULL;
-	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
-	u64 frag = le32_to_cpu(rhead->args.readdir.frag);
-	struct ceph_dentry_info *di;
-
-	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
-		snapdir = ceph_get_snapdir(parent->d_inode);
-		parent = d_find_alias(snapdir);
-		dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
-		     rinfo->dir_nr, parent);
-	} else {
-		dout("readdir_prepopulate %d items under dn %p\n",
-		     rinfo->dir_nr, parent);
-		if (rinfo->dir_dir)
-			ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
-	}
-
-	for (i = 0; i < rinfo->dir_nr; i++) {
-		struct ceph_vino vino;
-
-		dname.name = rinfo->dir_dname[i];
-		dname.len = rinfo->dir_dname_len[i];
-		dname.hash = full_name_hash(dname.name, dname.len);
-
-		vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
-		vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
-
-retry_lookup:
-		dn = d_lookup(parent, &dname);
-		dout("d_lookup on parent=%p name=%.*s got %p\n",
-		     parent, dname.len, dname.name, dn);
-
-		if (!dn) {
-			dn = d_alloc(parent, &dname);
-			dout("d_alloc %p '%.*s' = %p\n", parent,
-			     dname.len, dname.name, dn);
-			if (dn == NULL) {
-				dout("d_alloc badness\n");
-				err = -ENOMEM;
-				goto out;
-			}
-			err = ceph_init_dentry(dn);
-			if (err < 0) {
-				/* don't leak the dentry we just allocated */
-				dput(dn);
-				goto out;
-			}
-		} else if (dn->d_inode &&
-			   (ceph_ino(dn->d_inode) != vino.ino ||
-			    ceph_snap(dn->d_inode) != vino.snap)) {
-			dout(" dn %p points to wrong inode %p\n",
-			     dn, dn->d_inode);
-			d_delete(dn);
-			dput(dn);
-			goto retry_lookup;
-		} else {
-			/* reorder parent's d_subdirs */
-			spin_lock(&dcache_lock);
-			spin_lock(&dn->d_lock);
-			list_move(&dn->d_u.d_child, &parent->d_subdirs);
-			spin_unlock(&dn->d_lock);
-			spin_unlock(&dcache_lock);
-		}
-
-		di = dn->d_fsdata;
-		di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
-
-		/* inode */
-		if (dn->d_inode) {
-			in = dn->d_inode;
-		} else {
-			in = ceph_get_inode(parent->d_sb, vino);
-			/*
-			 * ceph_get_inode() failures are tested with IS_ERR()
-			 * elsewhere in this file; accept NULL too, as the
-			 * original check did.
-			 */
-			if (!in || IS_ERR(in)) {
-				dout("new_inode badness\n");
-				d_delete(dn);
-				dput(dn);
-				err = in ? PTR_ERR(in) : -ENOMEM;
-				goto out;
-			}
-			dn = splice_dentry(dn, in, NULL);
-			/*
-			 * splice_dentry() returns an ERR_PTR on failure;
-			 * never treat that as a dentry below.
-			 */
-			if (IS_ERR(dn))
-				dn = NULL;
-		}
-
-		if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
-			       req->r_request_started, -1,
-			       &req->r_caps_reservation) < 0) {
-			pr_err("fill_inode badness on %p\n", in);
-			if (dn)
-				dput(dn);
-			continue;
-		}
-		if (dn) {
-			update_dentry_lease(dn, rinfo->dir_dlease[i],
-					    req->r_session,
-					    req->r_request_started);
-			dput(dn);
-		}
-	}
-	req->r_did_prepopulate = true;
-
-out:
-	if (snapdir) {
-		/* drop the refs taken for the LSSNAP parent above */
-		iput(snapdir);
-		dput(parent);
-	}
-	dout("readdir_prepopulate done\n");
-	return err;
-}
-
-/*
- * Set i_size and i_blocks under i_lock.
- *
- * Returns 1 when twice the new size reaches i_max_size while twice the
- * last reported size did not (i.e. the MDS should be told we are
- * approaching max_size), 0 otherwise.
- */
-int ceph_inode_set_size(struct inode *inode, loff_t size)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int near_max;
-
-	spin_lock(&inode->i_lock);
-	dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
-	inode->i_size = size;
-	inode->i_blocks = (size + (1 << 9) - 1) >> 9;
-
-	/* tell the MDS if we are approaching max_size */
-	near_max = (size << 1) >= ci->i_max_size &&
-		   (ci->i_reported_size << 1) < ci->i_max_size;
-
-	spin_unlock(&inode->i_lock);
-	return near_max;
-}
-
-/*
- * Work function: start writeback of the inode's dirty pages, then drop
- * the inode reference with iput().  (Writeback can't be done in the
- * message handler context.)
- */
-void ceph_inode_writeback(struct work_struct *work)
-{
-	struct ceph_inode_info *ci;
-	struct inode *inode;
-
-	ci = container_of(work, struct ceph_inode_info, i_wb_work);
-	inode = &ci->vfs_inode;
-
-	dout("writeback %p\n", inode);
-	filemap_fdatawrite(&inode->i_data);
-	iput(inode);
-}
-
-/*
- * Work function: invalidate inode pages.  (This can't be done in the
- * message handler context.)
- *
- * The invalidation is skipped if the rdcache generation no longer
- * matches the one being revoked.  If the generation is unchanged after
- * the pages are truncated, the rdcache state is reset and caps are
- * re-checked.  The inode reference is dropped on every path.
- */
-static void ceph_inode_invalidate_pages(struct work_struct *work)
-{
-	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
-						  i_pg_inv_work);
-	struct inode *inode = &ci->vfs_inode;
-	u32 gen_before;
-	int completed = 0;
-
-	spin_lock(&inode->i_lock);
-	dout("invalidate_pages %p gen %d revoking %d\n", inode,
-	     ci->i_rdcache_gen, ci->i_rdcache_revoking);
-	if (ci->i_rdcache_gen == 0 ||
-	    ci->i_rdcache_revoking != ci->i_rdcache_gen) {
-		BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
-		/* nevermind! */
-		ci->i_rdcache_revoking = 0;
-		spin_unlock(&inode->i_lock);
-		iput(inode);
-		return;
-	}
-	gen_before = ci->i_rdcache_gen;
-	spin_unlock(&inode->i_lock);
-
-	/* done without i_lock held */
-	truncate_inode_pages(&inode->i_data, 0);
-
-	spin_lock(&inode->i_lock);
-	if (gen_before != ci->i_rdcache_gen) {
-		dout("invalidate_pages %p gen %d raced, gen now %d\n",
-		     inode, gen_before, ci->i_rdcache_gen);
-	} else {
-		dout("invalidate_pages %p gen %d successful\n", inode,
-		     ci->i_rdcache_gen);
-		ci->i_rdcache_gen = 0;
-		ci->i_rdcache_revoking = 0;
-		completed = 1;
-	}
-	spin_unlock(&inode->i_lock);
-
-	if (completed)
-		ceph_check_caps(ci, 0, NULL);
-	iput(inode);
-}
-
-
-/*
- * Work function for the truncate workqueue: apply any pending
- * truncation under i_mutex (taken here), then drop the inode reference.
- */
-void ceph_vmtruncate_work(struct work_struct *work)
-{
-	struct ceph_inode_info *ci;
-	struct inode *inode;
-
-	ci = container_of(work, struct ceph_inode_info, i_vmtruncate_work);
-	inode = &ci->vfs_inode;
-
-	dout("vmtruncate_work %p\n", inode);
-
-	mutex_lock(&inode->i_mutex);
-	__ceph_do_pending_vmtruncate(inode);
-	mutex_unlock(&inode->i_mutex);
-
-	iput(inode);
-}
-
-/*
- * called with i_mutex held.
- *
- * Make sure any pending truncation is applied before doing anything
- * that may depend on it.
- *
- * Returns immediately if i_truncate_pending is zero. Otherwise truncates
- * the page cache to i_truncate_size, decrements i_truncate_pending, and
- * wakes i_cap_wq waiters once the count reaches zero. i_lock is taken
- * and dropped internally; it is never held across the page-cache calls.
- */
-void __ceph_do_pending_vmtruncate(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- u64 to;
- int wrbuffer_refs, wake = 0;
-
-retry:
- spin_lock(&inode->i_lock);
- if (ci->i_truncate_pending == 0) {
- dout("__do_pending_vmtruncate %p none pending\n", inode);
- spin_unlock(&inode->i_lock);
- return;
- }
-
- /*
- * make sure any dirty snapped pages are flushed before we
- * possibly truncate them.. so write AND block!
- */
- if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
- dout("__do_pending_vmtruncate %p flushing snaps first\n",
- inode);
- spin_unlock(&inode->i_lock);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 30)
- filemap_write_and_wait_range(&inode->i_data, 0,
- inode->i_sb->s_maxbytes);
-#else
-# warning i may not flush all data after a snapshot + truncate w/ < 2.6.30
- filemap_write_and_wait(&inode->i_data);
-#endif
- /* state may have changed while unlocked; re-evaluate from the top */
- goto retry;
- }
-
- /* sample the target size and buffer refs while still under i_lock */
- to = ci->i_truncate_size;
- wrbuffer_refs = ci->i_wrbuffer_ref;
- dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
- ci->i_truncate_pending, to);
- spin_unlock(&inode->i_lock);
-
- truncate_inode_pages(inode->i_mapping, to);
-
- spin_lock(&inode->i_lock);
- ci->i_truncate_pending--;
- if (ci->i_truncate_pending == 0)
- wake = 1;
- spin_unlock(&inode->i_lock);
-
- /* uses the ref count sampled before the truncate, not the current one */
- if (wrbuffer_refs == 0)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- if (wake)
- wake_up(&ci->i_cap_wq);
-}
-
-
-/*
- * symlinks
- *
- * follow_link: hand the cached link target (ci->i_symlink) to the
- * nameidata.  Always returns NULL.
- */
-static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct inode *inode = dentry->d_inode;
-
-	nd_set_link(nd, ceph_inode(inode)->i_symlink);
-	return NULL;
-}
-
-/*
- * Inode operations for symlink inodes (installed as i_op for S_IFLNK
- * in fill_inode): generic readlink plus our follow_link.
- */
-static const struct inode_operations ceph_symlink_iops = {
- .readlink = generic_readlink,
- .follow_link = ceph_sym_follow_link,
-};
-
-/*
- * setattr
- *
- * Apply attribute changes to an inode.  For each requested attribute,
- * the change is either made locally and the corresponding cap marked
- * dirty (when the relevant EXCL/WR cap is issued), or encoded into a
- * SETATTR request to the mds (together with the caps to release).
- * A request is only sent if at least one attribute needs it.
- *
- * Returns 0 on success, -EROFS for snapshot inodes, -EINVAL if the
- * requested size exceeds s_maxbytes, or the error from
- * inode_change_ok(), request creation, or the mds request.
- */
-int ceph_setattr(struct dentry *dentry, struct iattr *attr)
-{
-	struct inode *inode = dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct inode *parent_inode = dentry->d_parent->d_inode;
-	const unsigned int ia_valid = attr->ia_valid;
-	struct ceph_mds_request *req;
-	struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
-	int issued;
-	int release = 0, dirtied = 0;
-	int mask = 0;
-	int err = 0;
-
-	if (ceph_snap(inode) != CEPH_NOSNAP)
-		return -EROFS;
-
-	__ceph_do_pending_vmtruncate(inode);
-
-	err = inode_change_ok(inode, attr);
-	if (err != 0)
-		return err;
-
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
-				       USE_AUTH_MDS);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
-	spin_lock(&inode->i_lock);
-	issued = __ceph_caps_issued(ci, NULL);
-	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
-
-	if (ia_valid & ATTR_UID) {
-		dout("setattr %p uid %d -> %d\n", inode,
-		     inode->i_uid, attr->ia_uid);
-		if (issued & CEPH_CAP_AUTH_EXCL) {
-			inode->i_uid = attr->ia_uid;
-			dirtied |= CEPH_CAP_AUTH_EXCL;
-		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
-			   attr->ia_uid != inode->i_uid) {
-			req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
-			mask |= CEPH_SETATTR_UID;
-			release |= CEPH_CAP_AUTH_SHARED;
-		}
-	}
-	if (ia_valid & ATTR_GID) {
-		dout("setattr %p gid %d -> %d\n", inode,
-		     inode->i_gid, attr->ia_gid);
-		if (issued & CEPH_CAP_AUTH_EXCL) {
-			inode->i_gid = attr->ia_gid;
-			dirtied |= CEPH_CAP_AUTH_EXCL;
-		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
-			   attr->ia_gid != inode->i_gid) {
-			req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
-			mask |= CEPH_SETATTR_GID;
-			release |= CEPH_CAP_AUTH_SHARED;
-		}
-	}
-	if (ia_valid & ATTR_MODE) {
-		dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
-		     attr->ia_mode);
-		if (issued & CEPH_CAP_AUTH_EXCL) {
-			inode->i_mode = attr->ia_mode;
-			dirtied |= CEPH_CAP_AUTH_EXCL;
-		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
-			   attr->ia_mode != inode->i_mode) {
-			req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
-			mask |= CEPH_SETATTR_MODE;
-			release |= CEPH_CAP_AUTH_SHARED;
-		}
-	}
-
-	if (ia_valid & ATTR_ATIME) {
-		dout("setattr %p atime %ld.%09ld -> %ld.%09ld\n", inode,
-		     inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
-		     attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
-		if (issued & CEPH_CAP_FILE_EXCL) {
-			ci->i_time_warp_seq++;
-			inode->i_atime = attr->ia_atime;
-			dirtied |= CEPH_CAP_FILE_EXCL;
-		} else if ((issued & CEPH_CAP_FILE_WR) &&
-			   timespec_compare(&inode->i_atime,
-					    &attr->ia_atime) < 0) {
-			/* with only WR we may move the time forward locally */
-			inode->i_atime = attr->ia_atime;
-			dirtied |= CEPH_CAP_FILE_WR;
-		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
-			   !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
-			ceph_encode_timespec(&req->r_args.setattr.atime,
-					     &attr->ia_atime);
-			mask |= CEPH_SETATTR_ATIME;
-			release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
-				CEPH_CAP_FILE_WR;
-		}
-	}
-	if (ia_valid & ATTR_MTIME) {
-		dout("setattr %p mtime %ld.%09ld -> %ld.%09ld\n", inode,
-		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
-		     attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
-		if (issued & CEPH_CAP_FILE_EXCL) {
-			ci->i_time_warp_seq++;
-			inode->i_mtime = attr->ia_mtime;
-			dirtied |= CEPH_CAP_FILE_EXCL;
-		} else if ((issued & CEPH_CAP_FILE_WR) &&
-			   timespec_compare(&inode->i_mtime,
-					    &attr->ia_mtime) < 0) {
-			/* with only WR we may move the time forward locally */
-			inode->i_mtime = attr->ia_mtime;
-			dirtied |= CEPH_CAP_FILE_WR;
-		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
-			   !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
-			ceph_encode_timespec(&req->r_args.setattr.mtime,
-					     &attr->ia_mtime);
-			mask |= CEPH_SETATTR_MTIME;
-			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
-				CEPH_CAP_FILE_WR;
-		}
-	}
-	if (ia_valid & ATTR_SIZE) {
-		dout("setattr %p size %lld -> %lld\n", inode,
-		     inode->i_size, attr->ia_size);
-		if (attr->ia_size > inode->i_sb->s_maxbytes) {
-			err = -EINVAL;
-			goto out;
-		}
-		if ((issued & CEPH_CAP_FILE_EXCL) &&
-		    attr->ia_size > inode->i_size) {
-			/*
-			 * only growth is handled locally; any other size
-			 * change falls through to the mds request below.
-			 */
-			inode->i_size = attr->ia_size;
-			inode->i_blocks =
-				(attr->ia_size + (1 << 9) - 1) >> 9;
-			inode->i_ctime = attr->ia_ctime;
-			ci->i_reported_size = attr->ia_size;
-			dirtied |= CEPH_CAP_FILE_EXCL;
-		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
-			   attr->ia_size != inode->i_size) {
-			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
-			req->r_args.setattr.old_size =
-				cpu_to_le64(inode->i_size);
-			mask |= CEPH_SETATTR_SIZE;
-			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
-				CEPH_CAP_FILE_WR;
-		}
-	}
-
-	/* ctime is applied locally; it only drives a cap/request if alone */
-	if (ia_valid & ATTR_CTIME) {
-		bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
-					 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
-		dout("setattr %p ctime %ld.%09ld -> %ld.%09ld (%s)\n", inode,
-		     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
-		     attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
-		     only ? "ctime only" : "ignored");
-		inode->i_ctime = attr->ia_ctime;
-		if (only) {
-			/*
-			 * if kernel wants to dirty ctime but nothing else,
-			 * we need to choose a cap to dirty under, or do
-			 * a almost-no-op setattr
-			 */
-			if (issued & CEPH_CAP_AUTH_EXCL)
-				dirtied |= CEPH_CAP_AUTH_EXCL;
-			else if (issued & CEPH_CAP_FILE_EXCL)
-				dirtied |= CEPH_CAP_FILE_EXCL;
-			else if (issued & CEPH_CAP_XATTR_EXCL)
-				dirtied |= CEPH_CAP_XATTR_EXCL;
-			else
-				mask |= CEPH_SETATTR_CTIME;
-		}
-	}
-	if (ia_valid & ATTR_FILE)
-		dout("setattr %p ATTR_FILE ... hrm!\n", inode);
-
-	if (dirtied) {
-		__ceph_mark_dirty_caps(ci, dirtied);
-		inode->i_ctime = CURRENT_TIME;
-	}
-
-	/* we can only release caps we actually hold */
-	release &= issued;
-	spin_unlock(&inode->i_lock);
-
-	if (mask) {
-		req->r_inode = igrab(inode);
-		req->r_inode_drop = release;
-		req->r_args.setattr.mask = cpu_to_le32(mask);
-		req->r_num_caps = 1;
-		err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-	}
-	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
-	     ceph_cap_string(dirtied), mask);
-
-	ceph_mdsc_put_request(req);
-	/* the mds reply may have queued a truncation; apply it now */
-	__ceph_do_pending_vmtruncate(inode);
-	return err;
-out:
-	spin_unlock(&inode->i_lock);
-	ceph_mdsc_put_request(req);
-	return err;
-}
-
-/*
- * Make sure the caps in @mask are issued for @inode; if they are not,
- * send a GETATTR for that mask to an mds and wait for the result.
- *
- * The snapdir inode is never looked up.  Returns 0 on success or a
- * negative errno from request creation or the mds request.
- */
-int ceph_do_getattr(struct inode *inode, int mask)
-{
-	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	struct ceph_mds_request *req;
-	int result;
-
-	if (ceph_snap(inode) == CEPH_SNAPDIR) {
-		dout("do_getattr inode %p SNAPDIR\n", inode);
-		return 0;
-	}
-
-	dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
-	if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
-		return 0;	/* already covered by issued caps */
-
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
-	req->r_inode = igrab(inode);
-	req->r_num_caps = 1;
-	req->r_args.getattr.mask = cpu_to_le32(mask);
-
-	result = ceph_mdsc_do_request(mdsc, NULL, req);
-	ceph_mdsc_put_request(req);
-	dout("do_getattr result=%d\n", result);
-	return result;
-}
-
-
-/*
- * Check inode permissions.  First make sure the AUTH_SHARED cap is
- * covered (fetching from the mds if needed), then defer to
- * generic_permission().  Returns the getattr error if that step fails.
- */
-int ceph_permission(struct inode *inode, int mask)
-{
-	int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
-
-	if (err)
-		return err;
-	return generic_permission(inode, mask, NULL);
-}
-
-/*
- * Get all attributes.  Hopefully someday we'll have a statlite()
- * and can limit the fields we require to be accurate.
- *
- * On success the kstat is filled generically, then ino is set from
- * i_ino, dev is set to the snap id for snapped inodes (0 otherwise),
- * and directories report a 64KB blksize.
- */
-int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		 struct kstat *stat)
-{
-	struct inode *inode = dentry->d_inode;
-	int err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
-
-	if (err)
-		return err;
-
-	generic_fillattr(inode, stat);
-	stat->ino = inode->i_ino;
-
-	if (ceph_snap(inode) == CEPH_NOSNAP)
-		stat->dev = 0;
-	else
-		stat->dev = ceph_snap(inode);
-
-	if (S_ISDIR(inode->i_mode))
-		stat->blksize = 65536;
-
-	return err;
-}
+++ /dev/null
-diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
-index dbea4f9..29b9abc 100644
---- a/Documentation/ioctl/ioctl-number.txt
-+++ b/Documentation/ioctl/ioctl-number.txt
-@@ -180,6 +180,7 @@ Code Seq# Include File Comments
- <http://www.proximity.com.au/~brian/winradio/>
- 0x90 00 drivers/cdrom/sbpcd.h
- 0x93 60-7F linux/auto_fs.h
-+0x97 00-7F fs/ceph/ioctl.h Ceph file system
- 0x99 00-0F 537-Addinboard driver
- <mailto:buk@buks.ipn.de>
- 0xA0 all linux/sdp/sdp.h Industrial Device Project
+++ /dev/null
-#include <linux/in.h>
-
-#include "ioctl.h"
-#include "super.h"
-#include "ceph_debug.h"
-
-
-/*
- * ioctls
- */
-
-/*
- * CEPH_IOC_GET_LAYOUT: report the file layout to userspace.
- *
- * The layout caps are refreshed with ceph_do_getattr() before the
- * inode's i_layout is read.  Returns 0 on success, the getattr error,
- * or -EFAULT if @arg cannot be written.
- */
-static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_ioctl_layout layout;
-	int ret;
-
-	ret = ceph_do_getattr(inode, CEPH_STAT_CAP_LAYOUT);
-	if (ret)
-		return ret;
-
-	layout.stripe_unit = ceph_file_layout_su(ci->i_layout);
-	layout.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
-	layout.object_size = ceph_file_layout_object_size(ci->i_layout);
-	layout.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
-
-	if (copy_to_user(arg, &layout, sizeof(layout)))
-		return -EFAULT;
-	return 0;
-}
-
-/*
- * CEPH_IOC_SET_LAYOUT: set the file layout from a user-supplied
- * struct ceph_ioctl_layout.
- *
- * The values are validated locally and then sent to the auth MDS in a
- * SETLAYOUT request.  Returns 0 on success, -EFAULT if @arg cannot be
- * read, -EINVAL for an unacceptable layout or an unknown data pool, or
- * the error from creating/performing the MDS request.
- */
-static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
-	struct ceph_mds_request *req;
-	struct ceph_ioctl_layout l;
-	int err, i;
-
-	/* copy and validate */
-	if (copy_from_user(&l, arg, sizeof(l)))
-		return -EFAULT;
-
-	/*
-	 * The ioctl struct uses 64-bit fields, but the layout is sent as
-	 * 32-bit values and the divisibility test below is done in 32
-	 * bits.  Refuse anything wider up front: otherwise a stripe_unit
-	 * such as 1ULL << 32 passes the non-zero and alignment tests,
-	 * truncates to 0, and turns the modulo into a division by zero.
-	 */
-	if (l.stripe_unit != (u32)l.stripe_unit ||
-	    l.stripe_count != (u32)l.stripe_count ||
-	    l.object_size != (u32)l.object_size)
-		return -EINVAL;
-
-	if ((l.object_size & ~PAGE_MASK) ||
-	    (l.stripe_unit & ~PAGE_MASK) ||
-	    !l.stripe_unit ||
-	    (l.object_size &&
-	     (unsigned)l.object_size % (unsigned)l.stripe_unit))
-		return -EINVAL;
-
-	/* make sure it's a valid data pool */
-	if (l.data_pool > 0) {
-		mutex_lock(&mdsc->mutex);
-		err = -EINVAL;
-		for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
-			if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
-				err = 0;
-				break;
-			}
-		mutex_unlock(&mdsc->mutex);
-		if (err)
-			return err;
-	}
-
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
-				       USE_AUTH_MDS);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-	req->r_inode = igrab(inode);
-	req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
-
-	req->r_args.setlayout.layout.fl_stripe_unit =
-		cpu_to_le32(l.stripe_unit);
-	req->r_args.setlayout.layout.fl_stripe_count =
-		cpu_to_le32(l.stripe_count);
-	req->r_args.setlayout.layout.fl_object_size =
-		cpu_to_le32(l.object_size);
-	req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
-	req->r_args.setlayout.layout.fl_pg_preferred = cpu_to_le32((s32)-1);
-
-	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-	ceph_mdsc_put_request(req);
-	return err;
-}
-
-/*
- * Return object name, size/offset information, and location (OSD
- * number, network address) for a given file offset.
- *
- * The mapping is computed under osdc->map_sem held for read.  On
- * return file_offset has been rounded down to the start of the object.
- * osd_addr is zeroed whenever no address is available for the chosen
- * OSD, so the caller never gets its own input bytes echoed back as an
- * address.  Returns 0, or -EFAULT if @arg cannot be read or written.
- */
-static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
-{
-	struct ceph_ioctl_dataloc dl;
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
-	struct ceph_entity_addr *a = NULL;
-	u64 len = 1, olen;
-	u64 tmp;
-	struct ceph_object_layout ol;
-	union ceph_pg pgid;
-
-	/* copy and validate */
-	if (copy_from_user(&dl, arg, sizeof(dl)))
-		return -EFAULT;
-
-	down_read(&osdc->map_sem);
-	ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
-				      &dl.object_no, &dl.object_offset, &olen);
-	dl.file_offset -= dl.object_offset;
-	dl.object_size = ceph_file_layout_object_size(ci->i_layout);
-	dl.block_size = ceph_file_layout_su(ci->i_layout);
-
-	/* block_offset = object_offset % block_size */
-	tmp = dl.object_offset;
-	dl.block_offset = do_div(tmp, dl.block_size);
-
-	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
-		 ceph_ino(inode), dl.object_no);
-	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
-				osdc->osdmap);
-
-	pgid.pg64 = le64_to_cpu(ol.ol_pgid);
-	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
-	if (dl.osd >= 0)
-		a = ceph_osd_addr(osdc->osdmap, dl.osd);
-	if (a)
-		memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
-	else
-		memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
-	up_read(&osdc->map_sem);
-
-	/* send result back to user */
-	if (copy_to_user(arg, &dl, sizeof(dl)))
-		return -EFAULT;
-
-	return 0;
-}
-
-/*
- * ioctl entry point: hand the command to its handler, or return
- * -ENOTTY for a command that is not one of ours.
- */
-long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	void __user *uarg = (void __user *)arg;
-
-	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
-
-	if (cmd == CEPH_IOC_GET_LAYOUT)
-		return ceph_ioctl_get_layout(file, uarg);
-	if (cmd == CEPH_IOC_SET_LAYOUT)
-		return ceph_ioctl_set_layout(file, uarg);
-	if (cmd == CEPH_IOC_GET_DATALOC)
-		return ceph_ioctl_get_dataloc(file, uarg);
-
-	return -ENOTTY;
-}
+++ /dev/null
-#ifndef FS_CEPH_IOCTL_H
-#define FS_CEPH_IOCTL_H
-
-#include <linux/ioctl.h>
-#include <linux/types.h>
-
-/* ioctl type byte shared by every Ceph ioctl number defined below */
-#define CEPH_IOCTL_MAGIC 0x97
-
-/*
- * File layout as exchanged with userspace by CEPH_IOC_GET_LAYOUT and
- * CEPH_IOC_SET_LAYOUT.
- */
-/* just use u64 to align sanely on all archs */
-struct ceph_ioctl_layout {
- __u64 stripe_unit, stripe_count, object_size;
- __u64 data_pool;
-};
-
-/* read the layout into a struct ceph_ioctl_layout */
-#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
- struct ceph_ioctl_layout)
-/* set the layout from a struct ceph_ioctl_layout */
-#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
- struct ceph_ioctl_layout)
-
-/*
- * Extract identity, address of the OSD and object storing a given
- * file offset.
- */
-struct ceph_ioctl_dataloc {
- __u64 file_offset; /* in+out: file offset */
- __u64 object_offset; /* out: offset in object */
- __u64 object_no; /* out: object # */
- __u64 object_size; /* out: object size */
- char object_name[64]; /* out: object name */
- __u64 block_offset; /* out: offset in block */
- __u64 block_size; /* out: block length */
- __s64 osd; /* out: osd # */
- struct sockaddr_storage osd_addr; /* out: osd address */
-};
-
-/* in: file_offset; out: everything else (see struct ceph_ioctl_dataloc) */
-#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
- struct ceph_ioctl_dataloc)
-
-#endif
+++ /dev/null
-diff --git a/fs/Kconfig b/fs/Kconfig
-index 93945dd..928b98b 100644
---- a/fs/Kconfig
-+++ b/fs/Kconfig
-@@ -266,6 +266,7 @@ config NFS_COMMON
-
- source "net/sunrpc/Kconfig"
- source "fs/smbfs/Kconfig"
-+source "fs/ceph/Kconfig"
- source "fs/cifs/Kconfig"
- source "fs/ncpfs/Kconfig"
- source "fs/coda/Kconfig"
-diff --git a/fs/Makefile b/fs/Makefile
-index 38bc735..e11fa80 100644
---- a/fs/Makefile
-+++ b/fs/Makefile
-@@ -122,4 +122,5 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/
- obj-$(CONFIG_OCFS2_FS) += ocfs2/
- obj-$(CONFIG_BTRFS_FS) += btrfs/
- obj-$(CONFIG_GFS2_FS) += gfs2/
- obj-$(CONFIG_EXOFS_FS) += exofs/
-+obj-$(CONFIG_CEPH_FS) += ceph/
+++ /dev/null
-diff --git a/fs/staging/Kconfig b/fs/staging/Kconfig
-index 605d8ae..bed45e4 100644
---- a/fs/staging/Kconfig
-+++ b/fs/staging/Kconfig
-@@ -42,5 +42,7 @@ config FSSTAGING_EXCLUDE_BUILD
-
- if !FSSTAGING_EXCLUDE_BUILD
-
-+source "fs/staging/ceph/Kconfig"
-+
- endif # !FSSTAGING_EXCLUDE_BUILD
- endif # FSSTAGING
-diff --git a/fs/staging/Makefile b/fs/staging/Makefile
-index 0e2c0d6..5dabb66 100644
---- a/fs/staging/Makefile
-+++ b/fs/staging/Makefile
-@@ -3,3 +3,4 @@
- # fix for build system bug...
- obj-$(CONFIG_FSSTAGING) += fsstaging.o
-
-+obj-$(CONFIG_CEPH_FS) += ceph/
-\ No newline at end of file
+++ /dev/null
-From c629c17b36c8e85396d4764f4bdee447c68c3d71 Mon Sep 17 00:00:00 2001
-From: Sage Weil <sage@newdream.net>
-Date: Thu, 1 Oct 2009 13:44:41 -0700
-Subject: [PATCH] asdf
-
----
- MAINTAINERS | 9 +++++++++
- 1 files changed, 9 insertions(+), 0 deletions(-)
-
-diff --git a/MAINTAINERS b/MAINTAINERS
-index c450f3a..9b680ff 100644
---- a/MAINTAINERS
-+++ b/MAINTAINERS
-@@ -1294,6 +1294,15 @@ F: arch/powerpc/include/asm/spu*.h
- F: arch/powerpc/oprofile/*cell*
- F: arch/powerpc/platforms/cell/
-
-+CEPH DISTRIBUTED FILE SYSTEM CLIENT
-+M: Sage Weil <sage@newdream.net>
-+L: ceph-devel@lists.sourceforge.net
-+W: http://ceph.newdream.net/
-+T: git git://ceph.newdream.net/linux-ceph-client.git
-+S: Supported
-+F: Documentation/filesystems/ceph.txt
-+F: fs/ceph
-+
- CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
- M: David Vrabel <david.vrabel@csr.com>
- L: linux-usb@vger.kernel.org
---
-1.5.6.5
-
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/wait.h>
-#include <linux/sched.h>
-
-#include "mds_client.h"
-#include "mon_client.h"
-#include "super.h"
-#include "messenger.h"
-#include "decode.h"
-
-/*
- * A cluster of MDS (metadata server) daemons is responsible for
- * managing the file system namespace (the directory hierarchy and
- * inodes) and for coordinating shared access to storage. Metadata is
- * partitioned hierarchically across a number of servers, and that
- * partition varies over time as the cluster adjusts the distribution
- * in order to balance load.
- *
- * The MDS client is primarily responsible for managing synchronous
- * metadata requests for operations like open, unlink, and so forth.
- * If there is an MDS failure, we find out about it when we (possibly
- * request and) receive a new MDS map, and can resubmit affected
- * requests.
- *
- * For the most part, though, we take advantage of a lossless
- * communications channel to the MDS, and do not need to worry about
- * timing out or resubmitting requests.
- *
- * We maintain a stateful "session" with each MDS we interact with.
- * Within each session, we send periodic heartbeat messages to ensure
- * any capabilities or leases we have been issued remain valid. If
- * the session times out and goes stale, our leases and capabilities
- * are no longer valid.
- */
-
-static void __wake_requests(struct ceph_mds_client *mdsc,
- struct list_head *head);
-
-const static struct ceph_connection_operations mds_con_ops;
-
-
-/*
- * mds reply parsing
- */
-
-/*
- * parse individual inode info
- *
- * Records pointers into the message buffer for the inode struct, the
- * symlink target and the xattr blob, advancing *p past each.  Nothing
- * is copied.  Returns 0, or -EIO if the data would run past @end.
- */
-static int parse_reply_info_in(void **p, void *end,
-			       struct ceph_mds_reply_info_in *info)
-{
-	int err = -EIO;
-	u32 nsplits;
-
-	/* the fixed part must be in bounds before nsplits is read from it */
-	ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
-	info->in = *p;
-	*p += sizeof(struct ceph_mds_reply_inode);
-
-	/*
-	 * nsplits comes off the wire: compare it against the space that is
-	 * left rather than multiplying first, so a huge count cannot wrap.
-	 */
-	nsplits = le32_to_cpu(info->in->fragtree.nsplits);
-	if (nsplits > (end - *p) / sizeof(*info->in->fragtree.splits))
-		goto bad;
-	*p += sizeof(*info->in->fragtree.splits) * nsplits;
-
-	ceph_decode_32_safe(p, end, info->symlink_len, bad);
-	ceph_decode_need(p, end, info->symlink_len, bad);
-	info->symlink = *p;
-	*p += info->symlink_len;
-
-	ceph_decode_32_safe(p, end, info->xattr_len, bad);
-	ceph_decode_need(p, end, info->xattr_len, bad);
-	info->xattr_data = *p;
-	*p += info->xattr_len;
-	return 0;
-bad:
-	return err;
-}
-
-/*
- * parse a normal reply, which may contain a (dir+)dentry and/or a
- * target inode.
- *
- * The head's is_dentry / is_target flags say which parts are present.
- * The section must be consumed exactly: anything short of or beyond
- * @end is reported as -EIO.
- */
-static int parse_reply_info_trace(void **p, void *end,
-				  struct ceph_mds_reply_info_parsed *info)
-{
-	int err;
-
-	if (info->head->is_dentry) {
-		err = parse_reply_info_in(p, end, &info->diri);
-		if (err < 0)
-			goto out_bad;
-
-		if (unlikely(*p + sizeof(*info->dirfrag) > end))
-			goto bad;
-		info->dirfrag = *p;
-		*p += sizeof(*info->dirfrag) +
-			sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
-		if (unlikely(*p > end))
-			goto bad;
-
-		ceph_decode_32_safe(p, end, info->dname_len, bad);
-		ceph_decode_need(p, end, info->dname_len, bad);
-		info->dname = *p;
-		*p += info->dname_len;
-
-		/*
-		 * The lease must fit as well; otherwise the target inode
-		 * parse below would start beyond the end of the section.
-		 */
-		ceph_decode_need(p, end, sizeof(*info->dlease), bad);
-		info->dlease = *p;
-		*p += sizeof(*info->dlease);
-	}
-
-	if (info->head->is_target) {
-		err = parse_reply_info_in(p, end, &info->targeti);
-		if (err < 0)
-			goto out_bad;
-	}
-
-	if (unlikely(*p != end))
-		goto bad;
-	return 0;
-
-bad:
-	err = -EIO;
-out_bad:
-	pr_err("problem parsing mds trace %d\n", err);
-	return err;
-}
-
-/*
- * parse readdir results
- *
- * Layout: dirfrag header, entry count, dir_end and dir_complete bytes,
- * then per entry a name, a lease and an inode.  One allocation backs
- * the four parallel arrays (dir_in, dir_dname, dir_dname_len,
- * dir_dlease); it is released by destroy_reply_info() via dir_in.
- * Returns 0, -ENOMEM, or -EIO for a malformed section.
- */
-static int parse_reply_info_dir(void **p, void *end,
-				struct ceph_mds_reply_info_parsed *info)
-{
-	u32 num, i = 0;
-	int err;
-
-	info->dir_dir = *p;
-	if (*p + sizeof(*info->dir_dir) > end)
-		goto bad;
-	*p += sizeof(*info->dir_dir) +
-		sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
-	if (*p > end)
-		goto bad;
-
-	ceph_decode_need(p, end, sizeof(num) + 2, bad);
-	ceph_decode_32(p, num);
-	ceph_decode_8(p, info->dir_end);
-	ceph_decode_8(p, info->dir_complete);
-	if (num == 0)
-		goto done;
-
-	/* alloc large array */
-	info->dir_nr = num;
-	info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
-			       sizeof(*info->dir_dname) +
-			       sizeof(*info->dir_dname_len) +
-			       sizeof(*info->dir_dlease),
-			       GFP_NOFS);
-	if (info->dir_in == NULL) {
-		err = -ENOMEM;
-		goto out_bad;
-	}
-	/* carve the remaining three arrays out of the same allocation */
-	info->dir_dname = (void *)(info->dir_in + num);
-	info->dir_dname_len = (void *)(info->dir_dname + num);
-	info->dir_dlease = (void *)(info->dir_dname_len + num);
-
-	while (num) {
-		/* dentry */
-		ceph_decode_need(p, end, sizeof(u32)*2, bad);
-		ceph_decode_32(p, info->dir_dname_len[i]);
-		ceph_decode_need(p, end, info->dir_dname_len[i], bad);
-		info->dir_dname[i] = *p;
-		*p += info->dir_dname_len[i];
-		dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
-		     info->dir_dname[i]);
-
-		/*
-		 * The lease must be in bounds before it is skipped, or the
-		 * inode parse that follows would begin past the end.
-		 */
-		ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_lease),
-				 bad);
-		info->dir_dlease[i] = *p;
-		*p += sizeof(struct ceph_mds_reply_lease);
-
-		/* inode */
-		err = parse_reply_info_in(p, end, &info->dir_in[i]);
-		if (err < 0)
-			goto out_bad;
-		i++;
-		num--;
-	}
-
-done:
-	if (*p != end)
-		goto bad;
-	return 0;
-
-bad:
-	err = -EIO;
-out_bad:
-	pr_err("problem parsing dir contents %d\n", err);
-	return err;
-}
-
-/*
- * parse entire mds reply
- *
- * After the fixed head the front of the message holds three
- * length-prefixed sections: trace, dir content and snap blob.  Every
- * length comes off the wire, so each is checked against the real end
- * of the message before it is used to bound a sub-parser.  Returns 0,
- * a sub-parser error, or -EIO for a malformed message.
- */
-static int parse_reply_info(struct ceph_msg *msg,
-			    struct ceph_mds_reply_info_parsed *info)
-{
-	void *p, *end;
-	u32 len;
-	int err;
-
-	info->head = msg->front.iov_base;
-	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
-	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
-
-	/* trace */
-	ceph_decode_32_safe(&p, end, len, bad);
-	if (len > 0) {
-		/* p+len becomes the sub-parser's end; keep it inside msg */
-		ceph_decode_need(&p, end, len, bad);
-		err = parse_reply_info_trace(&p, p+len, info);
-		if (err < 0)
-			goto out_bad;
-	}
-
-	/* dir content */
-	ceph_decode_32_safe(&p, end, len, bad);
-	if (len > 0) {
-		ceph_decode_need(&p, end, len, bad);
-		err = parse_reply_info_dir(&p, p+len, info);
-		if (err < 0)
-			goto out_bad;
-	}
-
-	/* snap blob */
-	ceph_decode_32_safe(&p, end, len, bad);
-	ceph_decode_need(&p, end, len, bad);
-	info->snapblob_len = len;
-	info->snapblob = p;
-	p += len;
-
-	if (p != end)
-		goto bad;
-	return 0;
-
-bad:
-	err = -EIO;
-out_bad:
-	pr_err("mds parse_reply err %d\n", err);
-	return err;
-}
-
-/*
- * Release what parsing allocated for a reply: the dir_in array.
- * kfree(NULL) is fine, so this is safe for replies without dir content.
- */
-static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
-{
-	void *dir_array = info->dir_in;
-
-	kfree(dir_array);
-}
-
-
-/*
- * sessions
- */
-/* printable name for a CEPH_MDS_SESSION_* state; "???" if unrecognized */
-static const char *session_state_name(int s)
-{
-	const char *name = "???";
-
-	switch (s) {
-	case CEPH_MDS_SESSION_NEW:
-		name = "new";
-		break;
-	case CEPH_MDS_SESSION_OPENING:
-		name = "opening";
-		break;
-	case CEPH_MDS_SESSION_OPEN:
-		name = "open";
-		break;
-	case CEPH_MDS_SESSION_HUNG:
-		name = "hung";
-		break;
-	case CEPH_MDS_SESSION_CLOSING:
-		name = "closing";
-		break;
-	case CEPH_MDS_SESSION_RECONNECTING:
-		name = "reconnecting";
-		break;
-	}
-	return name;
-}
-
-/*
- * Take a reference on @s unless its count has already dropped to zero.
- * Returns @s on success, NULL if the reference could not be taken.
- */
-static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
-{
-	if (!atomic_inc_not_zero(&s->s_ref)) {
-		dout("mdsc get_session %p 0 -- FAIL", s);
-		return NULL;
-	}
-
-	dout("mdsc get_session %p %d -> %d\n", s,
-	     atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
-	return s;
-}
-
-/*
- * Drop a session reference.  The final put shuts the connection down
- * and frees the session.
- */
-void ceph_put_mds_session(struct ceph_mds_session *s)
-{
-	dout("mdsc put_session %p %d -> %d\n", s,
-	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
-
-	if (!atomic_dec_and_test(&s->s_ref))
-		return;
-
-	ceph_con_shutdown(&s->s_con);
-	kfree(s);
-}
-
-/*
- * Look up the session registered for @mds and take a reference on it.
- * Returns NULL when @mds is beyond the table or has no session.
- *
- * called under mdsc->mutex
- */
-struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
-						   int mds)
-{
-	struct ceph_mds_session *found;
-
-	if (mds >= mdsc->max_sessions)
-		return NULL;
-
-	found = mdsc->sessions[mds];
-	if (found == NULL)
-		return NULL;
-
-	dout("lookup_mds_session %p %d\n", found,
-	     atomic_read(&found->s_ref));
-	get_session(found);
-	return found;
-}
-
-/* true if a session is registered for @mds; no reference is taken */
-static bool __have_session(struct ceph_mds_client *mdsc, int mds)
-{
-	return mds < mdsc->max_sessions && mdsc->sessions[mds] != NULL;
-}
-
-/*
- * create+register a new session for given mds.
- * called under mdsc->mutex.
- *
- * Both allocations (the session itself and, if @mds does not fit, a
- * larger sessions[] table) are done before the connection is opened,
- * so a failure has nothing to unwind but the session memory.
- *
- * Returns the session holding two references (one owned by
- * sessions[], one for the caller), or ERR_PTR(-ENOMEM).
- */
-static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
-						 int mds)
-{
-	struct ceph_mds_session *s;
-
-	s = kzalloc(sizeof(*s), GFP_NOFS);
-	if (!s)
-		return ERR_PTR(-ENOMEM);
-
-	dout("register_session mds%d\n", mds);
-	if (mds >= mdsc->max_sessions) {
-		int newmax = 1 << get_count_order(mds+1);
-		struct ceph_mds_session **sa;
-
-		dout("register_session realloc to %d\n", newmax);
-		sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
-		if (sa == NULL) {
-			kfree(s);	/* not published anywhere yet */
-			return ERR_PTR(-ENOMEM);
-		}
-		if (mdsc->sessions) {
-			memcpy(sa, mdsc->sessions,
-			       mdsc->max_sessions * sizeof(void *));
-			kfree(mdsc->sessions);
-		}
-		mdsc->sessions = sa;
-		mdsc->max_sessions = newmax;
-	}
-
-	s->s_mdsc = mdsc;
-	s->s_mds = mds;
-	s->s_state = CEPH_MDS_SESSION_NEW;
-	s->s_ttl = 0;
-	s->s_seq = 0;
-	mutex_init(&s->s_mutex);
-
-	ceph_con_init(mdsc->client->msgr, &s->s_con);
-	s->s_con.private = s;
-	s->s_con.ops = &mds_con_ops;
-	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
-	s->s_con.peer_name.num = cpu_to_le64(mds);
-	ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
-
-	spin_lock_init(&s->s_cap_lock);
-	s->s_cap_gen = 0;
-	s->s_cap_ttl = 0;
-	s->s_renew_requested = 0;
-	s->s_renew_seq = 0;
-	INIT_LIST_HEAD(&s->s_caps);
-	s->s_nr_caps = 0;
-	atomic_set(&s->s_ref, 1);
-	INIT_LIST_HEAD(&s->s_waiting);
-	INIT_LIST_HEAD(&s->s_unsafe);
-	s->s_num_cap_releases = 0;
-	INIT_LIST_HEAD(&s->s_cap_releases);
-	INIT_LIST_HEAD(&s->s_cap_releases_done);
-	INIT_LIST_HEAD(&s->s_cap_flushing);
-	INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
-
-	mdsc->sessions[mds] = s;
-	atomic_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
-	return s;
-}
-
-/*
- * Drop the reference held by sessions[] for @mds and clear the slot.
- *
- * called under mdsc->mutex
- */
-static void unregister_session(struct ceph_mds_client *mdsc, int mds)
-{
-	struct ceph_mds_session **slot = &mdsc->sessions[mds];
-
-	dout("unregister_session mds%d %p\n", mds, *slot);
-	ceph_put_mds_session(*slot);
-	*slot = NULL;
-}
-
-/*
- * Release the session reference held by a request, if it has one, and
- * clear r_session.
- *
- * should be last request ref, or hold mdsc->mutex
- */
-static void put_request_session(struct ceph_mds_request *req)
-{
-	if (!req->r_session)
-		return;
-
-	ceph_put_mds_session(req->r_session);
-	req->r_session = NULL;
-}
-
-/*
- * Drop a request reference.  The final put releases everything the
- * request holds: messages and parsed reply, inode/dentry references
- * and the CEPH_CAP_PIN refs taken alongside them, path strings, the
- * session reference and the cap reservation.
- */
-void ceph_mdsc_put_request(struct ceph_mds_request *req)
-{
-	dout("mdsc put_request %p %d -> %d\n", req,
-	     atomic_read(&req->r_ref), atomic_read(&req->r_ref)-1);
-
-	if (!atomic_dec_and_test(&req->r_ref))
-		return;
-
-	/* messages */
-	if (req->r_request)
-		ceph_msg_put(req->r_request);
-	if (req->r_reply) {
-		ceph_msg_put(req->r_reply);
-		destroy_reply_info(&req->r_reply_info);
-	}
-
-	/* inodes and dentries */
-	if (req->r_inode) {
-		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
-		iput(req->r_inode);
-	}
-	if (req->r_locked_dir)
-		ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
-				  CEPH_CAP_PIN);
-	if (req->r_target_inode)
-		iput(req->r_target_inode);
-	if (req->r_dentry)
-		dput(req->r_dentry);
-	if (req->r_old_dentry) {
-		struct inode *old_dir = req->r_old_dentry->d_parent->d_inode;
-
-		ceph_put_cap_refs(ceph_inode(old_dir), CEPH_CAP_PIN);
-		dput(req->r_old_dentry);
-	}
-
-	/* everything else */
-	kfree(req->r_path1);
-	kfree(req->r_path2);
-	put_request_session(req);
-	ceph_unreserve_caps(&req->r_caps_reservation);
-	kfree(req);
-}
-
-/*
- * lookup request by tid, bump ref if found.
- *
- * called under mdsc->mutex.
- */
-static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
-						 u64 tid)
-{
-	struct ceph_mds_request *found;
-
-	found = radix_tree_lookup(&mdsc->request_tree, tid);
-	if (!found)
-		return NULL;
-
-	ceph_mdsc_get_request(found);
-	return found;
-}
-
-/*
- * Register an in-flight request, and assign a tid.  Link to the
- * directory we are modifying (if any).
- *
- * The request tree holds its own reference on the request.
- *
- * Called under mdsc->mutex.
- */
-static void __register_request(struct ceph_mds_client *mdsc,
-			       struct ceph_mds_request *req,
-			       struct inode *dir)
-{
-	struct ceph_inode_info *dir_ci;
-
-	req->r_tid = ++mdsc->last_tid;
-	if (req->r_num_caps)
-		ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
-	dout("__register_request %p tid %lld\n", req, req->r_tid);
-	ceph_mdsc_get_request(req);
-	radix_tree_insert(&mdsc->request_tree, req->r_tid, (void *)req);
-
-	if (!dir)
-		return;
-
-	/* queue on the directory's list of unsafe operations */
-	dir_ci = ceph_inode(dir);
-	spin_lock(&dir_ci->i_unsafe_lock);
-	req->r_unsafe_dir = dir;
-	list_add_tail(&req->r_unsafe_dir_item, &dir_ci->i_unsafe_dirops);
-	spin_unlock(&dir_ci->i_unsafe_lock);
-}
-
-/*
- * Remove a request from the request tree and from its directory's
- * unsafe-ops list, then drop the reference the tree was holding.
- *
- * The put comes last: it may free the request, so req must not be
- * touched after it.
- */
-static void __unregister_request(struct ceph_mds_client *mdsc,
-				 struct ceph_mds_request *req)
-{
-	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-	radix_tree_delete(&mdsc->request_tree, req->r_tid);
-
-	if (req->r_unsafe_dir) {
-		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
-
-		spin_lock(&ci->i_unsafe_lock);
-		list_del_init(&req->r_unsafe_dir_item);
-		spin_unlock(&ci->i_unsafe_lock);
-	}
-
-	ceph_mdsc_put_request(req);
-}
-
-/*
- * Choose mds to send request to next. If there is a hint set in the
- * request (e.g., due to a prior forward hint from the mds), use that.
- * Otherwise, consult frag tree and/or caps to identify the
- * appropriate mds. If all else fails, choose randomly.
- *
- * Selection order, first match wins:
- *  1. req->r_resend_mds, if we have a session with it or the mdsmap
- *     reports a state > 0 for it;
- *  2. for a hashed lookup in a directory, the frag tree: a random
- *     replica (USE_ANY_MDS only) or the frag's auth mds;
- *  3. the mds of one of the inode's caps (the auth cap when the mode
- *     is USE_AUTH_MDS, else the first cap in i_caps);
- *  4. whatever ceph_mdsmap_get_random_mds() returns.
- *
- * Returns the chosen mds number.
- *
- * Called under mdsc->mutex.
- */
-static int __choose_mds(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
-{
- struct inode *inode;
- struct ceph_inode_info *ci;
- struct ceph_cap *cap;
- int mode = req->r_direct_mode;
- int mds = -1;
- u32 hash = req->r_direct_hash;
- bool is_hash = req->r_direct_is_hash;
-
- /*
- * is there a specific mds we should try? ignore hint if we have
- * no session and the mds is not up (active or recovering).
- */
- if (req->r_resend_mds >= 0 &&
- (__have_session(mdsc, req->r_resend_mds) ||
- ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
- dout("choose_mds using resend_mds mds%d\n",
- req->r_resend_mds);
- return req->r_resend_mds;
- }
-
- if (mode == USE_RANDOM_MDS)
- goto random;
-
- /*
- * pick the inode to base the decision on: the request's inode, else
- * the dentry's inode, else the dentry's parent directory (in which
- * case the dentry name hash selects the frag).
- */
- inode = NULL;
- if (req->r_inode) {
- inode = req->r_inode;
- } else if (req->r_dentry) {
- if (req->r_dentry->d_inode) {
- inode = req->r_dentry->d_inode;
- } else {
- inode = req->r_dentry->d_parent->d_inode;
- hash = req->r_dentry->d_name.hash;
- is_hash = true;
- }
- }
- dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
- (int)hash, mode);
- if (!inode)
- goto random;
- ci = ceph_inode(inode);
-
- if (is_hash && S_ISDIR(inode->i_mode)) {
- struct ceph_inode_frag frag;
- int found;
-
- ceph_choose_frag(ci, hash, &frag, &found);
- if (found) {
- if (mode == USE_ANY_MDS && frag.ndist > 0) {
- u8 r;
-
- /* choose a random replica */
- get_random_bytes(&r, 1);
- r %= frag.ndist;
- mds = frag.dist[r];
- /* NOTE(review): this prints frag.mds, not the replica just chosen */
- dout("choose_mds %p %llx.%llx "
- "frag %u mds%d (%d/%d)\n",
- inode, ceph_vinop(inode),
- frag.frag, frag.mds,
- (int)r, frag.ndist);
- return mds;
- }
-
- /* since this file/dir wasn't known to be
- * replicated, then we want to look for the
- * authoritative mds. */
- mode = USE_AUTH_MDS;
- if (frag.mds >= 0) {
- /* choose auth mds */
- mds = frag.mds;
- dout("choose_mds %p %llx.%llx "
- "frag %u mds%d (auth)\n",
- inode, ceph_vinop(inode), frag.frag, mds);
- return mds;
- }
- }
- }
-
- /* fall back to the caps we hold on the inode; i_lock covers the lookup */
- spin_lock(&inode->i_lock);
- cap = NULL;
- if (mode == USE_AUTH_MDS)
- cap = ci->i_auth_cap;
- if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
- cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
- if (!cap) {
- spin_unlock(&inode->i_lock);
- goto random;
- }
- mds = cap->session->s_mds;
- dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
- inode, ceph_vinop(inode), mds,
- cap == ci->i_auth_cap ? "auth " : "", cap);
- spin_unlock(&inode->i_lock);
- return mds;
-
-random:
- mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
- dout("choose_mds chose random mds%d\n", mds);
- return mds;
-}
-
-
-/*
- * session messages
- */
-/*
- * Build a CEPH_MSG_CLIENT_SESSION message carrying @op and @seq in
- * little-endian form.  Returns the message, or the ERR_PTR from
- * ceph_msg_new() on failure.
- */
-static struct ceph_msg *create_session_msg(u32 op, u64 seq)
-{
-	struct ceph_mds_session_head *head;
-	struct ceph_msg *m;
-
-	m = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*head), 0, 0, NULL);
-	if (IS_ERR(m)) {
-		pr_err("create_session_msg ENOMEM creating msg\n");
-		return m;	/* already an ERR_PTR */
-	}
-
-	head = m->front.iov_base;
-	head->op = cpu_to_le32(op);
-	head->seq = cpu_to_le64(seq);
-	return m;
-}
-
-/*
- * send session open request.
- *
- * Marks the session OPENING, stamps s_renew_requested, and queues a
- * REQUEST_OPEN message on the session's connection.  Returns 0, or
- * the error from create_session_msg() if the message could not be
- * built.
- *
- * called under mdsc->mutex
- */
-static int __open_session(struct ceph_mds_client *mdsc,
-			  struct ceph_mds_session *session)
-{
-	struct ceph_msg *msg;
-	int mstate;
-	int mds = session->s_mds;
-
-	/* wait for mds to go active? */
-	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
-	dout("open_session to mds%d (%s)\n", mds,
-	     ceph_mds_state_name(mstate));
-	session->s_state = CEPH_MDS_SESSION_OPENING;
-	session->s_renew_requested = jiffies;
-
-	/* send connect message */
-	msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);	/* report it rather than claim success */
-	ceph_con_send(&session->s_con, msg);
-	return 0;
-}
-
-/*
- * session caps
- */
-
-/*
- * Free preallocated cap messages assigned to this session
- *
- * Both queues (s_cap_releases, then s_cap_releases_done) are emptied
- * under s_cap_lock, dropping one message reference per entry.
- */
-static void cleanup_cap_releases(struct ceph_mds_session *session)
-{
-	struct list_head *queues[] = {
-		&session->s_cap_releases,
-		&session->s_cap_releases_done,
-	};
-	struct ceph_msg *m;
-	unsigned int q;
-
-	spin_lock(&session->s_cap_lock);
-	for (q = 0; q < sizeof(queues) / sizeof(queues[0]); q++) {
-		while (!list_empty(queues[q])) {
-			m = list_first_entry(queues[q],
-					     struct ceph_msg, list_head);
-			list_del_init(&m->list_head);
-			ceph_msg_put(m);
-		}
-	}
-	spin_unlock(&session->s_cap_lock);
-}
-
-/*
- * Helper to safely iterate over all caps associated with a session.
- *
- * @cb is called once per cap with (inode, cap, @arg).  It runs with
- * s_cap_lock released and with a temporary inode reference held.
- * Caps whose inode cannot be grabbed are skipped.  A negative return
- * from @cb stops the walk and is returned to the caller; otherwise
- * the result is 0.
- *
- * caller must hold session s_mutex
- */
-static int iterate_session_caps(struct ceph_mds_session *session,
- int (*cb)(struct inode *, struct ceph_cap *,
- void *), void *arg)
-{
- struct ceph_cap *cap, *ncap;
- struct inode *inode;
- int ret;
-
- dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
- spin_lock(&session->s_cap_lock);
- /*
- * NOTE(review): s_cap_lock is dropped around cb(), and the _safe
- * iterator only tolerates removal of the current entry. Presumably
- * s_mutex keeps ncap on the list meanwhile -- confirm.
- */
- list_for_each_entry_safe(cap, ncap, &session->s_caps, session_caps) {
- inode = igrab(&cap->ci->vfs_inode);
- if (!inode)
- continue;
- spin_unlock(&session->s_cap_lock);
- ret = cb(inode, cap, arg);
- iput(inode);
- /* the lock is not held here, so returning directly is balanced */
- if (ret < 0)
- return ret;
- spin_lock(&session->s_cap_lock);
- }
- spin_unlock(&session->s_cap_lock);
-
- return 0;
-}
-
-/* iterate_session_caps() callback: remove the cap.  Always returns 0. */
-static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
-				  void *arg)
-{
-	struct ceph_inode_info *cinfo = ceph_inode(inode);
-
-	dout("removing cap %p, ci is %p, inode is %p\n",
-	     cap, cinfo, &cinfo->vfs_inode);
-	ceph_remove_cap(cap);
-
-	return 0;
-}
-
-/*
- * Remove every cap belonging to the session, then free its
- * preallocated cap release messages.  It is a BUG if any cap is still
- * counted afterwards.
- *
- * caller must hold session s_mutex
- */
-static void remove_session_caps(struct ceph_mds_session *session)
-{
-	dout("remove_session_caps on %p\n", session);
-
-	iterate_session_caps(session, remove_session_caps_cb, NULL);
-	BUG_ON(session->s_nr_caps > 0);
-
-	cleanup_cap_releases(session);
-}
-
-/*
- * wake up any threads waiting on this session's caps.  if the cap is
- * old (didn't get renewed on the client reconnect), remove it now.
- *
- * @arg is the session.  Runs under the inode's i_lock; always
- * returns 0.
- *
- * caller must hold s_mutex.
- */
-static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
-			      void *arg)
-{
-	struct ceph_mds_session *s = arg;
-	int stale;
-
-	spin_lock(&inode->i_lock);
-	stale = cap->gen != s->s_cap_gen;
-	if (stale) {
-		pr_err("failed reconnect %p %llx.%llx cap %p "
-		       "(gen %d < session %d)\n", inode, ceph_vinop(inode),
-		       cap, cap->gen, s->s_cap_gen);
-		__ceph_remove_cap(cap, NULL);
-	}
-	wake_up(&ceph_inode(inode)->i_cap_wq);
-	spin_unlock(&inode->i_lock);
-
-	return 0;
-}
-
-/* run wake_up_session_cb() over every cap of the session */
-static void wake_up_session_caps(struct ceph_mds_session *session)
-{
-	int mds = session->s_mds;
-
-	dout("wake_up_session_caps %p mds%d\n", session, mds);
-	iterate_session_caps(session, wake_up_session_cb, session);
-}
-
-/*
- * Send periodic message to MDS renewing all currently held caps.  The
- * ack will reset the expiration for all caps from this session.
- *
- * Nothing is sent while the mds state is below
- * CEPH_MDS_STATE_RECONNECT.  Returns 0, or the error from building
- * the message.
- *
- * caller holds s_mutex
- */
-static int send_renew_caps(struct ceph_mds_client *mdsc,
-			   struct ceph_mds_session *session)
-{
-	struct ceph_msg *renew;
-	int mds_state;
-
-	/* note the transition to stale (ttl passed, no renewal pending) */
-	if (time_after_eq(jiffies, session->s_cap_ttl) &&
-	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
-		pr_info("mds%d caps stale\n", session->s_mds);
-
-	/* do not try to renew caps until a recovering mds has reconnected
-	 * with its clients. */
-	mds_state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
-	if (mds_state < CEPH_MDS_STATE_RECONNECT) {
-		dout("send_renew_caps ignoring mds%d (%s)\n",
-		     session->s_mds, ceph_mds_state_name(mds_state));
-		return 0;
-	}
-
-	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
-	     ceph_mds_state_name(mds_state));
-	session->s_renew_requested = jiffies;
-
-	renew = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
-				   ++session->s_renew_seq);
-	if (IS_ERR(renew))
-		return PTR_ERR(renew);
-
-	ceph_con_send(&session->s_con, renew);
-	return 0;
-}
-
-/*
- * Note new cap ttl, and any transition from stale -> not stale (fresh?).
- *
- * The new ttl is the time of the last renew request plus the mdsmap's
- * session timeout.  When a renewal (@is_renew) brings previously
- * stale caps back within their ttl, waiters on the session's caps are
- * woken after s_cap_lock is released.
- */
-static void renewed_caps(struct ceph_mds_client *mdsc,
-			 struct ceph_mds_session *session, int is_renew)
-{
-	int was_stale;
-	int wake = 0;
-
-	spin_lock(&session->s_cap_lock);
-	was_stale = is_renew && (session->s_cap_ttl == 0 ||
-				 time_after_eq(jiffies, session->s_cap_ttl));
-
-	session->s_cap_ttl = session->s_renew_requested +
-		mdsc->mdsmap->m_session_timeout*HZ;
-
-	if (was_stale) {
-		if (time_before(jiffies, session->s_cap_ttl)) {
-			pr_info("mds%d caps renewed\n", session->s_mds);
-			wake = 1;
-		} else {
-			pr_info("mds%d caps still stale\n", session->s_mds);
-		}
-	}
-	/* now before the ttl means the caps are fresh, not stale */
-	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
-	     session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
-	     time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
-	spin_unlock(&session->s_cap_lock);
-
-	if (wake)
-		wake_up_session_caps(session);
-}
-
-/*
- * send a session close request
- *
- * Returns 0 once the message is queued, or the error from building it.
- */
-static int request_close_session(struct ceph_mds_client *mdsc,
-				 struct ceph_mds_session *session)
-{
-	struct ceph_msg *close_msg;
-
-	dout("request_close_session mds%d state %s seq %lld\n",
-	     session->s_mds, session_state_name(session->s_state),
-	     session->s_seq);
-
-	close_msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
-				       session->s_seq);
-	if (IS_ERR(close_msg))
-		return PTR_ERR(close_msg);
-
-	ceph_con_send(&session->s_con, close_msg);
-	return 0;
-}
-
-/*
- * Move the session to CLOSING and ask the mds to close it.  Does
- * nothing (returns 0) if the state is already CLOSING or beyond.
- *
- * Called with s_mutex held.
- */
-static int __close_session(struct ceph_mds_client *mdsc,
-			   struct ceph_mds_session *session)
-{
-	if (session->s_state < CEPH_MDS_SESSION_CLOSING) {
-		session->s_state = CEPH_MDS_SESSION_CLOSING;
-		return request_close_session(mdsc, session);
-	}
-	return 0;
-}
-
-/*
- * Trim old(er) caps.
- *
- * Because we can't cache an inode without one or more caps, we do
- * this indirectly: if a cap is unused, we prune its aliases, at which
- * point the inode will hopefully get dropped too.
- *
- * Yes, this is a bit sloppy.  Our only real goal here is to respond to
- * memory pressure from the MDS, though, so it needn't be perfect.
- *
- * @arg is the session; its s_trim_caps counts how many caps are still
- * to be trimmed.  Returns -1 once that budget is used up (which stops
- * the iteration), 0 otherwise.
- */
-static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
-{
-	struct ceph_mds_session *s = arg;
-	struct ceph_inode_info *cinfo = ceph_inode(inode);
-	int in_use, others, ours;
-
-	if (s->s_trim_caps <= 0)
-		return -1;
-
-	spin_lock(&inode->i_lock);
-	ours = cap->issued | cap->implemented;
-	in_use = __ceph_caps_used(cinfo);
-	others = __ceph_caps_issued_other(cinfo, cap);
-
-	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
-	     inode, cap, ceph_cap_string(ours), ceph_cap_string(others),
-	     ceph_cap_string(in_use));
-	if (cinfo->i_dirty_caps)
-		goto unlock;   /* dirty caps */
-	if ((in_use & ~others) & ours)
-		goto unlock;   /* we need these caps */
-
-	s->s_trim_caps--;
-	if (others) {
-		/* we aren't the only cap.. just remove us */
-		__ceph_remove_cap(cap, NULL);
-		goto unlock;
-	}
-
-	/* try to drop referring dentries */
-	spin_unlock(&inode->i_lock);
-	d_prune_aliases(inode);
-	dout("trim_caps_cb %p cap %p pruned, count now %d\n",
-	     inode, cap, atomic_read(&inode->i_count));
-	return 0;
-
-unlock:
-	spin_unlock(&inode->i_lock);
-	return 0;
-}
-
-/*
- * Trim session cap count down to some max number.
- *
- * The excess over @max_caps becomes the session's s_trim_caps budget,
- * which trim_caps_cb() consumes.  Always returns 0.
- */
-static int trim_caps(struct ceph_mds_client *mdsc,
-		     struct ceph_mds_session *session,
-		     int max_caps)
-{
-	int excess = session->s_nr_caps - max_caps;
-
-	dout("trim_caps mds%d start: %d / %d, trim %d\n",
-	     session->s_mds, session->s_nr_caps, max_caps, excess);
-	if (excess <= 0)
-		return 0;
-
-	session->s_trim_caps = excess;
-	iterate_session_caps(session, trim_caps_cb, session);
-	dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
-	     session->s_mds, session->s_nr_caps, max_caps,
-	     excess - session->s_trim_caps);
-	return 0;
-}
-
-/*
- * Allocate cap_release messages. If there is a partially full message
- * in the queue, try to allocate enough to cover its remainder, so that
- * we can send it immediately.
- *
- * Called under s_mutex.
- */
-static int add_cap_releases(struct ceph_mds_client *mdsc,
-			    struct ceph_mds_session *session,
-			    int extra)
-{
-	struct ceph_msg *msg;
-	struct ceph_mds_cap_release *head;
-	int err = -ENOMEM;
-
-	/* a negative @extra selects the mount-time safety margin */
-	if (extra < 0)
-		extra = mdsc->client->mount_args.cap_release_safety;
-
-	spin_lock(&session->s_cap_lock);
-
-	/* also cover whatever room is left in a partially filled message */
-	if (!list_empty(&session->s_cap_releases)) {
-		msg = list_first_entry(&session->s_cap_releases,
-				       struct ceph_msg,
-				       list_head);
-		head = msg->front.iov_base;
-		extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
-	}
-
-	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
-		/* allocate with s_cap_lock dropped */
-		spin_unlock(&session->s_cap_lock);
-		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
-				   0, 0, NULL);
-		/*
-		 * The other ceph_msg_new() callers in this file test the
-		 * result with IS_ERR(), so a failure is an ERR_PTR rather
-		 * than NULL.  Check both so we never dereference an error
-		 * pointer below.
-		 */
-		if (!msg)
-			goto out_unlocked;
-		if (IS_ERR(msg)) {
-			err = PTR_ERR(msg);
-			goto out_unlocked;
-		}
-		dout("add_cap_releases %p msg %p now %d\n", session, msg,
-		     (int)msg->front.iov_len);
-		head = msg->front.iov_base;
-		head->num = cpu_to_le32(0);
-		msg->front.iov_len = sizeof(*head);
-		spin_lock(&session->s_cap_lock);
-		list_add(&msg->list_head, &session->s_cap_releases);
-		session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
-	}
-
-	/*
-	 * If the message at the head of the list already carries some
-	 * releases, move it to the done list and stop counting its unused
-	 * slots as available.
-	 */
-	if (!list_empty(&session->s_cap_releases)) {
-		msg = list_first_entry(&session->s_cap_releases,
-				       struct ceph_msg,
-				       list_head);
-		head = msg->front.iov_base;
-		if (head->num) {
-			dout(" queueing non-full %p (%d)\n", msg,
-			     le32_to_cpu(head->num));
-			list_move_tail(&msg->list_head,
-				       &session->s_cap_releases_done);
-			session->s_num_cap_releases -=
-				CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
-		}
-	}
-	err = 0;
-	spin_unlock(&session->s_cap_lock);
-out_unlocked:
-	return err;
-}
-
-/*
- * Check whether cap flushes up through want_flush_seq have completed.
- * Nothing is written out here; this only inspects per-session state.
- *
- * For each registered session, the first inode on s_cap_flushing is
- * examined (under s_mutex and the inode's i_lock).  If its
- * i_cap_flush_seq is <= want_flush_seq we are still flushing.
- * NOTE(review): this presumably relies on s_cap_flushing being ordered
- * by flush seq -- confirm where the list is populated.
- *
- * Returns 1 if no session is still flushing at or below
- * want_flush_seq, 0 otherwise.  mdsc->mutex is taken and released
- * internally; it is not held on return.
- */
-static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
-{
- int mds, ret = 1;
-
- dout("check_cap_flush want %lld\n", want_flush_seq);
- mutex_lock(&mdsc->mutex);
- for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
- struct ceph_mds_session *session = mdsc->sessions[mds];
-
- if (!session)
- continue;
- /* pin the session, then swap mdsc->mutex for s_mutex */
- get_session(session);
- mutex_unlock(&mdsc->mutex);
-
- mutex_lock(&session->s_mutex);
- if (!list_empty(&session->s_cap_flushing)) {
- struct ceph_inode_info *ci =
- list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item);
- struct inode *inode = &ci->vfs_inode;
-
- spin_lock(&inode->i_lock);
- if (ci->i_cap_flush_seq <= want_flush_seq) {
- dout("check_cap_flush still flushing %p "
- "seq %lld <= %lld to mds%d\n", inode,
- ci->i_cap_flush_seq, want_flush_seq,
- session->s_mds);
- ret = 0;
- }
- spin_unlock(&inode->i_lock);
- }
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
-
- /* bail out early; mdsc->mutex was already dropped above */
- if (!ret)
- return ret;
- mutex_lock(&mdsc->mutex);
- }
-
- mutex_unlock(&mdsc->mutex);
- dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
- return ret;
-}
-
-/*
- * called under s_mutex
- */
-static void send_cap_releases(struct ceph_mds_client *mdsc,
-			      struct ceph_mds_session *session)
-{
-	struct ceph_msg *msg;
-
-	dout("send_cap_releases mds%d\n", session->s_mds);
-
-	/*
-	 * Drain s_cap_releases_done one message at a time.  s_cap_lock
-	 * guards the list only; it is dropped around each send.
-	 */
-	spin_lock(&session->s_cap_lock);
-	while (!list_empty(&session->s_cap_releases_done)) {
-		msg = list_first_entry(&session->s_cap_releases_done,
-				       struct ceph_msg, list_head);
-		list_del_init(&msg->list_head);
-		spin_unlock(&session->s_cap_lock);
-
-		/* publish the final front length in the header */
-		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
-		ceph_con_send(&session->s_con, msg);
-
-		spin_lock(&session->s_cap_lock);
-	}
-	spin_unlock(&session->s_cap_lock);
-}
-
-/*
- * requests
- */
-
-/*
- * Create an mds request.
- */
-struct ceph_mds_request *
-ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
-{
-	struct ceph_mds_request *req;
-
-	req = kzalloc(sizeof(*req), GFP_NOFS);
-	if (req == NULL)
-		return ERR_PTR(-ENOMEM);
-
-	/* operation and direct mode exactly as supplied by the caller */
-	req->r_op = op;
-	req->r_direct_mode = mode;
-
-	req->r_started = jiffies;
-	req->r_resend_mds = -1;
-	req->r_fmode = -1;
-
-	/*
-	 * Start with a single reference.
-	 * NOTE(review): presumably this one is the caller's and
-	 * registration takes its own -- confirm in __register_request().
-	 */
-	atomic_set(&req->r_ref, 1);
-
-	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
-	INIT_LIST_HEAD(&req->r_wait);
-	INIT_LIST_HEAD(&req->r_unsafe_item);
-	init_completion(&req->r_completion);
-	init_completion(&req->r_safe_completion);
-
-	return req;
-}
-
-/*
- * return oldest (lowest) tid in request tree, 0 if none.
- *
- * called under mdsc->mutex.
- */
-static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
-{
-	struct ceph_mds_request *oldest;
-
-	/* fetch at most one entry, starting the search from index 0 */
-	return radix_tree_gang_lookup(&mdsc->request_tree,
-				      (void **)&oldest, 0, 1) <= 0 ?
-		0 : oldest->r_tid;
-}
-
-/*
- * Build a dentry's path. Allocate on heap; caller must kfree. Based
- * on build_path_from_dentry in fs/cifs/dir.c.
- *
- * If @stop_on_nosnap, generate path relative to the first non-snapped
- * inode.
- *
- * Encode hidden .snap dirs as a double /, i.e.
- * foo/.snap/bar -> foo//bar
- */
-/*
- * @dentry: leaf to build the path for; NULL yields ERR_PTR(-EINVAL)
- * @plen: out, length of the returned path (excluding the trailing NUL)
- * @base: out, ceph ino of the dentry at which the walk stopped
- * @stop_on_nosnap: stop at the first ancestor whose inode is CEPH_NOSNAP
- *
- * Returns a kmalloc'ed, NUL-terminated path or an ERR_PTR.  *plen and
- * *base are written only on success.
- */
-char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
- int stop_on_nosnap)
-{
- struct dentry *temp;
- char *path;
- int len, pos;
-
- if (dentry == NULL)
- return ERR_PTR(-EINVAL);
-
-retry:
- /* pass 1: walk towards the root, summing up the needed length */
- len = 0;
- for (temp = dentry; !IS_ROOT(temp);) {
- struct inode *inode = temp->d_inode;
- if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
- len++; /* slash only */
- else if (stop_on_nosnap && inode &&
- ceph_snap(inode) == CEPH_NOSNAP)
- break;
- else
- len += 1 + temp->d_name.len;
- temp = temp->d_parent;
- if (temp == NULL) {
- pr_err("build_path_dentry corrupt dentry %p\n", dentry);
- return ERR_PTR(-EINVAL);
- }
- }
- if (len)
- len--; /* no leading '/' */
-
- path = kmalloc(len+1, GFP_NOFS);
- if (path == NULL)
- return ERR_PTR(-ENOMEM);
- /* pass 2: fill the buffer from the end back towards the start */
- pos = len;
- path[pos] = 0; /* trailing null */
- for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
- struct inode *inode = temp->d_inode;
-
- if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("build_path_dentry path+%d: %p SNAPDIR\n",
- pos, temp);
- } else if (stop_on_nosnap && inode &&
- ceph_snap(inode) == CEPH_NOSNAP) {
- break;
- } else {
- pos -= temp->d_name.len;
- /* name grew since pass 1; caught by the pos check below */
- if (pos < 0)
- break;
- /* exact-length copy; no NUL is appended here */
- strncpy(path + pos, temp->d_name.name,
- temp->d_name.len);
- dout("build_path_dentry path+%d: %p '%.*s'\n",
- pos, temp, temp->d_name.len, path + pos);
- }
- if (pos)
- path[--pos] = '/';
- temp = temp->d_parent;
- if (temp == NULL) {
- pr_err("build_path_dentry corrupt dentry\n");
- kfree(path);
- return ERR_PTR(-EINVAL);
- }
- }
- /* the two passes disagreed on the length: start over */
- if (pos != 0) {
- pr_err("build_path_dentry did not end path lookup where "
- "expected, namelen is %d, pos is %d\n", len, pos);
- /* presumably this is only possible if racing with a
- rename of one of the parent directories (we can not
- lock the dentries above us to prevent this, but
- retrying should be harmless) */
- kfree(path);
- goto retry;
- }
-
- *base = ceph_ino(temp->d_inode);
- *plen = len;
- dout("build_path_dentry on %p %d built %llx '%.*s'\n",
- dentry, atomic_read(&dentry->d_count), *base, len, path);
- return path;
-}
-
-/*
- * Describe @dentry as (ino, path).  When the parent directory is not
- * snapped, the parent's ino plus the dentry's own name is used and
- * nothing is allocated.  Otherwise a path is built with
- * ceph_mdsc_build_path() and *pfreepath is set so the caller kfrees it.
- */
-static int build_dentry_path(struct dentry *dentry,
-			     const char **ppath, int *ppathlen, u64 *pino,
-			     int *pfreepath)
-{
-	struct inode *dir = dentry->d_parent->d_inode;
-	char *built;
-
-	if (ceph_snap(dir) != CEPH_NOSNAP) {
-		built = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
-		if (IS_ERR(built))
-			return PTR_ERR(built);
-		*ppath = built;
-		*pfreepath = 1;
-		return 0;
-	}
-
-	*pino = ceph_ino(dir);
-	*ppath = dentry->d_name.name;
-	*ppathlen = dentry->d_name.len;
-	return 0;
-}
-
-/*
- * Describe @inode as (ino, path).  A non-snapped inode is identified by
- * its ino alone, with a zero-length path (*ppath is left untouched).
- * Otherwise a path is built from one of the inode's aliases and
- * *pfreepath is set so the caller kfrees it.
- */
-static int build_inode_path(struct inode *inode,
-			    const char **ppath, int *ppathlen, u64 *pino,
-			    int *pfreepath)
-{
-	struct dentry *alias;
-	char *built;
-
-	if (ceph_snap(inode) != CEPH_NOSNAP) {
-		/* a NULL alias is turned into -EINVAL by the path builder */
-		alias = d_find_alias(inode);
-		built = ceph_mdsc_build_path(alias, ppathlen, pino, 1);
-		dput(alias);
-		if (IS_ERR(built))
-			return PTR_ERR(built);
-		*ppath = built;
-		*pfreepath = 1;
-		return 0;
-	}
-
-	*pino = ceph_ino(inode);
-	*ppathlen = 0;
-	return 0;
-}
-
-/*
- * request arguments may be specified via an inode *, a dentry *, or
- * an explicit ino+path.
- */
-static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
-				 const char *rpath, u64 rino,
-				 const char **ppath, int *pathlen,
-				 u64 *ino, int *freepath)
-{
-	int r;
-
-	/* precedence: inode, then dentry, then explicit ino+path */
-	if (rinode) {
-		r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
-		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
-		     ceph_snap(rinode));
-		return r;
-	}
-
-	if (rdentry) {
-		r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
-		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
-		     *ppath);
-		return r;
-	}
-
-	if (rpath) {
-		*ino = rino;
-		*ppath = rpath;
-		*pathlen = strlen(rpath);
-		dout(" path %.*s\n", *pathlen, rpath);
-	}
-
-	/* nothing given (or explicit path): outputs otherwise untouched */
-	return 0;
-}
-
-/*
- * Build the CEPH_MSG_CLIENT_REQUEST message for @req, addressed to @mds.
- *
- * The front section is laid out as: request head, two encoded
- * filepaths (ino + path), then zero or more cap/dentry release records.
- * The request's page vector, if any, is attached as the data payload.
- *
- * Returns the new message, or an ERR_PTR if a path could not be built
- * or the message could not be allocated.  Any temporary path buffers
- * are freed before returning.
- *
- * called under mdsc->mutex
- */
-static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req,
- int mds)
-{
- struct ceph_msg *msg;
- struct ceph_mds_request_head *head;
- const char *path1 = NULL;
- const char *path2 = NULL;
- u64 ino1 = 0, ino2 = 0;
- int pathlen1 = 0, pathlen2 = 0;
- int freepath1 = 0, freepath2 = 0;
- int len;
- u16 releases;
- void *p, *end;
- int ret;
-
- /* primary target: r_inode, else r_dentry, else r_path1/r_ino1 */
- ret = set_request_path_attr(req->r_inode, req->r_dentry,
- req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1);
- if (ret < 0) {
- msg = ERR_PTR(ret);
- goto out;
- }
-
- /* secondary target: r_old_dentry, else r_path2/r_ino2 */
- ret = set_request_path_attr(NULL, req->r_old_dentry,
- req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2);
- if (ret < 0) {
- msg = ERR_PTR(ret);
- goto out_free1;
- }
-
- len = sizeof(*head) +
- pathlen1 + pathlen2 + 2*(sizeof(u32) + sizeof(u64));
-
- /* calculate (max) length for cap releases */
- len += sizeof(struct ceph_mds_request_release) *
- (!!req->r_inode_drop + !!req->r_dentry_drop +
- !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
- if (req->r_dentry_drop)
- len += req->r_dentry->d_name.len;
- if (req->r_old_dentry_drop)
- len += req->r_old_dentry->d_name.len;
-
- /* on failure msg is an ERR_PTR and is returned to the caller as is */
- msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
- if (IS_ERR(msg))
- goto out_free2;
-
- head = msg->front.iov_base;
- p = msg->front.iov_base + sizeof(*head);
- end = msg->front.iov_base + msg->front.iov_len;
-
- head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
- head->op = cpu_to_le32(req->r_op);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 29)
- head->caller_uid = cpu_to_le32(current_fsuid());
- head->caller_gid = cpu_to_le32(current_fsgid());
-#else
- head->caller_uid = cpu_to_le32(current->fsuid);
- head->caller_gid = cpu_to_le32(current->fsgid);
-#endif
- head->args = req->r_args;
-
- ceph_encode_filepath(&p, end, ino1, path1);
- ceph_encode_filepath(&p, end, ino2, path2);
-
- /* cap releases */
- releases = 0;
- if (req->r_inode_drop)
- releases += ceph_encode_inode_release(&p,
- req->r_inode ? req->r_inode : req->r_dentry->d_inode,
- mds, req->r_inode_drop, req->r_inode_unless, 0);
- if (req->r_dentry_drop)
- releases += ceph_encode_dentry_release(&p, req->r_dentry,
- mds, req->r_dentry_drop, req->r_dentry_unless);
- if (req->r_old_dentry_drop)
- releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
- mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
- if (req->r_old_inode_drop)
- releases += ceph_encode_inode_release(&p,
- req->r_old_dentry->d_inode,
- mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
- head->num_releases = cpu_to_le16(releases);
-
- /* len above was a maximum; shrink the front to what was written */
- BUG_ON(p > end);
- msg->front.iov_len = p - msg->front.iov_base;
- msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-
- msg->pages = req->r_pages;
- msg->nr_pages = req->r_num_pages;
- msg->hdr.data_len = cpu_to_le32(req->r_data_len);
- msg->hdr.data_off = cpu_to_le16(0);
-
-out_free2:
- if (freepath2)
- kfree((char *)path2);
-out_free1:
- if (freepath1)
- kfree((char *)path1);
-out:
- return msg;
-}
-
-/*
- * called under mdsc->mutex if error, under no mutex if
- * success.
- */
-static void complete_request(struct ceph_mds_client *mdsc,
-			     struct ceph_mds_request *req)
-{
-	/* without a callback, wake whoever waits on r_completion */
-	if (!req->r_callback) {
-		complete(&req->r_completion);
-		return;
-	}
-	req->r_callback(mdsc, req);
-}
-
-/*
- * called under mdsc->mutex
- */
-/*
- * (Re)build req->r_request for sending to @mds and fill in the
- * per-attempt header fields (tid, oldest tid, flags, forward/retry
- * counts, target ino).  Any previously built message is dropped first.
- *
- * Returns 0 on success.  If the message cannot be built, the request
- * is completed with the error stored in r_reply and a negative errno
- * is returned.
- */
-static int __prepare_send_request(struct ceph_mds_client *mdsc,
-				  struct ceph_mds_request *req,
-				  int mds)
-{
-	struct ceph_mds_request_head *rhead;
-	struct ceph_msg *msg;
-	int flags = 0;
-
-	req->r_mds = mds;
-	req->r_attempts++;
-	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
-	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
-
-	/* drop the message from an earlier attempt, if any */
-	if (req->r_request) {
-		ceph_msg_put(req->r_request);
-		req->r_request = NULL;
-	}
-	msg = create_request_message(mdsc, req, mds);
-	if (IS_ERR(msg)) {
-		req->r_reply = ERR_PTR(PTR_ERR(msg));
-		complete_request(mdsc, req);
-		/*
-		 * PTR_ERR() is already negative.  Negating it handed a
-		 * positive value to callers, which __do_request() then
-		 * passed on as its own return value.
-		 */
-		return PTR_ERR(msg);
-	}
-	req->r_request = msg;
-
-	rhead = msg->front.iov_base;
-	rhead->tid = cpu_to_le64(req->r_tid);
-	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
-	if (req->r_got_unsafe)
-		flags |= CEPH_MDS_FLAG_REPLAY;
-	if (req->r_locked_dir)
-		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
-	rhead->flags = cpu_to_le32(flags);
-	rhead->num_fwd = req->r_num_fwd;
-	rhead->num_retry = req->r_attempts - 1;
-
-	dout(" r_locked_dir = %p\n", req->r_locked_dir);
-
-	/* the target ino is only sent once an unsafe reply has been seen */
-	if (req->r_target_inode && req->r_got_unsafe)
-		rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
-	else
-		rhead->ino = 0;
-	return 0;
-}
-
-/*
- * send request, or put it on the appropriate wait list.
- */
-static int __do_request(struct ceph_mds_client *mdsc,
-			struct ceph_mds_request *req)
-{
-	struct ceph_mds_session *session;
-	int mds;
-	int err = -EAGAIN;
-
-	/* a reply (or an error) has already been recorded */
-	if (req->r_reply)
-		return err;
-
-	/* past its deadline: fail the request with -EIO */
-	if (req->r_timeout &&
-	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
-		dout("do_request timed out\n");
-		err = -EIO;
-		req->r_reply = ERR_PTR(err);
-		complete_request(mdsc, req);
-		return err;
-	}
-
-	/* no usable mds yet: park the request until a new map arrives */
-	mds = __choose_mds(mdsc, req);
-	if (mds < 0 ||
-	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
-		dout("do_request no mds or not active, waiting for map\n");
-		list_add(&req->r_wait, &mdsc->waiting_for_map);
-		return err;
-	}
-
-	/* get, open session */
-	session = __ceph_lookup_mds_session(mdsc, mds);
-	if (!session)
-		session = register_session(mdsc, mds);
-	dout("do_request mds%d session %p state %s\n", mds, session,
-	     session_state_name(session->s_state));
-
-	if (session->s_state == CEPH_MDS_SESSION_OPEN ||
-	    session->s_state == CEPH_MDS_SESSION_HUNG) {
-		/* send request */
-		req->r_session = get_session(session);
-		req->r_resend_mds = -1;   /* forget any previous mds hint */
-
-		if (req->r_request_started == 0) /* note request start time */
-			req->r_request_started = jiffies;
-
-		err = __prepare_send_request(mdsc, req, mds);
-		if (!err) {
-			ceph_msg_get(req->r_request);
-			ceph_con_send(&session->s_con, req->r_request);
-		}
-	} else {
-		/* session not usable yet: (re)open if needed and wait on it */
-		if (session->s_state == CEPH_MDS_SESSION_NEW ||
-		    session->s_state == CEPH_MDS_SESSION_CLOSING)
-			__open_session(mdsc, session);
-		list_add(&req->r_wait, &session->s_waiting);
-	}
-
-	ceph_put_mds_session(session);
-	return err;
-}
-
-/*
- * Take every request off the wait list @head and run it through
- * __do_request() again.
- *
- * called under mdsc->mutex
- */
-static void __wake_requests(struct ceph_mds_client *mdsc,
- struct list_head *head)
-{
- struct ceph_mds_request *req, *nreq;
-
- /*
- * The _safe iterator is required: each entry is unlinked here, and
- * __do_request() may queue it on a wait list again.
- */
- list_for_each_entry_safe(req, nreq, head, r_wait) {
- list_del_init(&req->r_wait);
- __do_request(mdsc, req);
- }
-}
-
-/*
- * Wake up threads with requests pending for @mds, so that they can
- * resubmit their requests to a possibly different mds. If @all is set,
- * wake up if their requests have been forwarded to @mds, too.
- */
-static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
-{
-	struct ceph_mds_request *reqs[10];
-	u64 nexttid = 0;
-	int i, got;
-
-	/* NOTE(review): @all is accepted but never consulted below */
-	dout("kick_requests mds%d\n", mds);
-
-	/* walk the request tree in batches of up to 10, ordered by tid */
-	for (;;) {
-		if (nexttid > mdsc->last_tid)
-			break;
-		got = radix_tree_gang_lookup(&mdsc->request_tree,
-					     (void **)&reqs, nexttid, 10);
-		if (got == 0)
-			break;
-
-		/* resume after the last tid of this batch */
-		nexttid = reqs[got-1]->r_tid + 1;
-
-		for (i = 0; i < got; i++) {
-			struct ceph_mds_request *req = reqs[i];
-
-			/* skip unsafe-replied requests and other sessions */
-			if (req->r_got_unsafe || !req->r_session ||
-			    req->r_session->s_mds != mds)
-				continue;
-
-			dout(" kicking tid %llu\n", req->r_tid);
-			put_request_session(req);
-			__do_request(mdsc, req);
-		}
-	}
-}
-
-/*
- * Register @req and make a first attempt at sending it, without
- * waiting for a reply.  The return value of __do_request() is not
- * propagated; completion is reported via complete_request().
- */
-void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
-{
- dout("submit_request on %p\n", req);
- /* registration and the send attempt both happen under mdsc->mutex */
- mutex_lock(&mdsc->mutex);
- __register_request(mdsc, req, NULL);
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
-}
-
-/*
- * Synchronously perform an mds request. Take care of all of the
- * session setup, forwarding, retry details.
- */
-/*
- * @dir is handed to __register_request() unchanged.
- *
- * Returns the error stored in r_reply if the request failed locally,
- * else r_err if set, else the result field of the MDS reply head.
- */
-int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
- struct inode *dir,
- struct ceph_mds_request *req)
-{
- int err;
-
- dout("do_request on %p\n", req);
-
- /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
- if (req->r_inode)
- ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
- if (req->r_locked_dir)
- ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
- if (req->r_old_dentry)
- ceph_get_cap_refs(
- ceph_inode(req->r_old_dentry->d_parent->d_inode),
- CEPH_CAP_PIN);
-
- /* issue */
- mutex_lock(&mdsc->mutex);
- __register_request(mdsc, req, dir);
- __do_request(mdsc, req);
-
- /* wait */
- if (!req->r_reply) {
- /* sleep without mdsc->mutex; it is retaken below */
- mutex_unlock(&mdsc->mutex);
- if (req->r_timeout) {
- /* 0 means the wait timed out, >0 means completed in time */
- err = wait_for_completion_timeout(&req->r_completion,
- req->r_timeout);
- if (err > 0)
- err = 0;
- else if (err == 0)
- req->r_reply = ERR_PTR(-EIO);
- } else {
- wait_for_completion(&req->r_completion);
- }
- mutex_lock(&mdsc->mutex);
- }
-
- if (IS_ERR(req->r_reply)) {
- /* local failure: r_reply carries the errno, not a message */
- err = PTR_ERR(req->r_reply);
- req->r_reply = NULL;
-
- /* clean up */
- __unregister_request(mdsc, req);
- if (!list_empty(&req->r_unsafe_item))
- list_del_init(&req->r_unsafe_item);
- complete(&req->r_safe_completion);
- } else if (req->r_err) {
- err = req->r_err;
- } else {
- err = le32_to_cpu(req->r_reply_info.head->result);
- }
- mutex_unlock(&mdsc->mutex);
-
- dout("do_request %p done, result %d\n", req, err);
- return err;
-}
-
-/*
- * Handle mds reply.
- *
- * We take the session mutex and parse and process the reply immediately.
- * This preserves the logical ordering of replies, capabilities, etc., sent
- * by the MDS as they are applied to our local cache.
- */
-static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
-{
-	struct ceph_mds_client *mdsc = session->s_mdsc;
-	struct ceph_mds_request *req;
-	struct ceph_mds_reply_head *head = msg->front.iov_base;
-	struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
-	u64 tid;
-	int err, result;
-	int mds;
-
-	/* only an MDS may answer, and the front must hold a full head */
-	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
-		return;
-	if (msg->front.iov_len < sizeof(*head)) {
-		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
-		return;
-	}
-
-	/* get request, session */
-	tid = le64_to_cpu(head->tid);
-	mutex_lock(&mdsc->mutex);
-	req = __lookup_request(mdsc, tid);
-	if (!req) {
-		dout("handle_reply on unknown tid %llu\n", tid);
-		mutex_unlock(&mdsc->mutex);
-		return;
-	}
-	dout("handle_reply %p\n", req);
-	mds = le64_to_cpu(msg->hdr.src.name.num);
-
-	/*
-	 * correct session?  Reject the reply unless it arrived on the
-	 * session the request is currently bound to.  (The previous test,
-	 * "!r_session && r_session != session", only fired when r_session
-	 * was NULL, so replies on the wrong session were accepted.)
-	 */
-	if (req->r_session != session) {
-		pr_err("mdsc_handle_reply got %llu on session mds%d"
-		       " not mds%d\n", tid, session->s_mds,
-		       req->r_session ? req->r_session->s_mds : -1);
-		mutex_unlock(&mdsc->mutex);
-		goto out;
-	}
-
-	/* dup? */
-	if ((req->r_got_unsafe && !head->safe) ||
-	    (req->r_got_safe && head->safe)) {
-		pr_warning("got a dup %s reply on %llu from mds%d\n",
-			   head->safe ? "safe" : "unsafe", tid, mds);
-		mutex_unlock(&mdsc->mutex);
-		goto out;
-	}
-
-	result = le32_to_cpu(head->result);
-
-	/*
-	 * Tolerate 2 consecutive ESTALEs from the same mds.
-	 * FIXME: we should be looking at the cap migrate_seq.
-	 */
-	if (result == -ESTALE) {
-		req->r_direct_mode = USE_AUTH_MDS;
-		req->r_num_stale++;
-		if (req->r_num_stale <= 2) {
-			__do_request(mdsc, req);
-			mutex_unlock(&mdsc->mutex);
-			goto out;
-		}
-	} else {
-		req->r_num_stale = 0;
-	}
-
-	if (head->safe) {
-		req->r_got_safe = true;
-		__unregister_request(mdsc, req);
-		complete(&req->r_safe_completion);
-
-		if (req->r_got_unsafe) {
-			/*
-			 * We already handled the unsafe response, now do the
-			 * cleanup.  No need to examine the response; the MDS
-			 * doesn't include any result info in the safe
-			 * response.  And even if it did, there is nothing
-			 * useful we could do with a revised return value.
-			 */
-			dout("got safe reply %llu, mds%d\n", tid, mds);
-			list_del_init(&req->r_unsafe_item);
-
-			/* last unsafe request during umount? */
-			if (mdsc->stopping && !__get_oldest_tid(mdsc))
-				complete(&mdsc->safe_umount_waiters);
-			mutex_unlock(&mdsc->mutex);
-			goto out;
-		}
-	}
-
-	BUG_ON(req->r_reply);
-
-	/* first (unsafe) reply: remember it for replay on reconnect */
-	if (!head->safe) {
-		req->r_got_unsafe = true;
-		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
-	}
-
-	dout("handle_reply tid %lld result %d\n", tid, result);
-	rinfo = &req->r_reply_info;
-	err = parse_reply_info(msg, rinfo);
-	mutex_unlock(&mdsc->mutex);
-
-	/* the rest is applied to the local cache under the session mutex */
-	mutex_lock(&session->s_mutex);
-	if (err < 0) {
-		pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
-		goto out_err;
-	}
-
-	/* snap trace */
-	if (rinfo->snapblob_len) {
-		down_write(&mdsc->snap_rwsem);
-		ceph_update_snap_trace(mdsc, rinfo->snapblob,
-			       rinfo->snapblob + rinfo->snapblob_len,
-			       le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
-		downgrade_write(&mdsc->snap_rwsem);
-	} else {
-		down_read(&mdsc->snap_rwsem);
-	}
-
-	/* insert trace into our cache */
-	err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
-	if (err == 0) {
-		if (result == 0 && rinfo->dir_nr)
-			ceph_readdir_prepopulate(req, req->r_session);
-		ceph_unreserve_caps(&req->r_caps_reservation);
-	}
-
-	up_read(&mdsc->snap_rwsem);
-out_err:
-	/* record either the local error or a reference to the reply */
-	if (err) {
-		req->r_err = err;
-	} else {
-		req->r_reply = msg;
-		ceph_msg_get(msg);
-	}
-
-	add_cap_releases(mdsc, req->r_session, -1);
-	mutex_unlock(&session->s_mutex);
-
-	/* kick calling process */
-	complete_request(mdsc, req);
-out:
-	ceph_mdsc_put_request(req);
-	return;
-}
-
-
-
-/*
- * handle mds notification that our request has been forwarded.
- */
-static void handle_forward(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
-{
-	struct ceph_mds_request *req;
-	u64 tid;
-	u32 next_mds;
-	u32 fwd_seq;
-	u8 must_resend;
-	int err = -EINVAL;
-	void *p = msg->front.iov_base;
-	void *end = p + msg->front.iov_len;
-
-	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
-		goto bad;
-
-	/* tid, next_mds, fwd_seq and the trailing must_resend byte */
-	ceph_decode_need(&p, end,
-			 sizeof(u64) + 2*sizeof(u32) + sizeof(u8), bad);
-	ceph_decode_64(&p, tid);
-	ceph_decode_32(&p, next_mds);
-	ceph_decode_32(&p, fwd_seq);
-	ceph_decode_8(&p, must_resend);
-
-	WARN_ON(must_resend);  /* shouldn't happen. */
-
-	mutex_lock(&mdsc->mutex);
-	req = __lookup_request(mdsc, tid);
-	if (!req) {
-		dout("forward %llu dne\n", tid);
-		goto out;  /* dup reply? */
-	}
-
-	/*
-	 * Note: next_mds comes straight off the wire.  It must not be used
-	 * to index mdsc->sessions[] here -- the slot may be out of range
-	 * or NULL.  __do_request() resolves the session itself.
-	 */
-	if (fwd_seq <= req->r_num_fwd) {
-		dout("forward %llu to mds%d - old seq %d <= %d\n",
-		     tid, next_mds, req->r_num_fwd, fwd_seq);
-	} else {
-		/* resend. forward race not possible; mds would drop */
-		dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
-		req->r_num_fwd = fwd_seq;
-		req->r_resend_mds = next_mds;
-		put_request_session(req);
-		__do_request(mdsc, req);
-	}
-	ceph_mdsc_put_request(req);
-out:
-	mutex_unlock(&mdsc->mutex);
-	return;
-
-bad:
-	pr_err("mdsc_handle_forward decode error err=%d\n", err);
-}
-
-/*
- * handle a mds session control message
- *
- * The message front must be exactly one ceph_mds_session_head.  The
- * session ttl is refreshed first (under mdsc->mutex); the op is then
- * dispatched under s_mutex.  If the op asks for it, requests parked on
- * the session's s_waiting list are re-driven afterwards, under
- * mdsc->mutex.
- */
-static void handle_session(struct ceph_mds_session *session,
- struct ceph_msg *msg)
-{
- struct ceph_mds_client *mdsc = session->s_mdsc;
- u32 op;
- u64 seq;
- int mds;
- struct ceph_mds_session_head *h = msg->front.iov_base;
- int wake = 0;
-
- if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
- return;
- mds = le64_to_cpu(msg->hdr.src.name.num);
-
- /* decode */
- if (msg->front.iov_len != sizeof(*h))
- goto bad;
- op = le32_to_cpu(h->op);
- seq = le64_to_cpu(h->seq);
-
- mutex_lock(&mdsc->mutex);
- /* FIXME: this ttl calculation is generous */
- session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
- mutex_unlock(&mdsc->mutex);
-
- mutex_lock(&session->s_mutex);
-
- dout("handle_session mds%d %s %p state %s seq %llu\n",
- mds, ceph_session_op_name(op), session,
- session_state_name(session->s_state), seq);
-
- /* any message from a HUNG session means the mds is alive again */
- if (session->s_state == CEPH_MDS_SESSION_HUNG) {
- session->s_state = CEPH_MDS_SESSION_OPEN;
- pr_info("mds%d came back\n", session->s_mds);
- }
-
- switch (op) {
- case CEPH_SESSION_OPEN:
- session->s_state = CEPH_MDS_SESSION_OPEN;
- renewed_caps(mdsc, session, 0);
- wake = 1;
- /* opened while shutting down: ask to close again right away */
- if (mdsc->stopping)
- __close_session(mdsc, session);
- break;
-
- case CEPH_SESSION_RENEWCAPS:
- /* only honour the ack for our most recent renew request */
- if (session->s_renew_seq == seq)
- renewed_caps(mdsc, session, 1);
- break;
-
- case CEPH_SESSION_CLOSE:
- /*
- * NOTE(review): kick_requests() ends up in __do_request() and
- * __prepare_send_request(), which is documented as "called
- * under mdsc->mutex"; only s_mutex is held here -- confirm.
- */
- unregister_session(mdsc, mds);
- remove_session_caps(session);
- wake = 1; /* for good measure */
- complete(&mdsc->session_close_waiters);
- kick_requests(mdsc, mds, 0); /* cur only */
- break;
-
- case CEPH_SESSION_STALE:
- pr_info("mds%d caps went stale, renewing\n",
- session->s_mds);
- /* bump the cap generation, zero the cap ttl, then renew */
- spin_lock(&session->s_cap_lock);
- session->s_cap_gen++;
- session->s_cap_ttl = 0;
- spin_unlock(&session->s_cap_lock);
- send_renew_caps(mdsc, session);
- break;
-
- case CEPH_SESSION_RECALL_STATE:
- /* the mds asks us to shrink to at most max_caps caps */
- trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
- break;
-
- default:
- pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
- WARN_ON(1);
- }
-
- mutex_unlock(&session->s_mutex);
- if (wake) {
- mutex_lock(&mdsc->mutex);
- __wake_requests(mdsc, &session->s_waiting);
- mutex_unlock(&mdsc->mutex);
- }
- return;
-
-bad:
- pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
- (int)msg->front.iov_len);
- return;
-}
-
-
-/*
- * called under session->mutex.
- */
-static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
-				   struct ceph_mds_session *session)
-{
-	struct ceph_mds_request *req, *next;
-
-	dout("replay_unsafe_requests mds%d\n", session->s_mds);
-
-	/*
-	 * Rebuild and resend every request on the session's s_unsafe list,
-	 * i.e. those that have only received an unsafe reply.
-	 */
-	mutex_lock(&mdsc->mutex);
-	list_for_each_entry_safe(req, next, &session->s_unsafe, r_unsafe_item) {
-		if (__prepare_send_request(mdsc, req, session->s_mds))
-			continue;	/* request was completed with an error */
-		ceph_msg_get(req->r_request);
-		ceph_con_send(&session->s_con, req->r_request);
-	}
-	mutex_unlock(&mdsc->mutex);
-}
-
-/*
- * Encode information about a cap for a reconnect with the MDS.
- */
-/* cursor state shared between send_mds_reconnect() and encode_caps_cb() */
-struct encode_caps_data {
- void **pp; /* current write position; advanced after each cap */
- void *end; /* end of the message buffer */
- int *num_caps; /* running count of caps encoded so far */
-};
-
-/*
- * Append one cap's reconnect record (ino, path string, then a
- * ceph_mds_cap_reconnect) to the buffer described by @arg.
- *
- * Returns 0 on success, -ENOSPC if the buffer is too small (the shared
- * cursor is left unchanged in that case), or the error from building
- * the inode's path.  The temporary path and dentry reference are
- * released on every exit.
- */
-static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
-			  void *arg)
-{
-	struct ceph_mds_cap_reconnect *rec;
-	struct ceph_inode_info *ci = cap->ci;
-	struct encode_caps_data *data = arg;
-	void *p = *(data->pp);
-	void *end = data->end;
-	struct dentry *dentry = NULL;
-	char *path = NULL;
-	int pathlen = 0;
-	u64 pathbase = 0;	/* stays 0 when the inode has no alias */
-	int err = 0;
-
-	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
-	     inode, ceph_vinop(inode), cap, cap->cap_id,
-	     ceph_cap_string(cap->issued));
-	ceph_decode_need(&p, end, sizeof(u64), needmore);
-	ceph_encode_64(&p, ceph_ino(inode));
-
-	dentry = d_find_alias(inode);
-	if (dentry) {
-		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
-		if (IS_ERR(path)) {
-			/* report the failure instead of BUG()ing */
-			err = PTR_ERR(path);
-			path = NULL;
-			goto out;
-		}
-	}
-	ceph_decode_need(&p, end, pathlen+4, needmore);
-	ceph_encode_string(&p, end, path, pathlen);
-
-	ceph_decode_need(&p, end, sizeof(*rec), needmore);
-	rec = p;
-	p += sizeof(*rec);
-	BUG_ON(p > end);
-	spin_lock(&inode->i_lock);
-	cap->seq = 0;        /* reset cap seq */
-	cap->issue_seq = 0;  /* and issue_seq */
-	rec->cap_id = cpu_to_le64(cap->cap_id);
-	rec->pathbase = cpu_to_le64(pathbase);
-	rec->wanted = cpu_to_le32(__ceph_caps_wanted(ci));
-	rec->issued = cpu_to_le32(cap->issued);
-	rec->size = cpu_to_le64(inode->i_size);
-	ceph_encode_timespec(&rec->mtime, &inode->i_mtime);
-	ceph_encode_timespec(&rec->atime, &inode->i_atime);
-	rec->snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
-	spin_unlock(&inode->i_lock);
-
-	/* commit: only now does the caller's cursor move forward */
-	(*data->num_caps)++;
-	*(data->pp) = p;
-out:
-	kfree(path);
-	dput(dentry);
-	return err;
-
-needmore:
-	/* buffer too small; still release path/dentry via out */
-	err = -ENOSPC;
-	goto out;
-}
-
-
-/*
- * If an MDS fails and recovers, clients need to reconnect in order to
- * reestablish shared state. This includes all caps issued through
- * this session _and_ the snap_realm hierarchy. Because it's not
- * clear which snap realms the mds cares about, we send everything we
- * know about.. that ensures we'll then get any new info the
- * recovering MDS might have.
- *
- * This is a relatively heavyweight operation, but it's rare.
- *
- * called with mdsc->mutex held.
- */
-static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
-{
-	struct ceph_mds_session *session;
-	struct ceph_msg *reply;
-	int newlen, len = 4 + 1;
-	void *p, *end;
-	int err;
-	int num_caps, num_realms = 0;
-	int got;
-	int wake = 0;
-	u64 next_snap_ino = 0;
-	__le32 *pnum_caps, *pnum_realms;
-	struct encode_caps_data iter_args;
-
-	pr_info("reconnect to recovering mds%d\n", mds);
-
-	/* find session */
-	session = __ceph_lookup_mds_session(mdsc, mds);
-	mutex_unlock(&mdsc->mutex);    /* drop lock for duration */
-
-	if (session) {
-		mutex_lock(&session->s_mutex);
-
-		session->s_state = CEPH_MDS_SESSION_RECONNECTING;
-		session->s_seq = 0;
-
-		ceph_con_open(&session->s_con,
-			      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
-
-		/* replay unsafe requests */
-		replay_unsafe_requests(mdsc, session);
-
-		/* estimate needed space */
-		len += session->s_nr_caps *
-			(100+sizeof(struct ceph_mds_cap_reconnect));
-		pr_info("estimating i need %d bytes for %d caps\n",
-			len, session->s_nr_caps);
-	} else {
-		dout("no session for mds%d, will send short reconnect\n",
-		     mds);
-	}
-
-	down_read(&mdsc->snap_rwsem);
-
-retry:
-	/* build reply */
-	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, len, 0, 0, NULL);
-	if (IS_ERR(reply)) {
-		err = PTR_ERR(reply);
-		pr_err("send_mds_reconnect ENOMEM on %d for mds%d\n",
-		       len, mds);
-		goto out;
-	}
-	p = reply->front.iov_base;
-	end = p + len;
-
-	if (!session) {
-		ceph_encode_8(&p, 1); /* session was closed */
-		ceph_encode_32(&p, 0);
-		goto send;
-	}
-	dout("session %p state %s\n", session,
-	     session_state_name(session->s_state));
-
-	/* traverse this session's caps */
-	ceph_encode_8(&p, 0);
-	pnum_caps = p;
-	ceph_encode_32(&p, session->s_nr_caps);
-	num_caps = 0;
-
-	iter_args.pp = &p;
-	iter_args.end = end;
-	iter_args.num_caps = &num_caps;
-	err = iterate_session_caps(session, encode_caps_cb, &iter_args);
-	if (err == -ENOSPC)
-		goto needmore;
-	if (err < 0) {
-		/* the message was never queued, so drop our reference */
-		ceph_msg_put(reply);
-		goto out;
-	}
-	*pnum_caps = cpu_to_le32(num_caps);
-
-	/*
-	 * snaprealms.  we provide mds with the ino, seq (version), and
-	 * parent for all of our realms.  If the mds has any newer info,
-	 * it will tell us.
-	 */
-	next_snap_ino = 0;
-	/* save some space for the snaprealm count */
-	pnum_realms = p;
-	ceph_decode_need(&p, end, sizeof(*pnum_realms), needmore);
-	p += sizeof(*pnum_realms);
-	num_realms = 0;
-	while (1) {
-		struct ceph_snap_realm *realm;
-		struct ceph_mds_snaprealm_reconnect *sr_rec;
-		got = radix_tree_gang_lookup(&mdsc->snap_realms,
-					     (void **)&realm, next_snap_ino, 1);
-		if (!got)
-			break;
-
-		dout(" adding snap realm %llx seq %lld parent %llx\n",
-		     realm->ino, realm->seq, realm->parent_ino);
-		ceph_decode_need(&p, end, sizeof(*sr_rec), needmore);
-		sr_rec = p;
-		sr_rec->ino = cpu_to_le64(realm->ino);
-		sr_rec->seq = cpu_to_le64(realm->seq);
-		sr_rec->parent = cpu_to_le64(realm->parent_ino);
-		p += sizeof(*sr_rec);
-		num_realms++;
-		next_snap_ino = realm->ino + 1;
-	}
-	*pnum_realms = cpu_to_le32(num_realms);
-
-send:
-	reply->front.iov_len = p - reply->front.iov_base;
-	reply->hdr.front_len = cpu_to_le32(reply->front.iov_len);
-	dout("final len was %u (guessed %d)\n",
-	     (unsigned)reply->front.iov_len, len);
-
-	if (!session) {
-		/*
-		 * Without a session there is no connection to send on;
-		 * &session->s_con would be derived from a NULL pointer.
-		 * Drop the message rather than crash.
-		 */
-		ceph_msg_put(reply);
-		goto out;
-	}
-	ceph_con_send(&session->s_con, reply);
-	session->s_state = CEPH_MDS_SESSION_OPEN;
-	wake = 1;	/* waiters are re-driven below, under mdsc->mutex */
-
-out:
-	up_read(&mdsc->snap_rwsem);
-	if (session)
-		mutex_unlock(&session->s_mutex);
-	mutex_lock(&mdsc->mutex);
-	if (session) {
-		/*
-		 * __wake_requests() must run under mdsc->mutex, so do it
-		 * only after retaking the lock, as handle_session() does.
-		 */
-		if (wake)
-			__wake_requests(mdsc, &session->s_waiting);
-		ceph_put_mds_session(session);
-	}
-	return;
-
-needmore:
-	/*
-	 * we need a larger buffer.  this doesn't very accurately
-	 * factor in snap realms, but it's safe.
-	 * (only reachable with a non-NULL session)
-	 */
-	num_caps += num_realms;
-	newlen = len * ((100 * (session->s_nr_caps+3)) / (num_caps + 1)) / 100;
-	pr_info("i guessed %d, and did %d of %d caps, retrying with %d\n",
-		len, num_caps, session->s_nr_caps, newlen);
-	len = newlen;
-	ceph_msg_put(reply);
-	goto retry;
-}
-
-
-/*
- * compare old and new mdsmaps, kicking requests
- * and closing out old connections as necessary
- *
- * called under mdsc->mutex.
- */
-static void check_new_map(struct ceph_mds_client *mdsc,
- struct ceph_mdsmap *newmap,
- struct ceph_mdsmap *oldmap)
-{
- int i;
- int oldstate, newstate;
- struct ceph_mds_session *s;
-
- dout("check_new_map new %u old %u\n",
- newmap->m_epoch, oldmap->m_epoch);
-
- for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
- if (mdsc->sessions[i] == NULL)
- continue;
- s = mdsc->sessions[i];
- oldstate = ceph_mdsmap_get_state(oldmap, i);
- newstate = ceph_mdsmap_get_state(newmap, i);
-
- dout("check_new_map mds%d state %s -> %s (session %s)\n",
- i, ceph_mds_state_name(oldstate),
- ceph_mds_state_name(newstate),
- session_state_name(s->s_state));
-
- if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
- ceph_mdsmap_get_addr(newmap, i),
- sizeof(struct ceph_entity_addr))) {
- if (s->s_state == CEPH_MDS_SESSION_OPENING) {
- /* the session never opened, just close it
- * out now */
- __wake_requests(mdsc, &s->s_waiting);
- unregister_session(mdsc, i);
- } else {
- /* just close it */
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- mutex_lock(&mdsc->mutex);
- ceph_con_close(&s->s_con);
- mutex_unlock(&s->s_mutex);
- s->s_state = CEPH_MDS_SESSION_RESTARTING;
- }
-
- /* kick any requests waiting on the recovering mds */
- kick_requests(mdsc, i, 1);
- } else if (oldstate == newstate) {
- continue; /* nothing new with this mds */
- }
-
- /*
- * send reconnect?
- */
- if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
- newstate >= CEPH_MDS_STATE_RECONNECT)
- send_mds_reconnect(mdsc, i);
-
- /*
- * kick requests on any mds that has gone active.
- *
- * kick requests on cur or forwarder: we may have sent
- * the request to mds1, mds1 told us it forwarded it
- * to mds2, but then we learn mds1 failed and can't be
- * sure it successfully forwarded our request before
- * it died.
- */
- if (oldstate < CEPH_MDS_STATE_ACTIVE &&
- newstate >= CEPH_MDS_STATE_ACTIVE) {
- kick_requests(mdsc, i, 1);
- ceph_kick_flushing_caps(mdsc, s);
- }
- }
-}
-
-
-
-/*
- * leases
- */
-
-/*
- * caller must hold session s_mutex, dentry->d_lock
- */
-void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
-{
- struct ceph_dentry_info *di = ceph_dentry(dentry);
-
- ceph_put_mds_session(di->lease_session);
- di->lease_session = NULL;
-}
-
-static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
-{
- struct super_block *sb = mdsc->client->sb;
- struct inode *inode;
- struct ceph_mds_session *session;
- struct ceph_inode_info *ci;
- struct dentry *parent, *dentry;
- struct ceph_dentry_info *di;
- int mds;
- struct ceph_mds_lease *h = msg->front.iov_base;
- struct ceph_vino vino;
- int mask;
- struct qstr dname;
- int release = 0;
-
- if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
- return;
- mds = le64_to_cpu(msg->hdr.src.name.num);
- dout("handle_lease from mds%d\n", mds);
-
- /* decode */
- if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
- goto bad;
- vino.ino = le64_to_cpu(h->ino);
- vino.snap = CEPH_NOSNAP;
- mask = le16_to_cpu(h->mask);
- dname.name = (void *)h + sizeof(*h) + sizeof(u32);
- dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
- if (dname.len != get_unaligned_le32(h+1))
- goto bad;
-
- /* find session */
- mutex_lock(&mdsc->mutex);
- session = __ceph_lookup_mds_session(mdsc, mds);
- mutex_unlock(&mdsc->mutex);
- if (!session) {
- pr_err("handle_lease got lease but no session mds%d\n", mds);
- return;
- }
-
- mutex_lock(&session->s_mutex);
- session->s_seq++;
-
- /* lookup inode */
- inode = ceph_find_inode(sb, vino);
- dout("handle_lease '%s', mask %d, ino %llx %p\n",
- ceph_lease_op_name(h->action), mask, vino.ino, inode);
- if (inode == NULL) {
- dout("handle_lease no inode %llx\n", vino.ino);
- goto release;
- }
- ci = ceph_inode(inode);
-
- /* dentry */
- parent = d_find_alias(inode);
- if (!parent) {
- dout("no parent dentry on inode %p\n", inode);
- WARN_ON(1);
- goto release; /* hrm... */
- }
- dname.hash = full_name_hash(dname.name, dname.len);
- dentry = d_lookup(parent, &dname);
- dput(parent);
- if (!dentry)
- goto release;
-
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- switch (h->action) {
- case CEPH_MDS_LEASE_REVOKE:
- if (di && di->lease_session == session) {
- h->seq = cpu_to_le32(di->lease_seq);
- __ceph_mdsc_drop_dentry_lease(dentry);
- }
- release = 1;
- break;
-
- case CEPH_MDS_LEASE_RENEW:
- if (di && di->lease_session == session &&
- di->lease_gen == session->s_cap_gen &&
- di->lease_renew_from &&
- di->lease_renew_after == 0) {
- unsigned long duration =
- le32_to_cpu(h->duration_ms) * HZ / 1000;
-
- di->lease_seq = le32_to_cpu(h->seq);
- dentry->d_time = di->lease_renew_from + duration;
- di->lease_renew_after = di->lease_renew_from +
- (duration >> 1);
- di->lease_renew_from = 0;
- }
- break;
- }
- spin_unlock(&dentry->d_lock);
- dput(dentry);
-
- if (!release)
- goto out;
-
-release:
- /* let's just reuse the same message */
- h->action = CEPH_MDS_LEASE_REVOKE_ACK;
- ceph_msg_get(msg);
- ceph_con_send(&session->s_con, msg);
-
-out:
- iput(inode);
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- return;
-
-bad:
- pr_err("corrupt lease message\n");
-}
-
-void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
- struct inode *inode,
- struct dentry *dentry, char action,
- u32 seq)
-{
- struct ceph_msg *msg;
- struct ceph_mds_lease *lease;
- int len = sizeof(*lease) + sizeof(u32);
- int dnamelen = 0;
-
- dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
- inode, dentry, ceph_lease_op_name(action), session->s_mds);
- dnamelen = dentry->d_name.len;
- len += dnamelen;
-
- msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
- if (IS_ERR(msg))
- return;
- lease = msg->front.iov_base;
- lease->action = action;
- lease->mask = cpu_to_le16(CEPH_LOCK_DN);
- lease->ino = cpu_to_le64(ceph_vino(inode).ino);
- lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
- lease->seq = cpu_to_le32(seq);
- put_unaligned_le32(dnamelen, lease + 1);
- memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
-
- /*
- * if this is a preemptive lease RELEASE, no need to
- * flush request stream, since the actual request will
- * soon follow.
- */
- msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
-
- ceph_con_send(&session->s_con, msg);
-}
-
-/*
- * Preemptively release a lease we expect to invalidate anyway.
- * Pass @inode always, @dentry is optional.
- */
-void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
- struct dentry *dentry, int mask)
-{
- struct ceph_dentry_info *di;
- struct ceph_mds_session *session;
- u32 seq;
-
- BUG_ON(inode == NULL);
- BUG_ON(dentry == NULL);
- BUG_ON(mask != CEPH_LOCK_DN);
-
- /* is dentry lease valid? */
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- if (!di || !di->lease_session ||
- di->lease_session->s_mds < 0 ||
- di->lease_gen != di->lease_session->s_cap_gen ||
- !time_before(jiffies, dentry->d_time)) {
- dout("lease_release inode %p dentry %p -- "
- "no lease on %d\n",
- inode, dentry, mask);
- spin_unlock(&dentry->d_lock);
- return;
- }
-
- /* we do have a lease on this dentry; note mds and seq */
- session = ceph_get_mds_session(di->lease_session);
- seq = di->lease_seq;
- __ceph_mdsc_drop_dentry_lease(dentry);
- spin_unlock(&dentry->d_lock);
-
- dout("lease_release inode %p dentry %p mask %d to mds%d\n",
- inode, dentry, mask, session->s_mds);
- ceph_mdsc_lease_send_msg(session, inode, dentry,
- CEPH_MDS_LEASE_RELEASE, seq);
- ceph_put_mds_session(session);
-}
-
-/*
- * drop all leases (and dentry refs) in preparation for umount
- */
-static void drop_leases(struct ceph_mds_client *mdsc)
-{
- int i;
-
- dout("drop_leases\n");
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++) {
- struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
- if (!s)
- continue;
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- mutex_unlock(&s->s_mutex);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
-}
-
-
-
-/*
- * delayed work -- periodically trim expired leases, renew caps with mds
- */
-static void schedule_delayed(struct ceph_mds_client *mdsc)
-{
- int delay = 5;
- unsigned hz = round_jiffies_relative(HZ * delay);
- schedule_delayed_work(&mdsc->delayed_work, hz);
-}
-
-static void delayed_work(struct work_struct *work)
-{
- int i;
- struct ceph_mds_client *mdsc =
- container_of(work, struct ceph_mds_client, delayed_work.work);
- int renew_interval;
- int renew_caps;
-
- dout("mdsc delayed_work\n");
- ceph_check_delayed_caps(mdsc, 0);
-
- mutex_lock(&mdsc->mutex);
- renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
- renew_caps = time_after_eq(jiffies, HZ*renew_interval +
- mdsc->last_renew_caps);
- if (renew_caps)
- mdsc->last_renew_caps = jiffies;
-
- for (i = 0; i < mdsc->max_sessions; i++) {
- struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
- if (s == NULL)
- continue;
- if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
- dout("resending session close request for mds%d\n",
- s->s_mds);
- request_close_session(mdsc, s);
- ceph_put_mds_session(s);
- continue;
- }
- if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
- if (s->s_state == CEPH_MDS_SESSION_OPEN) {
- s->s_state = CEPH_MDS_SESSION_HUNG;
- pr_info("mds%d hung\n", s->s_mds);
- }
- }
- if (s->s_state < CEPH_MDS_SESSION_OPEN) {
- /* this mds is failed or recovering, just wait */
- ceph_put_mds_session(s);
- continue;
- }
- mutex_unlock(&mdsc->mutex);
-
- mutex_lock(&s->s_mutex);
- if (renew_caps)
- send_renew_caps(mdsc, s);
- else
- ceph_con_keepalive(&s->s_con);
- add_cap_releases(mdsc, s, -1);
- send_cap_releases(mdsc, s);
- mutex_unlock(&s->s_mutex);
- ceph_put_mds_session(s);
-
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
-
- schedule_delayed(mdsc);
-}
-
-
-void ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
-{
- mdsc->client = client;
- mutex_init(&mdsc->mutex);
- mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
- init_completion(&mdsc->safe_umount_waiters);
- init_completion(&mdsc->session_close_waiters);
- INIT_LIST_HEAD(&mdsc->waiting_for_map);
- mdsc->sessions = NULL;
- mdsc->max_sessions = 0;
- mdsc->stopping = 0;
- init_rwsem(&mdsc->snap_rwsem);
- INIT_RADIX_TREE(&mdsc->snap_realms, GFP_NOFS);
- INIT_LIST_HEAD(&mdsc->snap_empty);
- spin_lock_init(&mdsc->snap_empty_lock);
- mdsc->last_tid = 0;
- INIT_RADIX_TREE(&mdsc->request_tree, GFP_NOFS);
- INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
- mdsc->last_renew_caps = jiffies;
- INIT_LIST_HEAD(&mdsc->cap_delay_list);
- spin_lock_init(&mdsc->cap_delay_lock);
- INIT_LIST_HEAD(&mdsc->snap_flush_list);
- spin_lock_init(&mdsc->snap_flush_lock);
- mdsc->cap_flush_seq = 0;
- INIT_LIST_HEAD(&mdsc->cap_dirty);
- mdsc->num_cap_flushing = 0;
- spin_lock_init(&mdsc->cap_dirty_lock);
- init_waitqueue_head(&mdsc->cap_flushing_wq);
- spin_lock_init(&mdsc->dentry_lru_lock);
- INIT_LIST_HEAD(&mdsc->dentry_lru);
-}
-
-/*
- * Wait for safe replies on open mds requests. If we time out, drop
- * all requests from the tree to avoid dangling dentry refs.
- */
-static void wait_requests(struct ceph_mds_client *mdsc)
-{
- struct ceph_mds_request *req;
- struct ceph_client *client = mdsc->client;
-
- mutex_lock(&mdsc->mutex);
- if (__get_oldest_tid(mdsc)) {
- mutex_unlock(&mdsc->mutex);
- dout("wait_requests waiting for requests\n");
- wait_for_completion_timeout(&mdsc->safe_umount_waiters,
- client->mount_args.mount_timeout * HZ);
- mutex_lock(&mdsc->mutex);
-
- /* tear down remaining requests */
- while (radix_tree_gang_lookup(&mdsc->request_tree,
- (void **)&req, 0, 1)) {
- dout("wait_requests timed out on tid %llu\n",
- req->r_tid);
- radix_tree_delete(&mdsc->request_tree, req->r_tid);
- ceph_mdsc_put_request(req);
- }
- }
- mutex_unlock(&mdsc->mutex);
- dout("wait_requests done\n");
-}
-
-/*
- * called before mount is ro, and before dentries are torn down.
- * (hmm, does this still race with new lookups?)
- */
-void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
-{
- dout("pre_umount\n");
- mdsc->stopping = 1;
-
- drop_leases(mdsc);
- ceph_check_delayed_caps(mdsc, 1);
- wait_requests(mdsc);
-}
-
-/*
- * wait for all write mds requests to flush.
- */
-static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
-{
- struct ceph_mds_request *req;
- u64 next_tid = 0;
- int got;
-
- mutex_lock(&mdsc->mutex);
- dout("wait_unsafe_requests want %lld\n", want_tid);
- while (1) {
- got = radix_tree_gang_lookup(&mdsc->request_tree, (void **)&req,
- next_tid, 1);
- if (!got)
- break;
- if (req->r_tid > want_tid)
- break;
-
- next_tid = req->r_tid + 1;
- if ((req->r_op & CEPH_MDS_OP_WRITE) == 0)
- continue; /* not a write op */
-
- ceph_mdsc_get_request(req);
- mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests wait on %llu (want %llu)\n",
- req->r_tid, want_tid);
- wait_for_completion(&req->r_safe_completion);
- mutex_lock(&mdsc->mutex);
- ceph_mdsc_put_request(req);
- }
- mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests done\n");
-}
-
-void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
-{
- u64 want_tid, want_flush;
-
- dout("sync\n");
- mutex_lock(&mdsc->mutex);
- want_tid = mdsc->last_tid;
- want_flush = mdsc->cap_flush_seq;
- mutex_unlock(&mdsc->mutex);
- dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
-
- ceph_check_delayed_caps(mdsc, 1);
-
- wait_unsafe_requests(mdsc, want_tid);
- wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
-}
-
-
-/*
- * called after sb is ro.
- */
-void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
-{
- struct ceph_mds_session *session;
- int i;
- int n;
- struct ceph_client *client = mdsc->client;
- unsigned long started, timeout = client->mount_args.mount_timeout * HZ;
-
- dout("close_sessions\n");
-
- mutex_lock(&mdsc->mutex);
-
- /* close sessions */
- started = jiffies;
- while (time_before(jiffies, started + timeout)) {
- dout("closing sessions\n");
- n = 0;
- for (i = 0; i < mdsc->max_sessions; i++) {
- session = __ceph_lookup_mds_session(mdsc, i);
- if (!session)
- continue;
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&session->s_mutex);
- __close_session(mdsc, session);
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- mutex_lock(&mdsc->mutex);
- n++;
- }
- if (n == 0)
- break;
-
- if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
- break;
-
- dout("waiting for sessions to close\n");
- mutex_unlock(&mdsc->mutex);
- wait_for_completion_timeout(&mdsc->session_close_waiters,
- timeout);
- mutex_lock(&mdsc->mutex);
- }
-
- /* tear down remaining sessions */
- for (i = 0; i < mdsc->max_sessions; i++) {
- if (mdsc->sessions[i]) {
- session = get_session(mdsc->sessions[i]);
- unregister_session(mdsc, i);
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&session->s_mutex);
- remove_session_caps(session);
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- mutex_lock(&mdsc->mutex);
- }
- }
-
- WARN_ON(!list_empty(&mdsc->cap_delay_list));
-
- mutex_unlock(&mdsc->mutex);
-
- ceph_cleanup_empty_realms(mdsc);
-
- cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
-
- dout("stopped\n");
-}
-
-void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
-{
- dout("stop\n");
- cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
- if (mdsc->mdsmap)
- ceph_mdsmap_destroy(mdsc->mdsmap);
- kfree(mdsc->sessions);
-}
-
-
-/*
- * handle mds map update.
- */
-void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
-{
- u32 epoch;
- u32 maplen;
- void *p = msg->front.iov_base;
- void *end = p + msg->front.iov_len;
- struct ceph_mdsmap *newmap, *oldmap;
- struct ceph_fsid fsid;
- int err = -EINVAL;
-
- ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
- ceph_decode_copy(&p, &fsid, sizeof(fsid));
- if (ceph_fsid_compare(&fsid, &mdsc->client->monc.monmap->fsid)) {
- pr_err("got mdsmap with wrong fsid\n");
- return;
- }
- ceph_decode_32(&p, epoch);
- ceph_decode_32(&p, maplen);
- dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
-
- /* do we need it? */
- ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
- mutex_lock(&mdsc->mutex);
- if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
- dout("handle_map epoch %u <= our %u\n",
- epoch, mdsc->mdsmap->m_epoch);
- mutex_unlock(&mdsc->mutex);
- return;
- }
-
- newmap = ceph_mdsmap_decode(&p, end);
- if (IS_ERR(newmap)) {
- err = PTR_ERR(newmap);
- goto bad_unlock;
- }
-
- /* swap into place */
- if (mdsc->mdsmap) {
- oldmap = mdsc->mdsmap;
- mdsc->mdsmap = newmap;
- check_new_map(mdsc, newmap, oldmap);
- ceph_mdsmap_destroy(oldmap);
- } else {
- mdsc->mdsmap = newmap; /* first mds map */
- }
- mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
-
- __wake_requests(mdsc, &mdsc->waiting_for_map);
-
- mutex_unlock(&mdsc->mutex);
- schedule_delayed(mdsc);
- return;
-
-bad_unlock:
- mutex_unlock(&mdsc->mutex);
-bad:
- pr_err("error decoding mdsmap %d\n", err);
- return;
-}
-
-static struct ceph_connection *con_get(struct ceph_connection *con)
-{
- struct ceph_mds_session *s = con->private;
-
- if (get_session(s)) {
- dout("mdsc con_get %p %d -> %d\n", s,
- atomic_read(&s->s_ref) - 1, atomic_read(&s->s_ref));
- return con;
- }
- dout("mdsc con_get %p FAIL\n", s);
- return NULL;
-}
-
-static void con_put(struct ceph_connection *con)
-{
- struct ceph_mds_session *s = con->private;
-
- dout("mdsc con_put %p %d -> %d\n", s, atomic_read(&s->s_ref),
- atomic_read(&s->s_ref) - 1);
- ceph_put_mds_session(s);
-}
-
-/*
- * if the client is unresponsive for long enough, the mds will kill
- * the session entirely.
- */
-static void peer_reset(struct ceph_connection *con)
-{
- struct ceph_mds_session *s = con->private;
-
- pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
- s->s_mds);
-}
-
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
- struct ceph_mds_session *s = con->private;
- struct ceph_mds_client *mdsc = s->s_mdsc;
- int type = le16_to_cpu(msg->hdr.type);
-
- switch (type) {
- case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(mdsc, msg);
- break;
- case CEPH_MSG_CLIENT_SESSION:
- handle_session(s, msg);
- break;
- case CEPH_MSG_CLIENT_REPLY:
- handle_reply(s, msg);
- break;
- case CEPH_MSG_CLIENT_REQUEST_FORWARD:
- handle_forward(mdsc, msg);
- break;
- case CEPH_MSG_CLIENT_CAPS:
- ceph_handle_caps(s, msg);
- break;
- case CEPH_MSG_CLIENT_SNAP:
- ceph_handle_snap(mdsc, msg);
- break;
- case CEPH_MSG_CLIENT_LEASE:
- handle_lease(mdsc, msg);
- break;
-
- default:
- pr_err("received unknown message type %d %s\n", type,
- ceph_msg_type_name(type));
- }
- ceph_msg_put(msg);
-}
-
-const static struct ceph_connection_operations mds_con_ops = {
- .get = con_get,
- .put = con_put,
- .dispatch = dispatch,
- .peer_reset = peer_reset,
- .alloc_msg = ceph_alloc_msg,
- .alloc_middle = ceph_alloc_middle,
-};
-
-
-
-
-/* eof */
+++ /dev/null
-#ifndef _FS_CEPH_MDS_CLIENT_H
-#define _FS_CEPH_MDS_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/radix-tree.h>
-#include <linux/spinlock.h>
-
-#include "types.h"
-#include "messenger.h"
-#include "mdsmap.h"
-
-/*
- * Some lock dependencies:
- *
- * session->s_mutex
- * mdsc->mutex
- *
- * mdsc->snap_rwsem
- *
- * inode->i_lock
- * mdsc->snap_flush_lock
- * mdsc->cap_delay_lock
- *
- */
-
-struct ceph_client;
-struct ceph_cap;
-
-/*
- * parsed info about a single inode. pointers are into the encoded
- * on-wire structures within the mds reply message payload.
- */
-struct ceph_mds_reply_info_in {
- struct ceph_mds_reply_inode *in;
- u32 symlink_len;
- char *symlink;
- u32 xattr_len;
- char *xattr_data;
-};
-
-/*
- * parsed info about an mds reply, including information about the
- * target inode and/or its parent directory and dentry, and directory
- * contents (for readdir results).
- */
-struct ceph_mds_reply_info_parsed {
- struct ceph_mds_reply_head *head;
-
- struct ceph_mds_reply_info_in diri, targeti;
- struct ceph_mds_reply_dirfrag *dirfrag;
- char *dname;
- u32 dname_len;
- struct ceph_mds_reply_lease *dlease;
-
- struct ceph_mds_reply_dirfrag *dir_dir;
- int dir_nr;
- char **dir_dname;
- u32 *dir_dname_len;
- struct ceph_mds_reply_lease **dir_dlease;
- struct ceph_mds_reply_info_in *dir_in;
- u8 dir_complete, dir_end;
-
- /* encoded blob describing snapshot contexts for certain
- operations (e.g., open) */
- void *snapblob;
- int snapblob_len;
-};
-
-
-/*
- * cap releases are batched and sent to the MDS en masse.
- */
-#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
- sizeof(struct ceph_mds_cap_release)) / \
- sizeof(struct ceph_mds_cap_item))
-
-
-/*
- * state associated with each MDS<->client session
- */
-enum {
- CEPH_MDS_SESSION_NEW = 1,
- CEPH_MDS_SESSION_OPENING = 2,
- CEPH_MDS_SESSION_OPEN = 3,
- CEPH_MDS_SESSION_HUNG = 4,
- CEPH_MDS_SESSION_CLOSING = 5,
- CEPH_MDS_SESSION_RESTARTING = 6,
- CEPH_MDS_SESSION_RECONNECTING = 7,
-};
-
-struct ceph_mds_session {
- struct ceph_mds_client *s_mdsc;
- int s_mds;
- int s_state;
- unsigned long s_ttl; /* time until mds kills us */
- u64 s_seq; /* incoming msg seq # */
- struct mutex s_mutex; /* serialize session messages */
-
- struct ceph_connection s_con;
-
- /* protected by s_cap_lock */
- spinlock_t s_cap_lock;
- u32 s_cap_gen; /* inc each time we get mds stale msg */
- unsigned long s_cap_ttl; /* when session caps expire */
- struct list_head s_caps; /* all caps issued by this session */
- int s_nr_caps, s_trim_caps;
- int s_num_cap_releases;
- struct list_head s_cap_releases; /* waiting cap_release messages */
- struct list_head s_cap_releases_done; /* ready to send */
-
- /* protected by mutex */
- struct list_head s_cap_flushing; /* inodes w/ flushing caps */
- struct list_head s_cap_snaps_flushing;
- unsigned long s_renew_requested; /* last time we sent a renew req */
- u64 s_renew_seq;
-
- atomic_t s_ref;
- struct list_head s_waiting; /* waiting requests */
- struct list_head s_unsafe; /* unsafe requests */
-};
-
-/*
- * modes of choosing which MDS to send a request to
- */
-enum {
- USE_ANY_MDS,
- USE_RANDOM_MDS,
- USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
-};
-
-struct ceph_mds_request;
-struct ceph_mds_client;
-
-/*
- * request completion callback
- */
-typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req);
-
-/*
- * an in-flight mds request
- */
-struct ceph_mds_request {
- u64 r_tid; /* transaction id */
-
- int r_op; /* mds op code */
- int r_mds;
-
- /* operation on what? */
- struct inode *r_inode; /* arg1 */
- struct dentry *r_dentry; /* arg1 */
- struct dentry *r_old_dentry; /* arg2: rename from or link from */
- char *r_path1, *r_path2;
- struct ceph_vino r_ino1, r_ino2;
-
- struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
- struct inode *r_target_inode; /* resulting inode */
-
- union ceph_mds_request_args r_args;
- int r_fmode; /* file mode, if expecting cap */
-
- /* for choosing which mds to send this request to */
- int r_direct_mode;
- u32 r_direct_hash; /* choose dir frag based on this dentry hash */
- bool r_direct_is_hash; /* true if r_direct_hash is valid */
-
- /* data payload is used for xattr ops */
- struct page **r_pages;
- int r_num_pages;
- int r_data_len;
-
- /* what caps shall we drop? */
- int r_inode_drop, r_inode_unless;
- int r_dentry_drop, r_dentry_unless;
- int r_old_dentry_drop, r_old_dentry_unless;
- struct inode *r_old_inode;
- int r_old_inode_drop, r_old_inode_unless;
-
- struct ceph_msg *r_request; /* original request */
- struct ceph_msg *r_reply;
- struct ceph_mds_reply_info_parsed r_reply_info;
- int r_err;
-
- unsigned long r_timeout; /* optional. jiffies */
- unsigned long r_started; /* start time to measure timeout against */
- unsigned long r_request_started; /* start time for mds request only,
- used to measure lease durations */
-
- /* link unsafe requests to parent directory, for fsync */
- struct inode *r_unsafe_dir;
- struct list_head r_unsafe_dir_item;
-
- struct ceph_mds_session *r_session;
-
- int r_attempts; /* resend attempts */
- int r_num_fwd; /* number of forward attempts */
- int r_num_stale;
- int r_resend_mds; /* mds to resend to next, if any*/
-
- atomic_t r_ref;
- struct list_head r_wait;
- struct completion r_completion;
- struct completion r_safe_completion;
- ceph_mds_request_callback_t r_callback;
- struct list_head r_unsafe_item; /* per-session unsafe list item */
- bool r_got_unsafe, r_got_safe;
-
- bool r_did_prepopulate;
- u32 r_readdir_offset;
-
- struct ceph_cap_reservation r_caps_reservation;
- int r_num_caps;
-};
-
-/*
- * mds client state
- */
-struct ceph_mds_client {
- struct ceph_client *client;
- struct mutex mutex; /* all nested structures */
-
- struct ceph_mdsmap *mdsmap;
- struct completion safe_umount_waiters, session_close_waiters;
- struct list_head waiting_for_map;
-
- struct ceph_mds_session **sessions; /* NULL for mds if no session */
- int max_sessions; /* len of s_mds_sessions */
- int stopping; /* true if shutting down */
-
- /*
- * snap_rwsem will cover cap linkage into snaprealms, and
- * realm snap contexts. (later, we can do per-realm snap
- * contexts locks..) the empty list contains realms with no
- * references (implying they contain no inodes with caps) that
- * should be destroyed.
- */
- struct rw_semaphore snap_rwsem;
- struct radix_tree_root snap_realms;
- struct list_head snap_empty;
- spinlock_t snap_empty_lock; /* protect snap_empty */
-
- u64 last_tid; /* most recent mds request */
- struct radix_tree_root request_tree; /* pending mds requests */
- struct delayed_work delayed_work; /* delayed work */
- unsigned long last_renew_caps; /* last time we renewed our caps */
- struct list_head cap_delay_list; /* caps with delayed release */
- spinlock_t cap_delay_lock; /* protects cap_delay_list */
- struct list_head snap_flush_list; /* cap_snaps ready to flush */
- spinlock_t snap_flush_lock;
-
- u64 cap_flush_seq;
- struct list_head cap_dirty; /* inodes with dirty caps */
- int num_cap_flushing; /* # caps we are flushing */
- spinlock_t cap_dirty_lock; /* protects above items */
- wait_queue_head_t cap_flushing_wq;
-
- struct dentry *debugfs_file;
-
- spinlock_t dentry_lru_lock;
- struct list_head dentry_lru;
- int num_dentry;
-};
-
-extern const char *ceph_mds_op_name(int op);
-
-extern struct ceph_mds_session *
-__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
-
-static inline struct ceph_mds_session *
-ceph_get_mds_session(struct ceph_mds_session *s)
-{
- atomic_inc(&s->s_ref);
- return s;
-}
-
-extern void ceph_put_mds_session(struct ceph_mds_session *s);
-
-extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg, int mds);
-
-extern void ceph_mdsc_init(struct ceph_mds_client *mdsc,
- struct ceph_client *client);
-extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
-
-extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
-
-extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
- struct inode *inode,
- struct dentry *dn, int mask);
-
-extern struct ceph_mds_request *
-ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
-extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req);
-extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
- struct inode *dir,
- struct ceph_mds_request *req);
-static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
-{
- atomic_inc(&req->r_ref);
-}
-extern void ceph_mdsc_put_request(struct ceph_mds_request *req);
-
-extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
-
-extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
- int stop_on_nosnap);
-
-extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
-extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
- struct inode *inode,
- struct dentry *dentry, char action,
- u32 seq);
-
-extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg);
-
-#endif
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/bug.h>
-#include <linux/err.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-
-#include "mdsmap.h"
-#include "messenger.h"
-#include "decode.h"
-
-#include "super.h"
-
-
-/*
- * choose a random mds that is "up" (i.e. has a state > 0), or -1.
- */
-int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
-{
- int n = 0;
- int i;
- char r;
-
- /* count */
- for (i = 0; i < m->m_max_mds; i++)
- if (m->m_info[i].state > 0)
- n++;
- if (n == 0)
- return -1;
-
- /* pick */
- get_random_bytes(&r, 1);
- n = r % n;
- i = 0;
- for (i = 0; n > 0; i++, n--)
- while (m->m_info[i].state <= 0)
- i++;
-
- return i;
-}
-
-/*
- * Decode an MDS map
- *
- * Ignore any fields we don't care about (there are quite a few of
- * them).
- */
-/*
- * Decode an mdsmap from the buffer at *p, which is bounded by end.
- *
- * Only the fields the client uses are kept.  A per-mds record is stored
- * in m_info[] when its rank is in [0, m_max_mds) and its state is > 0;
- * other records are parsed and skipped.
- *
- * Returns the newly allocated map (free it with ceph_mdsmap_destroy()),
- * ERR_PTR(-ENOMEM) if an allocation fails, or ERR_PTR(-EINVAL) if the
- * buffer is truncated or malformed.
- */
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
-{
-	struct ceph_mdsmap *m;
-	int i, j, n;
-	int err = -EINVAL;
-	u16 version;
-
-	m = kzalloc(sizeof(*m), GFP_NOFS);
-	if (m == NULL)
-		return ERR_PTR(-ENOMEM);
-
-	ceph_decode_16_safe(p, end, version, bad);
-
-	/* seven u32 fields, one u64, plus the u32 mds_info count below */
-	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
-	ceph_decode_32(p, m->m_epoch);
-	ceph_decode_32(p, m->m_client_epoch);
-	ceph_decode_32(p, m->m_last_failure);
-	ceph_decode_32(p, m->m_root);
-	ceph_decode_32(p, m->m_session_timeout);
-	ceph_decode_32(p, m->m_session_autoclose);
-	ceph_decode_64(p, m->m_max_file_size);
-	ceph_decode_32(p, m->m_max_mds);
-
-	m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
-	if (m->m_info == NULL) {
-		/*
-		 * Free directly: ceph_mdsmap_destroy() walks m_info[] for
-		 * m_max_mds entries and must not be handed a NULL array.
-		 */
-		kfree(m);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	/* pick out active nodes from mds_info (state > 0) */
-	ceph_decode_32(p, n);
-	for (i = 0; i < n; i++) {
-		u32 namelen;
-		s32 mds, inc, state;
-		u64 state_seq;
-		u8 infoversion;
-		struct ceph_entity_addr addr;
-		u32 num_export_targets;
-		void *pexport_targets = NULL;
-
-		ceph_decode_need(p, end, sizeof(addr) + 1 + sizeof(u32), bad);
-		*p += sizeof(addr);	/* skip addr key */
-		ceph_decode_8(p, infoversion);
-		ceph_decode_32(p, namelen);	/* skip mds name */
-		ceph_decode_need(p, end, namelen, bad);
-		*p += namelen;
-
-		ceph_decode_need(p, end,
-				 5*sizeof(u32) + sizeof(u64) +
-				 sizeof(addr) + sizeof(struct ceph_timespec),
-				 bad);
-		ceph_decode_32(p, mds);
-		ceph_decode_32(p, inc);
-		ceph_decode_32(p, state);
-		ceph_decode_64(p, state_seq);
-		ceph_decode_copy(p, &addr, sizeof(addr));
-		*p += sizeof(struct ceph_timespec);
-		*p += sizeof(u32);
-		ceph_decode_32_safe(p, end, namelen, bad);
-		/* skip namelen bytes of string, not sizeof(namelen) */
-		ceph_decode_need(p, end, namelen, bad);
-		*p += namelen;
-		if (infoversion >= 2) {
-			ceph_decode_32_safe(p, end, num_export_targets, bad);
-			/* the targets are read below; make sure they exist */
-			ceph_decode_need(p, end,
-					 num_export_targets * sizeof(u32),
-					 bad);
-			pexport_targets = *p;
-			*p += num_export_targets * sizeof(u32);
-		} else {
-			num_export_targets = 0;
-		}
-
-		dout("mdsmap_decode %d/%d mds%d.%d %s %s\n",
-		     i+1, n, mds, inc, pr_addr(&addr.in_addr),
-		     ceph_mds_state_name(state));
-		if (mds >= 0 && mds < m->m_max_mds && state > 0) {
-			m->m_info[mds].state = state;
-			m->m_info[mds].addr = addr;
-			/* a rank listed twice must not leak the first array */
-			kfree(m->m_info[mds].export_targets);
-			m->m_info[mds].export_targets = NULL;
-			m->m_info[mds].num_export_targets = num_export_targets;
-			if (num_export_targets) {
-				m->m_info[mds].export_targets =
-					kcalloc(num_export_targets, sizeof(u32),
-						GFP_NOFS);
-				if (m->m_info[mds].export_targets == NULL)
-					goto badmem;
-				for (j = 0; j < num_export_targets; j++)
-					ceph_decode_32(&pexport_targets,
-					      m->m_info[mds].export_targets[j]);
-			}
-		}
-	}
-
-	/* pg_pools */
-	ceph_decode_32_safe(p, end, n, bad);
-	m->m_num_data_pg_pools = n;
-	m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
-	if (!m->m_data_pg_pools)
-		goto badmem;
-	ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
-	for (i = 0; i < n; i++)
-		ceph_decode_32(p, m->m_data_pg_pools[i]);
-	ceph_decode_32(p, m->m_cas_pg_pool);
-
-	/* ok, we don't care about the rest. */
-	dout("mdsmap_decode success epoch %u\n", m->m_epoch);
-	return m;
-
-badmem:
-	err = -ENOMEM;
-	goto out_err;
-bad:
-	pr_err("corrupt mdsmap\n");
-out_err:
-	ceph_mdsmap_destroy(m);
-	return ERR_PTR(err);
-}
-
-/*
- * Free an mdsmap and everything it owns: each per-mds export_targets
- * array, the m_info array, the data pg pool array and the map itself.
- *
- * Safe to call on a partially built map whose m_info was never
- * allocated.
- */
-void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
-{
-	int i;
-
-	/* m_info may be NULL if decoding failed before it was allocated */
-	if (m->m_info) {
-		for (i = 0; i < m->m_max_mds; i++)
-			kfree(m->m_info[i].export_targets);
-	}
-	kfree(m->m_info);
-	kfree(m->m_data_pg_pools);
-	kfree(m);
-}
+++ /dev/null
-#ifndef _FS_CEPH_MDSMAP_H
-#define _FS_CEPH_MDSMAP_H
-
-#include "types.h"
-
-/*
- * mds map - describe servers in the mds cluster.
- *
- * we limit fields to those the client actually cares about
- */
-struct ceph_mds_info {
- struct ceph_entity_addr addr; /* address decoded from the map for this mds */
- s32 state; /* state value decoded from the map */
- int num_export_targets; /* number of entries in export_targets */
- u32 *export_targets; /* allocated array, or NULL when there are none */
-};
-
-struct ceph_mdsmap {
- u32 m_epoch, m_client_epoch, m_last_failure;
- u32 m_root;
- u32 m_session_timeout; /* seconds */
- u32 m_session_autoclose; /* seconds */
- u64 m_max_file_size;
- u32 m_max_mds; /* number of entries in m_info */
- struct ceph_mds_info *m_info; /* per-mds records, indexed by mds number */
-
- /* which object pools file data can be stored in */
- int m_num_data_pg_pools; /* number of entries in m_data_pg_pools */
- u32 *m_data_pg_pools;
- u32 m_cas_pg_pool;
-};
-
-/*
- * Look up the address recorded for mds w.  Returns NULL when w is not
- * below m_max_mds.
- */
-static inline struct ceph_entity_addr *
-ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
-{
-	if (w < m->m_max_mds)
-		return &m->m_info[w].addr;
-	return NULL;
-}
-
-/*
- * Return the recorded state of mds w, or CEPH_MDS_STATE_DNE when w is
- * not below m_max_mds.  A negative w is a caller bug.
- */
-static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
-{
-	BUG_ON(w < 0);
-	if (w < m->m_max_mds)
-		return m->m_info[w].state;
-	return CEPH_MDS_STATE_DNE;
-}
-
-extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); /* NOTE(review): defined outside this header; presumably picks an mds at random -- confirm */
-extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); /* returns a new map or an ERR_PTR() */
-extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); /* frees the map and its arrays */
-
-#endif
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/crc32c.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/inet.h>
-#include <linux/kthread.h>
-#include <linux/net.h>
-#include <linux/socket.h>
-#include <linux/string.h>
-#include <net/tcp.h>
-
-#include "super.h"
-#include "messenger.h"
-
-/*
- * Ceph uses the messenger to exchange ceph_msg messages with other
- * hosts in the system. The messenger provides ordered and reliable
- * delivery. We tolerate TCP disconnects by reconnecting (with
- * exponential backoff) in the case of a fault (disconnection, bad
- * crc, protocol error). Acks allow sent messages to be discarded by
- * the sender.
- */
-
-/* static tag bytes (protocol control messages) */
-static char tag_msg = CEPH_MSGR_TAG_MSG; /* sent immediately before a message header */
-static char tag_ack = CEPH_MSGR_TAG_ACK; /* sent immediately before a 64-bit ack sequence */
-static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; /* sent on its own as a keepalive */
-
-
-static void queue_con(struct ceph_connection *con); /* defined later in the file; called here to schedule work for con */
-static void con_work(struct work_struct *); /* work handler installed by ceph_con_init() */
-static void ceph_fault(struct ceph_connection *con); /* defined later in the file */
-
-/*
- * Translate a CEPH_ENTITY_TYPE_* value into a short printable name.
- * Unknown values yield "???".
- */
-const char *ceph_name_type_str(int t)
-{
-	if (t == CEPH_ENTITY_TYPE_MON)
-		return "mon";
-	if (t == CEPH_ENTITY_TYPE_MDS)
-		return "mds";
-	if (t == CEPH_ENTITY_TYPE_OSD)
-		return "osd";
-	if (t == CEPH_ENTITY_TYPE_CLIENT)
-		return "client";
-	if (t == CEPH_ENTITY_TYPE_ADMIN)
-		return "admin";
-	return "???";
-}
-
-/*
- * nicely render a sockaddr as a string.
- *
- * The result lives in one of MAX_ADDR_STR static buffers that are handed
- * out round-robin, so a returned string is only valid until pr_addr()
- * has been called MAX_ADDR_STR more times.
- *
- * MAX_ADDR_STR_LEN must hold the longest rendering: eight 4-digit IPv6
- * groups with separators (39), ':' and a 5-digit port, plus the NUL.
- */
-#define MAX_ADDR_STR 20
-#define MAX_ADDR_STR_LEN 64
-static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
-static DEFINE_SPINLOCK(addr_str_lock);
-static int last_addr_str;
-
-const char *pr_addr(const struct sockaddr_storage *ss)
-{
-	int i;
-	char *s;
-	struct sockaddr_in *in4 = (void *)ss;
-	unsigned char *quad = (void *)&in4->sin_addr.s_addr;
-	struct sockaddr_in6 *in6 = (void *)ss;
-
-	/* claim the next buffer slot; only the index needs the lock */
-	spin_lock(&addr_str_lock);
-	i = last_addr_str++;
-	if (last_addr_str == MAX_ADDR_STR)
-		last_addr_str = 0;
-	spin_unlock(&addr_str_lock);
-	s = addr_str[i];
-
-	switch (ss->ss_family) {
-	case AF_INET:
-		snprintf(s, MAX_ADDR_STR_LEN, "%u.%u.%u.%u:%u",
-			 (unsigned int)quad[0],
-			 (unsigned int)quad[1],
-			 (unsigned int)quad[2],
-			 (unsigned int)quad[3],
-			 (unsigned int)ntohs(in4->sin_port));
-		break;
-
-	case AF_INET6:
-		/* address groups are stored big-endian; convert for display */
-		snprintf(s, MAX_ADDR_STR_LEN,
-			 "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
-			 ntohs(in6->sin6_addr.s6_addr16[0]),
-			 ntohs(in6->sin6_addr.s6_addr16[1]),
-			 ntohs(in6->sin6_addr.s6_addr16[2]),
-			 ntohs(in6->sin6_addr.s6_addr16[3]),
-			 ntohs(in6->sin6_addr.s6_addr16[4]),
-			 ntohs(in6->sin6_addr.s6_addr16[5]),
-			 ntohs(in6->sin6_addr.s6_addr16[6]),
-			 ntohs(in6->sin6_addr.s6_addr16[7]),
-			 (unsigned int)ntohs(in6->sin6_port));
-		break;
-
-	default:
-		snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)",
-			 (int)ss->ss_family);
-	}
-
-	return s;
-}
-
-/*
- * work queue for all reading and writing to/from the socket.
- */
-struct workqueue_struct *ceph_msgr_wq;
-
-/*
- * Create the messenger workqueue.  Returns 0 on success or -ENOMEM if
- * the workqueue could not be created.
- */
-int __init ceph_msgr_init(void)
-{
-	ceph_msgr_wq = create_workqueue("ceph-msgr");
-	/* create_workqueue() reports failure with NULL, not an ERR_PTR */
-	if (!ceph_msgr_wq) {
-		pr_err("msgr_init failed to create workqueue\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-void ceph_msgr_exit(void)
-{
- destroy_workqueue(ceph_msgr_wq); /* tear down the workqueue created by ceph_msgr_init() */
-}
-
-/*
- * socket callback functions
- */
-
-/*
- * sk_data_ready callback: data is available on the socket.  Queue work
- * for the owning connection unless the socket is in TCP_CLOSE_WAIT.
- */
-static void ceph_data_ready(struct sock *sk, int count_unused)
-{
-	struct ceph_connection *con = sk->sk_user_data;
-
-	if (sk->sk_state == TCP_CLOSE_WAIT)
-		return;
-
-	dout("ceph_data_ready on %p state = %lu, queueing work\n",
-	     con, con->state);
-	queue_con(con);
-}
-
-/*
- * sk_write_space callback: the socket has buffer space for writing.
- * Work is queued only when the connection has WRITE_PENDING set.
- */
-static void ceph_write_space(struct sock *sk)
-{
-	struct ceph_connection *con = sk->sk_user_data;
-
-	if (!test_bit(WRITE_PENDING, &con->state)) {
-		dout("ceph_write_space %p nothing to write\n", con);
-	} else {
-		/* there is data we want to write; hand it to the workqueue */
-		dout("ceph_write_space %p queueing write work\n", con);
-		queue_con(con);
-	}
-
-	/* we supply our own write_space, so clear SOCK_NOSPACE ourselves */
-	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-}
-
-/*
- * sk_state_change callback.  Ignored once the connection is CLOSED.
- * On TCP_CLOSE/TCP_CLOSE_WAIT the first caller to set SOCK_CLOSED
- * records an error message and queues work; on TCP_ESTABLISHED work is
- * queued unconditionally.  Other states are ignored.
- */
-static void ceph_state_change(struct sock *sk)
-{
-	struct ceph_connection *con = sk->sk_user_data;
-	int tcp_state;
-
-	dout("ceph_state_change %p state = %lu sk_state = %u\n",
-	     con, con->state, sk->sk_state);
-
-	if (test_bit(CLOSED, &con->state))
-		return;
-
-	tcp_state = sk->sk_state;
-	if (tcp_state == TCP_ESTABLISHED) {
-		dout("ceph_state_change TCP_ESTABLISHED\n");
-		queue_con(con);
-		return;
-	}
-	if (tcp_state != TCP_CLOSE && tcp_state != TCP_CLOSE_WAIT)
-		return;
-
-	/* TCP_CLOSE logs both lines, exactly as the switch fallthrough did */
-	if (tcp_state == TCP_CLOSE)
-		dout("ceph_state_change TCP_CLOSE\n");
-	dout("ceph_state_change TCP_CLOSE_WAIT\n");
-
-	if (test_and_set_bit(SOCK_CLOSED, &con->state) != 0)
-		return;
-	if (test_bit(CONNECTING, &con->state))
-		con->error_msg = "connection failed";
-	else
-		con->error_msg = "socket closed";
-	queue_con(con);
-}
-
-/*
- * Point the socket's user data at con and install our data_ready,
- * write_space and state_change callbacks.
- */
-static void set_sock_callbacks(struct socket *sock,
-			       struct ceph_connection *con)
-{
-	sock->sk->sk_user_data = con;
-	sock->sk->sk_data_ready = ceph_data_ready;
-	sock->sk->sk_write_space = ceph_write_space;
-	sock->sk->sk_state_change = ceph_state_change;
-}
-
-
-/*
- * socket helpers
- */
-
-/*
- * initiate connection to a remote socket.
- *
- * Creates a TCP socket of the peer address's family, installs our
- * callbacks and starts a non-blocking connect to con->peer_addr.
- * -EINPROGRESS from connect counts as success.
- *
- * Returns the new socket (also stored in con->sock) or an ERR_PTR().
- * On a connect failure the socket is released, con->sock is reset to
- * NULL and con->error_msg is set.
- */
-static struct socket *ceph_tcp_connect(struct ceph_connection *con)
-{
-	struct sockaddr_storage *ss = &con->peer_addr.in_addr;
-	struct socket *sock;
-	int ret;
-
-	BUG_ON(con->sock);
-	/* use the peer's family so IPv6 peers get an AF_INET6 socket */
-	ret = sock_create_kern(ss->ss_family, SOCK_STREAM, IPPROTO_TCP, &sock);
-	if (ret)
-		return ERR_PTR(ret);
-	con->sock = sock;
-	sock->sk->sk_allocation = GFP_NOFS;
-
-	set_sock_callbacks(sock, con);
-
-	dout("connect %s\n", pr_addr(ss));
-
-	/* pass the full storage size; sizeof(struct sockaddr) is too short for IPv6 */
-	ret = sock->ops->connect(sock, (struct sockaddr *)ss, sizeof(*ss),
-				 O_NONBLOCK);
-	if (ret == -EINPROGRESS) {
-		dout("connect %s EINPROGRESS sk_state = %u\n",
-		     pr_addr(ss),
-		     sock->sk->sk_state);
-		ret = 0;
-	}
-	if (ret < 0) {
-		pr_err("connect %s error %d\n",
-		       pr_addr(ss), ret);
-		sock_release(sock);
-		con->sock = NULL;
-		con->error_msg = "connect error";
-		return ERR_PTR(ret);
-	}
-
-	return sock;
-}
-
-/*
- * Non-blocking receive of up to len bytes into buf.  Returns whatever
- * kernel_recvmsg() returns.
- */
-static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
-{
-	struct msghdr hdr = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-	struct kvec vec = { .iov_base = buf, .iov_len = len };
-
-	return kernel_recvmsg(sock, &hdr, &vec, 1, len, hdr.msg_flags);
-}
-
-/*
- * Non-blocking send of kvlen kvecs totalling len bytes.  A non-zero
- * @more means the caller will be sending more data shortly, so
- * MSG_MORE is set; otherwise MSG_EOR is set.  Returns whatever
- * kernel_sendmsg() returns.
- */
-static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
-			    size_t kvlen, size_t len, int more)
-{
-	struct msghdr hdr = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-
-	/* MSG_EOR is superfluous, but what the hell */
-	hdr.msg_flags |= more ? MSG_MORE : MSG_EOR;
-
-	return kernel_sendmsg(sock, &hdr, iov, kvlen, len);
-}
-
-
-/*
- * Shut down and release the connection's socket, if it has one.
- * SOCK_CLOSED is held set across the shutdown and cleared afterwards.
- * Returns 0 when there was no socket, else the shutdown() result.
- */
-static int con_close_socket(struct ceph_connection *con)
-{
-	struct socket *sock = con->sock;
-	int rc = 0;
-
-	dout("con_close_socket on %p sock %p\n", con, sock);
-	if (sock) {
-		set_bit(SOCK_CLOSED, &con->state);
-		rc = sock->ops->shutdown(sock, SHUT_RDWR);
-		sock_release(sock);
-		con->sock = NULL;
-		clear_bit(SOCK_CLOSED, &con->state);
-	}
-	return rc;
-}
-
-/*
- * Reset a connection. Discard all incoming and outgoing messages
- * and clear *_seq state.
- */
-/* Unlink msg from whatever list it is on and drop one reference to it. */
-static void ceph_msg_remove(struct ceph_msg *msg)
-{
-	struct list_head *link = &msg->list_head;
-
-	list_del_init(link);
-	ceph_msg_put(msg);
-}
-/* Remove (and put) every message on the list at head, front first. */
-static void ceph_msg_remove_list(struct list_head *head)
-{
-	struct ceph_msg *first;
-
-	for (;;) {
-		if (list_empty(head))
-			break;
-		first = list_first_entry(head, struct ceph_msg, list_head);
-		ceph_msg_remove(first);
-	}
-}
-
-/*
- * Under out_mutex: drop every queued and sent-but-unacked message,
- * forget the message being written, and zero connect_seq, out_seq and
- * in_seq.
- */
-static void reset_connection(struct ceph_connection *con)
-{
-	mutex_lock(&con->out_mutex);
-
-	/* discard existing out_queue and out_sent contents */
-	ceph_msg_remove_list(&con->out_queue);
-	ceph_msg_remove_list(&con->out_sent);
-
-	/* reset sequence state */
-	con->out_msg = NULL;
-	con->in_seq = 0;
-	con->out_seq = 0;
-	con->connect_seq = 0;
-
-	mutex_unlock(&con->out_mutex);
-}
-
-/*
- * Mark a peer down: flag the connection CLOSED, clear STANDBY, reset
- * its message and sequence state, and queue work for it.
- */
-void ceph_con_close(struct ceph_connection *con)
-{
-	dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
-
-	/* CLOSED first, in case there's queued work */
-	set_bit(CLOSED, &con->state);
-	/* dropping STANDBY avoids a connect_seq bump */
-	clear_bit(STANDBY, &con->state);
-
-	reset_connection(con);
-	queue_con(con);
-}
-
-/*
- * Final cleanup of connection state: reset it, mark it DEAD and close
- * its socket.  The socket-close result is deliberately ignored.
- */
-void ceph_con_shutdown(struct ceph_connection *con)
-{
-	dout("con_shutdown %p\n", con);
-
-	reset_connection(con);
-	set_bit(DEAD, &con->state);
-	(void)con_close_socket(con);
-}
-
-/*
- * Reopen a closed connection with a new peer address: set OPENING,
- * clear CLOSED, copy *addr into con->peer_addr and queue work.
- */
-void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
-{
-	dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
-
-	set_bit(OPENING, &con->state);
-	clear_bit(CLOSED, &con->state);
-
-	memcpy(&con->peer_addr, addr, sizeof(*addr));
-
-	queue_con(con);
-}
-
-/*
- * generic get/put
- */
-/*
- * Take a reference on con.  Returns con, or NULL if its refcount had
- * already reached zero.
- */
-struct ceph_connection *ceph_con_get(struct ceph_connection *con)
-{
-	dout("con_get %p nref = %d -> %d\n", con,
-	     atomic_read(&con->nref), atomic_read(&con->nref) + 1);
-
-	if (!atomic_inc_not_zero(&con->nref))
-		return NULL;
-	return con;
-}
-
-/*
- * Drop a reference on con.  The last put shuts the connection down and
- * frees it with kfree().
- */
-void ceph_con_put(struct ceph_connection *con)
-{
-	dout("con_put %p nref = %d -> %d\n", con,
-	     atomic_read(&con->nref), atomic_read(&con->nref) - 1);
-	BUG_ON(atomic_read(&con->nref) == 0);
-
-	if (!atomic_dec_and_test(&con->nref))
-		return;
-
-	ceph_con_shutdown(con);
-	kfree(con);
-}
-
-/*
- * Initialize a new connection: zero it, give it a single reference,
- * attach it to msgr, and set up its mutex, message lists and delayed
- * work (handled by con_work).
- */
-void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
-{
-	dout("con_init %p\n", con);
-
-	memset(con, 0, sizeof(*con));
-	con->msgr = msgr;
-	atomic_set(&con->nref, 1);
-
-	INIT_LIST_HEAD(&con->out_queue);
-	INIT_LIST_HEAD(&con->out_sent);
-	mutex_init(&con->out_mutex);
-	INIT_DELAYED_WORK(&con->work, con_work);
-}
-
-
-/*
- * We maintain a global counter to order connection attempts.  Under
- * global_seq_lock, raise the counter to at least @gt, increment it, and
- * return the new value (which is therefore greater than @gt).
- */
-static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
-{
-	u32 seq;
-
-	spin_lock(&msgr->global_seq_lock);
-	seq = msgr->global_seq;
-	if (seq < gt)
-		seq = gt;
-	seq++;
-	msgr->global_seq = seq;
-	spin_unlock(&msgr->global_seq_lock);
-
-	return seq;
-}
-
-
-/*
- * Append the footer of the current outgoing message at out_kvec[v] and
- * finish the message off.  Assumes the earlier out_kvec entries and
- * counters are already valid; this only adds to the end.
- */
-static void prepare_write_message_footer(struct ceph_connection *con, int v)
-{
-	struct ceph_msg *msg = con->out_msg;
-	struct kvec *slot = &con->out_kvec[v];
-
-	dout("prepare_write_message_footer %p\n", con);
-	con->out_kvec_is_msg = true;
-
-	slot->iov_base = &msg->footer;
-	slot->iov_len = sizeof(msg->footer);
-	con->out_kvec_left++;
-	con->out_kvec_bytes += sizeof(msg->footer);
-
-	con->out_more = msg->more_to_follow;
-	con->out_msg = NULL;	/* we're done with this one */
-}
-
-/*
- * Prepare headers for the next outgoing message.
- *
- * Takes the first message on out_queue (the list is not checked for
- * emptiness here), moves it to out_sent, assigns it the next out_seq,
- * and fills out_kvec[] with: an optional ack, the message tag, header,
- * front and (if present) middle.  Header, front and middle crcs are
- * computed here; data_crc starts at 0.  If the message has no data
- * payload the footer is queued as well; otherwise the page iterator is
- * initialized and the footer is left for write_partial_msg_pages().
- * Sets WRITE_PENDING.
- */
-static void prepare_write_message(struct ceph_connection *con)
-{
-	struct ceph_msg *m;
-	int v = 0;
-
-	con->out_kvec_bytes = 0;
-	con->out_kvec_is_msg = true;
-
-	/* Sneak an ack in there first?  If we can get it into the same
-	 * TCP packet that's a good thing. */
-	if (con->in_seq > con->in_seq_acked) {
-		con->in_seq_acked = con->in_seq;
-		con->out_kvec[v].iov_base = &tag_ack;
-		con->out_kvec[v++].iov_len = 1;
-		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-		con->out_kvec[v].iov_base = &con->out_temp_ack;
-		con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
-		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
-	}
-
-	/* move message to sending/sent list */
-	m = list_first_entry(&con->out_queue,
-		       struct ceph_msg, list_head);
-	list_move_tail(&m->list_head, &con->out_sent);
-	con->out_msg = m;   /* we don't bother taking a reference here. */
-
-	m->hdr.seq = cpu_to_le64(++con->out_seq);
-
-	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
-	     m, con->out_seq, le16_to_cpu(m->hdr.type),
-	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
-	     le32_to_cpu(m->hdr.data_len),
-	     m->nr_pages);
-	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
-
-	/* tag + hdr + front + middle */
-	con->out_kvec[v].iov_base = &tag_msg;
-	con->out_kvec[v++].iov_len = 1;
-	con->out_kvec[v].iov_base = &m->hdr;
-	con->out_kvec[v++].iov_len = sizeof(m->hdr);
-	con->out_kvec[v++] = m->front;
-	if (m->middle)
-		con->out_kvec[v++] = m->middle->vec;
-	con->out_kvec_left = v;
-	con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
-		(m->middle ? m->middle->vec.iov_len : 0);
-	con->out_kvec_cur = con->out_kvec;
-
-	/* fill in crc (except data pages), footer */
-	con->out_msg->hdr.crc =
-		cpu_to_le32(crc32c(0, (void *)&m->hdr,
-				      sizeof(m->hdr) - sizeof(m->hdr.crc)));
-	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
-	con->out_msg->footer.front_crc =
-		cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
-	if (m->middle)
-		con->out_msg->footer.middle_crc =
-			cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
-					   m->middle->vec.iov_len));
-	else
-		con->out_msg->footer.middle_crc = 0;
-	con->out_msg->footer.data_crc = 0;
-	/* the second value printed is the middle crc; label it as such */
-	dout("prepare_write_message front_crc %u middle_crc %u\n",
-	     le32_to_cpu(con->out_msg->footer.front_crc),
-	     le32_to_cpu(con->out_msg->footer.middle_crc));
-
-	/* is there a data payload? */
-	if (le32_to_cpu(m->hdr.data_len) > 0) {
-		/* initialize page iterator */
-		con->out_msg_pos.page = 0;
-		con->out_msg_pos.page_pos =
-			le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
-		con->out_msg_pos.data_pos = 0;
-		con->out_msg_pos.did_page_crc = 0;
-		con->out_more = 1;  /* data + footer will follow */
-	} else {
-		/* no, queue up footer too and be done */
-		prepare_write_message_footer(con, v);
-	}
-
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Prepare a standalone ack: the ack tag byte followed by the
- * little-endian 64-bit in_seq, which also becomes in_seq_acked.
- * Sets WRITE_PENDING.
- */
-static void prepare_write_ack(struct ceph_connection *con)
-{
-	struct kvec *kv = con->out_kvec;
-
-	dout("prepare_write_ack %p %llu -> %llu\n", con,
-	     con->in_seq_acked, con->in_seq);
-	con->in_seq_acked = con->in_seq;
-	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-
-	kv[0].iov_base = &tag_ack;
-	kv[0].iov_len = 1;
-	kv[1].iov_base = &con->out_temp_ack;
-	kv[1].iov_len = sizeof(con->out_temp_ack);
-
-	con->out_kvec_cur = kv;
-	con->out_kvec_left = 2;
-	con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
-	con->out_more = 1;  /* more will follow.. eventually.. */
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Prepare to write the single keepalive tag byte.  Sets WRITE_PENDING.
- */
-static void prepare_write_keepalive(struct ceph_connection *con)
-{
-	struct kvec *kv = con->out_kvec;
-
-	dout("prepare_write_keepalive %p\n", con);
-
-	kv[0].iov_base = &tag_keepalive;
-	kv[0].iov_len = 1;
-
-	con->out_kvec_cur = kv;
-	con->out_kvec_left = 1;
-	con->out_kvec_bytes = 1;
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Connection negotiation.
- */
-
-/*
- * We connected to a peer and are saying hello: queue the banner, our
- * address and the connect request.  The protocol version is chosen
- * from the peer's entity type (mon, osd or mds; anything else is a
- * BUG).  Sets WRITE_PENDING.
- */
-static void prepare_write_connect(struct ceph_messenger *msgr,
-				  struct ceph_connection *con)
-{
-	int len = strlen(CEPH_BANNER);
-	unsigned global_seq = get_global_seq(con->msgr, 0);
-	int proto;
-	struct kvec *kv = con->out_kvec;
-
-	switch (con->peer_name.type) {
-	case CEPH_ENTITY_TYPE_MDS:
-		proto = CEPH_MDSC_PROTOCOL;
-		break;
-	case CEPH_ENTITY_TYPE_OSD:
-		proto = CEPH_OSDC_PROTOCOL;
-		break;
-	case CEPH_ENTITY_TYPE_MON:
-		proto = CEPH_MONC_PROTOCOL;
-		break;
-	default:
-		BUG();
-	}
-
-	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
-	     con->connect_seq, global_seq, proto);
-
-	/* fill in the connect request */
-	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
-	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
-	con->out_connect.global_seq = cpu_to_le32(global_seq);
-	con->out_connect.protocol_version = cpu_to_le32(proto);
-	con->out_connect.flags = 0;
-	if (test_bit(LOSSYTX, &con->state))
-		con->out_connect.flags = CEPH_MSG_CONNECT_LOSSY;
-
-	/* banner, then our address, then the request itself */
-	kv[0].iov_base = CEPH_BANNER;
-	kv[0].iov_len = len;
-	kv[1].iov_base = &msgr->inst.addr;
-	kv[1].iov_len = sizeof(msgr->inst.addr);
-	kv[2].iov_base = &con->out_connect;
-	kv[2].iov_len = sizeof(con->out_connect);
-
-	con->out_kvec_left = 3;
-	con->out_kvec_bytes = len + sizeof(msgr->inst.addr) +
-		sizeof(con->out_connect);
-	con->out_kvec_cur = kv;
-	con->out_more = 0;
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Resend only the connect request, with the current connect_seq and a
- * fresh global_seq.  No banner or address this time.  Sets
- * WRITE_PENDING.
- */
-static void prepare_write_connect_retry(struct ceph_messenger *msgr,
-					struct ceph_connection *con)
-{
-	struct kvec *kv = con->out_kvec;
-
-	dout("prepare_write_connect_retry %p\n", con);
-	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
-	con->out_connect.global_seq =
-		cpu_to_le32(get_global_seq(con->msgr, 0));
-
-	kv[0].iov_base = &con->out_connect;
-	kv[0].iov_len = sizeof(con->out_connect);
-
-	con->out_kvec_cur = kv;
-	con->out_kvec_left = 1;
-	con->out_kvec_bytes = sizeof(con->out_connect);
-	con->out_more = 0;
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-
-/*
- * write as much of pending kvecs to the socket as we can.
- *  1 -> done
- *  0 -> socket full, but more to do
- * <0 -> error
- *
- * After a short write, out_kvec_cur/out_kvec_left are advanced past
- * the fully sent kvecs and the partially sent one is trimmed in place.
- */
-static int write_partial_kvec(struct ceph_connection *con)
-{
-	int ret;
-
-	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
-	while (con->out_kvec_bytes > 0) {
-		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
-				       con->out_kvec_left, con->out_kvec_bytes,
-				       con->out_more);
-		if (ret <= 0)
-			goto out;
-		con->out_kvec_bytes -= ret;
-		if (con->out_kvec_bytes == 0)
-			break;            /* done */
-
-		/* step over every kvec that went out completely */
-		while (ret > 0 && ret >= con->out_kvec_cur->iov_len) {
-			ret -= con->out_kvec_cur->iov_len;
-			con->out_kvec_cur++;
-			con->out_kvec_left--;
-		}
-		/* trim the one that went out only in part */
-		if (ret > 0) {
-			con->out_kvec_cur->iov_len -= ret;
-			con->out_kvec_cur->iov_base += ret;
-		}
-	}
-	con->out_kvec_left = 0;
-	con->out_kvec_is_msg = false;
-	ret = 1;
-out:
-	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
-	     con->out_kvec_bytes, con->out_kvec_left, ret);
-	return ret;  /* done! */
-}
-
-/*
- * Write as much message data payload as we can.  If we finish, queue
- * up the footer.
- *  1 -> done, footer is now queued in out_kvec[].
- *  0 -> socket full, but more to do
- * <0 -> error
- *
- * The data crc is accumulated page by page into footer.data_crc unless
- * the messenger's nocrc option is set, in which case the footer is
- * flagged CEPH_MSG_FOOTER_NOCRC instead.
- */
-static int write_partial_msg_pages(struct ceph_connection *con)
-{
-	struct ceph_msg *msg = con->out_msg;
-	unsigned data_len = le32_to_cpu(msg->hdr.data_len);
-	size_t len;
-	/* nocrc *disables* the data crc; computing it is the default */
-	int crc = !con->msgr->nocrc;
-	int ret;
-
-	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
-	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
-	     con->out_msg_pos.page_pos);
-
-	while (con->out_msg_pos.page < con->out_msg->nr_pages) {
-		struct page *page = NULL;
-		void *kaddr = NULL;
-
-		/*
-		 * if we are calculating the data crc (the default), we need
-		 * to map the page.  if our pages[] has been revoked, use the
-		 * zero page.
-		 */
-		if (msg->pages) {
-			page = msg->pages[con->out_msg_pos.page];
-			if (crc)
-				kaddr = kmap(page);
-		} else {
-			page = con->msgr->zero_page;
-			if (crc)
-				kaddr = page_address(con->msgr->zero_page);
-		}
-		len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
-			  (int)(data_len - con->out_msg_pos.data_pos));
-		/* crc the rest of this page once, even if it takes several sends */
-		if (crc && !con->out_msg_pos.did_page_crc) {
-			void *base = kaddr + con->out_msg_pos.page_pos;
-			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
-
-			BUG_ON(kaddr == NULL);
-			con->out_msg->footer.data_crc =
-				cpu_to_le32(crc32c(tmpcrc, base, len));
-			con->out_msg_pos.did_page_crc = 1;
-		}
-
-		ret = kernel_sendpage(con->sock, page,
-				      con->out_msg_pos.page_pos, len,
-				      MSG_DONTWAIT | MSG_NOSIGNAL |
-				      MSG_MORE);
-
-		if (crc && msg->pages)
-			kunmap(page);
-
-		if (ret <= 0)
-			goto out;
-
-		con->out_msg_pos.data_pos += ret;
-		con->out_msg_pos.page_pos += ret;
-		if (ret == len) {
-			/* finished this page; move on to the next */
-			con->out_msg_pos.page_pos = 0;
-			con->out_msg_pos.page++;
-			con->out_msg_pos.did_page_crc = 0;
-		}
-	}
-
-	dout("write_partial_msg_pages %p msg %p done\n", con, msg);
-
-	/* prepare and queue up footer, too */
-	if (!crc)
-		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
-	con->out_kvec_bytes = 0;
-	con->out_kvec_left = 0;
-	con->out_kvec_cur = con->out_kvec;
-	prepare_write_message_footer(con, 0);
-	ret = 1;
-out:
-	return ret;
-}
-
-/*
- * write some zeros: send con->out_skip bytes from the messenger's zero
- * page, at most one page per send.
- *  1 -> done
- *  0 or <0 -> the send result that stopped us
- */
-static int write_partial_skip(struct ceph_connection *con)
-{
-	int ret;
-
-	for (;;) {
-		struct kvec zeros;
-
-		if (con->out_skip <= 0)
-			break;
-
-		zeros.iov_base = page_address(con->msgr->zero_page);
-		zeros.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE);
-
-		ret = ceph_tcp_sendmsg(con->sock, &zeros, 1, zeros.iov_len, 1);
-		if (ret <= 0)
-			return ret;
-		con->out_skip -= ret;
-	}
-	return 1;
-}
-
-/*
- * Prepare to read the connection handshake: restart the input
- * position at zero.
- */
-static void prepare_read_connect(struct ceph_connection *con)
-{
-	dout("prepare_read_connect %p\n", con);
-
-	con->in_base_pos = 0;
-}
-
-/* Prepare to read an ack: restart the input position at zero. */
-static void prepare_read_ack(struct ceph_connection *con)
-{
-	dout("prepare_read_ack %p\n", con);
-
-	con->in_base_pos = 0;
-}
-
-/*
- * Prepare to read the next tag byte: mark the current tag as
- * CEPH_MSGR_TAG_READY and restart the input position at zero.
- */
-static void prepare_read_tag(struct ceph_connection *con)
-{
-	dout("prepare_read_tag %p\n", con);
-
-	con->in_tag = CEPH_MSGR_TAG_READY;
-	con->in_base_pos = 0;
-}
-
-/*
- * Prepare to read a message: restart the input position and zero the
- * running front/middle/data crcs.  There must be no message already in
- * progress.  Always returns 0.
- */
-static int prepare_read_message(struct ceph_connection *con)
-{
-	dout("prepare_read_message %p\n", con);
-	BUG_ON(con->in_msg != NULL);
-
-	con->in_base_pos = 0;
-	con->in_data_crc = 0;
-	con->in_middle_crc = 0;
-	con->in_front_crc = 0;
-	return 0;
-}
-
-
-/*
- * Read one fixed-size piece of a multi-part input sequence.
- *
- * *to is advanced by size to the cumulative end offset of this piece;
- * con->in_base_pos tracks how much of the whole sequence has arrived.
- * Whatever part of object is still missing is read from the socket.
- *
- * Returns 1 once the piece is complete, otherwise the (<= 0) result of
- * the receive that stopped us.
- */
-static int read_partial(struct ceph_connection *con,
-			int *to, int size, void *object)
-{
-	*to += size;
-	for (;;) {
-		int missing, got, n;
-
-		if (con->in_base_pos >= *to)
-			break;
-		missing = *to - con->in_base_pos;
-		got = size - missing;
-		n = ceph_tcp_recvmsg(con->sock, object + got, missing);
-		if (n <= 0)
-			return n;
-		con->in_base_pos += n;
-	}
-	return 1;
-}
-
-
-/*
- * Read all or part of the connect-side handshake on a new connection:
- * the peer's banner, its actual address, the address it sees for us,
- * and the connect reply, in that order.
- *
- * Returns 1 when everything has been read, otherwise the (<= 0) result
- * from read_partial().
- */
-static int read_partial_connect(struct ceph_connection *con)
-{
-	int end_off = 0;
-	int ret;
-
-	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
-
-	/* peer's banner */
-	ret = read_partial(con, &end_off, strlen(CEPH_BANNER), con->in_banner);
-	if (ret <= 0)
-		return ret;
-
-	/* who the peer says it is */
-	ret = read_partial(con, &end_off, sizeof(con->actual_peer_addr),
-			   &con->actual_peer_addr);
-	if (ret <= 0)
-		return ret;
-
-	/* what the peer sees as our address */
-	ret = read_partial(con, &end_off, sizeof(con->peer_addr_for_me),
-			   &con->peer_addr_for_me);
-	if (ret <= 0)
-		return ret;
-
-	/* the connect reply itself */
-	ret = read_partial(con, &end_off, sizeof(con->in_reply),
-			   &con->in_reply);
-	if (ret <= 0)
-		return ret;
-
-	dout("read_partial_connect %p connect_seq = %u, global_seq = %u\n",
-	     con, le32_to_cpu(con->in_reply.connect_seq),
-	     le32_to_cpu(con->in_reply.global_seq));
-	return ret;
-}
-
-/*
- * Verify the hello banner looks okay.  Returns 0 if the received
- * banner matches CEPH_BANNER; otherwise logs, sets con->error_msg and
- * returns -1.
- */
-static int verify_hello(struct ceph_connection *con)
-{
-	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER)) == 0)
-		return 0;
-
-	pr_err("connect to/from %s has bad banner\n",
-	       pr_addr(&con->peer_addr.in_addr));
-	con->error_msg = "protocol error, bad banner";
-	return -1;
-}
-
-/*
- * True if ss holds an all-zero IPv4 or IPv6 address.  Any other
- * address family is reported as not blank.
- */
-static bool addr_is_blank(struct sockaddr_storage *ss)
-{
-	if (ss->ss_family == AF_INET) {
-		struct sockaddr_in *in4 = (struct sockaddr_in *)ss;
-
-		return in4->sin_addr.s_addr == 0;
-	}
-	if (ss->ss_family == AF_INET6) {
-		struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)ss;
-		int k;
-
-		for (k = 0; k < 4; k++)
-			if (in6->sin6_addr.s6_addr32[k] != 0)
-				return false;
-		return true;
-	}
-	return false;
-}
-
-/*
- * Return the port stored in ss, in host byte order, or 0 for an
- * address family other than AF_INET/AF_INET6.
- *
- * Host order matches addr_set_port(), which applies htons() to its
- * argument; returning the raw network-order field would byte-swap the
- * port when the two are used together.
- */
-static int addr_port(struct sockaddr_storage *ss)
-{
-	switch (ss->ss_family) {
-	case AF_INET:
-		return ntohs(((struct sockaddr_in *)ss)->sin_port);
-	case AF_INET6:
-		return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
-	}
-	return 0;
-}
-
-/*
- * Store port p (host byte order) into ss according to its address
- * family.  Other families are left untouched.
- */
-static void addr_set_port(struct sockaddr_storage *ss, int p)
-{
-	switch (ss->ss_family) {
-	case AF_INET:
-		((struct sockaddr_in *)ss)->sin_port = htons(p);
-		break;	/* do not fall through and write the IPv6 field too */
-	case AF_INET6:
-		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
-		break;
-	}
-}
-
-/*
- * Parse an ip[:port] list into an addr array.  Use the default
- * monitor port if a port isn't specified.
- *
- * @c, @end:   the text to parse is [c, end); it need not be NUL
- *             terminated
- * @addr:      output array with room for @max_count entries
- * @max_count: capacity of @addr
- * @count:     if non-NULL, receives the number of addresses parsed
- *
- * Entries are separated by ','.  Returns 0 on success, or -EINVAL for
- * a malformed address, a port of 0 or above 65535, a trailing comma,
- * or more entries than @max_count.
- */
-int ceph_parse_ips(const char *c, const char *end,
-		   struct ceph_entity_addr *addr,
-		   int max_count, int *count)
-{
-	int i;
-	const char *p = c;
-
-	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
-	for (i = 0; i < max_count; i++) {
-		const char *ipend;
-		struct sockaddr_storage *ss = &addr[i].in_addr;
-		struct sockaddr_in *in4 = (void *)ss;
-		struct sockaddr_in6 *in6 = (void *)ss;
-		int port;
-
-		memset(ss, 0, sizeof(*ss));
-		if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
-			     ',', &ipend)) {
-			ss->ss_family = AF_INET;
-		} else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
-				    ',', &ipend)) {
-			ss->ss_family = AF_INET6;
-		} else {
-			goto bad;
-		}
-		p = ipend;
-
-		/* port? */
-		if (p < end && *p == ':') {
-			port = 0;
-			p++;
-			while (p < end && *p >= '0' && *p <= '9') {
-				port = (port * 10) + (*p - '0');
-				/* reject early so a long digit run can't overflow */
-				if (port > 65535)
-					goto bad;
-				p++;
-			}
-			if (port == 0)
-				goto bad;
-		} else {
-			port = CEPH_MON_PORT;
-		}
-
-		addr_set_port(ss, port);
-
-		dout("parse_ips got %s\n", pr_addr(ss));
-
-		if (p == end)
-			break;
-		if (*p != ',')
-			goto bad;
-		p++;
-	}
-
-	/*
-	 * i == max_count means the loop ran out of slots instead of
-	 * stopping at the end of input (e.g. a trailing ','), so addr[i]
-	 * was never filled in.
-	 */
-	if (p != end || i >= max_count)
-		goto bad;
-
-	if (count)
-		*count = i + 1;
-	return 0;
-
-bad:
-	/* the input is bounded by end, not by a NUL */
-	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
-	return -EINVAL;
-}
-
-/*
- * Act on the peer's reply to our connect request.
- *
- * Checks the banner and that the peer's address is the one we meant to
- * reach, adopts the address the peer reports for us if ours is still
- * blank, then dispatches on the reply tag: version mismatch and
- * unknown/WAIT tags fail; the reset/retry tags queue another connect
- * attempt; READY completes the handshake.
- *
- * The peer's reply is read into con->in_reply (see
- * read_partial_connect()), so every peer-supplied value is taken from
- * there.
- *
- * Returns 0 on success or when a retry was queued, -1 on error (with
- * con->error_msg set).
- */
-static int process_connect(struct ceph_connection *con)
-{
-	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
-
-	if (verify_hello(con) < 0)
-		return -1;
-
-	/*
-	 * Make sure the other end is who we wanted.  note that the other
-	 * end may not yet know their ip address, so if it's 0.0.0.0, give
-	 * them the benefit of the doubt.
-	 */
-	if (!ceph_entity_addr_is_local(&con->peer_addr,
-				       &con->actual_peer_addr) &&
-	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
-	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
-		pr_err("wrong peer, want %s/%d, "
-		       "got %s/%d, wtf\n",
-		       pr_addr(&con->peer_addr.in_addr),
-		       con->peer_addr.nonce,
-		       pr_addr(&con->actual_peer_addr.in_addr),
-		       con->actual_peer_addr.nonce);
-		con->error_msg = "protocol error, wrong peer";
-		return -1;
-	}
-
-	/*
-	 * did we learn our address?
-	 */
-	if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
-		/* keep our own port; only the ip comes from the peer */
-		int port = addr_port(&con->msgr->inst.addr.in_addr);
-
-		memcpy(&con->msgr->inst.addr.in_addr,
-		       &con->peer_addr_for_me.in_addr,
-		       sizeof(con->peer_addr_for_me.in_addr));
-		addr_set_port(&con->msgr->inst.addr.in_addr, port);
-		dout("process_connect learned my addr is %s\n",
-		     pr_addr(&con->msgr->inst.addr.in_addr));
-	}
-
-	switch (con->in_reply.tag) {
-	case CEPH_MSGR_TAG_BADPROTOVER:
-		dout("process_connect got BADPROTOVER my %d != their %d\n",
-		     le32_to_cpu(con->out_connect.protocol_version),
-		     le32_to_cpu(con->in_reply.protocol_version));
-		pr_err("%s%lld %s protocol version mismatch,"
-		       " my %d != server's %d\n",
-		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr),
-		       le32_to_cpu(con->out_connect.protocol_version),
-		       le32_to_cpu(con->in_reply.protocol_version));
-		con->error_msg = "protocol version mismatch";
-		if (con->ops->bad_proto)
-			con->ops->bad_proto(con);
-		reset_connection(con);
-		set_bit(CLOSED, &con->state);  /* in case there's queued work */
-		return -1;
-
-
-	case CEPH_MSGR_TAG_RESETSESSION:
-		/*
-		 * If we connected with a large connect_seq but the peer
-		 * has no record of a session with us (no connection, or
-		 * connect_seq == 0), they will send RESETSESSION to indicate
-		 * that they must have reset their session, and may have
-		 * dropped messages.
-		 */
-		dout("process_connect got RESET peer seq %u\n",
-		     le32_to_cpu(con->in_reply.connect_seq));
-		pr_err("%s%lld %s connection reset\n",
-		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr));
-		reset_connection(con);
-		prepare_write_connect_retry(con->msgr, con);
-		prepare_read_connect(con);
-
-		/* Tell ceph about it. */
-		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
-		if (con->ops->peer_reset)
-			con->ops->peer_reset(con);
-		break;
-
-	case CEPH_MSGR_TAG_RETRY_SESSION:
-		/*
-		 * If we sent a smaller connect_seq than the peer has, try
-		 * again with a larger value.
-		 */
-		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
-		     le32_to_cpu(con->out_connect.connect_seq),
-		     le32_to_cpu(con->in_reply.connect_seq));
-		con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
-		prepare_write_connect_retry(con->msgr, con);
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_RETRY_GLOBAL:
-		/*
-		 * If we sent a smaller global_seq than the peer has, try
-		 * again with a larger value.
-		 */
-		dout("process_connect got RETRY_GLOBAL my %u, peer_gseq = %u\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_reply.global_seq));
-		get_global_seq(con->msgr,
-			       le32_to_cpu(con->in_reply.global_seq));
-		prepare_write_connect_retry(con->msgr, con);
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_READY:
-		clear_bit(CONNECTING, &con->state);
-		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-			set_bit(LOSSYRX, &con->state);
-		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
-		con->connect_seq++;
-		dout("process_connect got READY gseq %d cseq %d (%d)\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_reply.connect_seq),
-		     con->connect_seq);
-		WARN_ON(con->connect_seq !=
-			le32_to_cpu(con->in_reply.connect_seq));
-
-		con->delay = 0;  /* reset backoff memory */
-		prepare_read_tag(con);
-		break;
-
-	case CEPH_MSGR_TAG_WAIT:
-		/*
-		 * If there is a connection race (we are opening
-		 * connections to each other), one of us may just have
-		 * to WAIT.  This shouldn't happen if we are the
-		 * client.
-		 */
-		pr_err("process_connect peer connecting WAIT\n");
-		/* fall through: treated as a protocol error */
-
-	default:
-		pr_err("connect protocol error, will retry\n");
-		con->error_msg = "protocol error, garbage tag during connect";
-		return -1;
-	}
-	return 0;
-}
-
-
-/*
- * read_partial_ack - read (part of) the ack value that follows an ACK tag
- * @con: connection being read
- *
- * Asks read_partial() to fill con->in_temp_ack, using a running end
- * offset that starts at zero.  The result of read_partial() is returned
- * unchanged; try_read() stops for now when it is <= 0.
- */
-static int read_partial_ack(struct ceph_connection *con)
-{
-	int end = 0;
-	int ret;
-
-	ret = read_partial(con, &end, sizeof(con->in_temp_ack),
-			   &con->in_temp_ack);
-	return ret;
-}
-
-
-/*
- * process_ack - retire every sent message covered by the ack just read
- * @con: connection whose in_temp_ack holds the peer's ack
- *
- * Walks con->out_sent from the front under con->out_mutex and removes
- * each message whose sequence number is <= the acked value, stopping at
- * the first one that is newer.  Afterwards the connection is set up to
- * read the next tag.
- */
-static void process_ack(struct ceph_connection *con)
-{
-	u64 acked = le64_to_cpu(con->in_temp_ack);
-
-	mutex_lock(&con->out_mutex);
-	for (;;) {
-		struct ceph_msg *sent;
-		u64 seq;
-
-		if (list_empty(&con->out_sent))
-			break;
-		sent = list_first_entry(&con->out_sent, struct ceph_msg,
-					list_head);
-		seq = le64_to_cpu(sent->hdr.seq);
-		if (seq > acked)
-			break;	/* everything from here on is still unacked */
-		dout("got ack for seq %llu type %d at %p\n", seq,
-		     le16_to_cpu(sent->hdr.type), sent);
-		ceph_msg_remove(sent);
-	}
-	mutex_unlock(&con->out_mutex);
-	prepare_read_tag(con);
-}
-
-
-
-
-
-
-/*
- * read_partial_message - read (part of) an incoming message
- * @con: connection to read from
- *
- * The message arrives in stages, and every stage resumes where the last
- * call left off:
- *   header - into con->in_hdr (progress in con->in_base_pos); its crc is
- *            checked as soon as the header is complete
- *   alloc  - con->ops->alloc_msg() supplies con->in_msg on first use
- *   front  - into m->front (progress in m->front.iov_len)
- *   middle - into m->middle, obtained through con->ops->alloc_middle()
- *   data   - into m->pages[], obtained through con->ops->prepare_pages()
- *   footer - into m->footer, followed by the crc comparisons
- *
- * When a callback declines (alloc_msg returns NULL, alloc_middle or
- * prepare_pages fails or is absent) the rest of the message is skipped:
- * con->in_base_pos is set to minus the number of bytes still to come,
- * con->in_tag goes back to CEPH_MSGR_TAG_READY, and 0 is returned so
- * that try_read() discards those bytes.
- *
- * Returns 1 once a complete message has been read and verified, 0 when
- * skipping, any value <= 0 handed back by ceph_tcp_recvmsg(), -EIO when
- * a length in the header is over its limit, -EBADMSG on a crc mismatch,
- * or the PTR_ERR() of an alloc_msg() failure.
- */
-static int read_partial_message(struct ceph_connection *con)
-{
-	struct ceph_msg *m = con->in_msg;
-	void *p;
-	int ret;
-	int to, want, left;
-	unsigned front_len, middle_len, data_len, data_off;
-	/*
-	 * msgr->nocrc means "no data crc", so data crcs are wanted exactly
-	 * when it is clear.  (The flag used to be copied without negation,
-	 * which disabled the check by default and enabled it on request.)
-	 * NOTE(review): assumes con->in_data_crc is reset when a message
-	 * read is set up; that code is outside this block - confirm.
-	 */
-	int datacrc = !con->msgr->nocrc;
-
-	dout("read_partial_message con %p msg %p\n", con, m);
-
-	/* header */
-	while (con->in_base_pos < sizeof(con->in_hdr)) {
-		left = sizeof(con->in_hdr) - con->in_base_pos;
-		ret = ceph_tcp_recvmsg(con->sock,
-				       (char *)&con->in_hdr + con->in_base_pos,
-				       left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-		if (con->in_base_pos == sizeof(con->in_hdr)) {
-			/* the crc covers everything but the crc field */
-			u32 crc = crc32c(0, (void *)&con->in_hdr,
-				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
-			if (crc != le32_to_cpu(con->in_hdr.crc)) {
-				pr_err("read_partial_message bad hdr "
-				       " crc %u != expected %u\n",
-				       crc, le32_to_cpu(con->in_hdr.crc));
-				return -EBADMSG;
-			}
-		}
-	}
-
-	/* sanity-check the advertised section lengths */
-	front_len = le32_to_cpu(con->in_hdr.front_len);
-	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
-		return -EIO;
-	middle_len = le32_to_cpu(con->in_hdr.middle_len);
-	if (middle_len > CEPH_MSG_MAX_DATA_LEN)
-		return -EIO;
-	data_len = le32_to_cpu(con->in_hdr.data_len);
-	if (data_len > CEPH_MSG_MAX_DATA_LEN)
-		return -EIO;
-
-	/* allocate message? */
-	if (!con->in_msg) {
-		dout("got hdr type %d front %d data %d\n",
-		     le16_to_cpu(con->in_hdr.type), front_len, data_len);
-		con->in_msg = con->ops->alloc_msg(con, &con->in_hdr);
-		if (!con->in_msg) {
-			/* skip this message */
-			dout("alloc_msg returned NULL, skipping message\n");
-			con->in_base_pos = -front_len - middle_len - data_len -
-				sizeof(m->footer);
-			con->in_tag = CEPH_MSGR_TAG_READY;
-			return 0;
-		}
-		if (IS_ERR(con->in_msg)) {
-			ret = PTR_ERR(con->in_msg);
-			con->in_msg = NULL;
-			con->error_msg = "out of memory for incoming message";
-			return ret;
-		}
-		m = con->in_msg;
-		m->front.iov_len = 0;    /* haven't read it yet */
-		memcpy(&m->hdr, &con->in_hdr, sizeof(con->in_hdr));
-	}
-
-	/* front */
-	while (m->front.iov_len < front_len) {
-		BUG_ON(m->front.iov_base == NULL);
-		left = front_len - m->front.iov_len;
-		ret = ceph_tcp_recvmsg(con->sock, (char *)m->front.iov_base +
-				       m->front.iov_len, left);
-		if (ret <= 0)
-			return ret;
-		m->front.iov_len += ret;
-		if (m->front.iov_len == front_len)
-			con->in_front_crc = crc32c(0, m->front.iov_base,
-						   m->front.iov_len);
-	}
-
-	/* middle */
-	while (middle_len > 0 && (!m->middle ||
-				  m->middle->vec.iov_len < middle_len)) {
-		if (m->middle == NULL) {
-			ret = -EOPNOTSUPP;
-			if (con->ops->alloc_middle)
-				ret = con->ops->alloc_middle(con, m);
-			if (ret < 0) {
-				dout("alloc_middle failed, skipping payload\n");
-				con->in_base_pos = -middle_len - data_len
-					- sizeof(m->footer);
-				ceph_msg_put(con->in_msg);
-				con->in_msg = NULL;
-				con->in_tag = CEPH_MSGR_TAG_READY;
-				return 0;
-			}
-			m->middle->vec.iov_len = 0;
-		}
-		left = middle_len - m->middle->vec.iov_len;
-		ret = ceph_tcp_recvmsg(con->sock,
-				       (char *)m->middle->vec.iov_base +
-				       m->middle->vec.iov_len, left);
-		if (ret <= 0)
-			return ret;
-		m->middle->vec.iov_len += ret;
-		if (m->middle->vec.iov_len == middle_len)
-			con->in_middle_crc = crc32c(0, m->middle->vec.iov_base,
-						    m->middle->vec.iov_len);
-	}
-
-	/* (page) data */
-	data_off = le16_to_cpu(m->hdr.data_off);
-	if (data_len == 0)
-		goto no_data;
-
-	if (m->nr_pages == 0) {
-		con->in_msg_pos.page = 0;
-		con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
-		con->in_msg_pos.data_pos = 0;
-		/* find pages for data payload */
-		want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-		ret = -1;
-		if (con->ops->prepare_pages)
-			ret = con->ops->prepare_pages(con, m, want);
-		if (ret < 0) {
-			dout("%p prepare_pages failed, skipping payload\n", m);
-			con->in_base_pos = -data_len - sizeof(m->footer);
-			ceph_msg_put(con->in_msg);
-			con->in_msg = NULL;
-			con->in_tag = CEPH_MSGR_TAG_READY;
-			return 0;
-		}
-		BUG_ON(m->nr_pages < want);
-	}
-	while (con->in_msg_pos.data_pos < data_len) {
-		/* never read past the payload or past the current page */
-		left = min((int)(data_len - con->in_msg_pos.data_pos),
-			   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
-		BUG_ON(m->pages == NULL);
-		p = kmap(m->pages[con->in_msg_pos.page]);
-		ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-				       left);
-		/* fold the new bytes into the crc while still mapped */
-		if (ret > 0 && datacrc)
-			con->in_data_crc =
-				crc32c(con->in_data_crc,
-				       p + con->in_msg_pos.page_pos, ret);
-		kunmap(m->pages[con->in_msg_pos.page]);
-		if (ret <= 0)
-			return ret;
-		con->in_msg_pos.data_pos += ret;
-		con->in_msg_pos.page_pos += ret;
-		if (con->in_msg_pos.page_pos == PAGE_SIZE) {
-			con->in_msg_pos.page_pos = 0;
-			con->in_msg_pos.page++;
-		}
-	}
-
-no_data:
-	/* footer; in_base_pos continues counting past the header */
-	to = sizeof(m->hdr) + sizeof(m->footer);
-	while (con->in_base_pos < to) {
-		left = to - con->in_base_pos;
-		ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
-				       (con->in_base_pos - sizeof(m->hdr)),
-				       left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-	}
-	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
-	     m, front_len, le32_to_cpu(m->footer.front_crc), middle_len,
-	     le32_to_cpu(m->footer.middle_crc), data_len,
-	     le32_to_cpu(m->footer.data_crc));
-
-	/* crc ok? */
-	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
-		pr_err("read_partial_message %p front crc %u != exp. %u\n",
-		       m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
-		return -EBADMSG;
-	}
-	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
-		pr_err("read_partial_message %p middle crc %u != exp %u\n",
-		       m, con->in_middle_crc,
-		       le32_to_cpu(m->footer.middle_crc));
-		return -EBADMSG;
-	}
-	/* the sender may have flagged that it sent no data crc */
-	if (datacrc &&
-	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
-	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
-		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
-		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
-		return -EBADMSG;
-	}
-
-	return 1; /* done! */
-}
-
-/*
- * process_message - hand a fully read message to the connection's owner
- * @con: connection whose in_msg has just been completed
- *
- * Runs in the worker thread.  The message is detached from the
- * connection, the peer name is recorded if none is known yet, in_seq is
- * advanced under out_mutex, and the message goes to con->ops->dispatch().
- * The dispatch callback should be careful not to wait on other incoming
- * messages or it may deadlock.  Finally the connection is set up to read
- * the next tag.
- */
-static void process_message(struct ceph_connection *con)
-{
-	struct ceph_msg *incoming;
-
-	/* detach the message from the connection first */
-	incoming = con->in_msg;
-	con->in_msg = NULL;
-
-	/* if first message, set peer_name */
-	if (con->peer_name.type == 0)
-		con->peer_name = incoming->hdr.src.name;
-
-	mutex_lock(&con->out_mutex);
-	con->in_seq++;
-	mutex_unlock(&con->out_mutex);
-
-	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
-	     incoming, le64_to_cpu(incoming->hdr.seq),
-	     ENTITY_NAME(incoming->hdr.src.name),
-	     le16_to_cpu(incoming->hdr.type),
-	     ceph_msg_type_name(le16_to_cpu(incoming->hdr.type)),
-	     le32_to_cpu(incoming->hdr.front_len),
-	     le32_to_cpu(incoming->hdr.data_len),
-	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
-	con->ops->dispatch(con, incoming);
-	prepare_read_tag(con);
-}
-
-
-/*
- * try_write - push queued output to the socket
- * @con: connection to write on
- *
- * Called in a worker thread when the socket appears to be writeable and
- * we have something ready to send.  Runs under con->out_mutex.  In order:
- * open the socket (queueing the connect handshake) if there is none,
- * flush pending skip bytes and kvecs, write the current message's pages,
- * and - unless still CONNECTING - queue up the next message, ack or
- * keepalive and go around again.  WRITE_PENDING is cleared once nothing
- * is left.
- *
- * Returns -1 if the connect attempt fails (con->error_msg is set),
- * otherwise 0.
- *
- * NOTE(review): every exit through "done" resets ret to 0, so a negative
- * result from the write_partial_*() helpers is logged but never reaches
- * con_work().  Those helpers are defined outside this block; confirm
- * whether they can return hard errors (as opposed to "would block")
- * before propagating them.
- */
-static int try_write(struct ceph_connection *con)
-{
-	struct ceph_messenger *msgr = con->msgr;
-	int ret = 1;
-
-	dout("try_write start %p state %lu nref %d\n", con, con->state,
-	     atomic_read(&con->nref));
-
-	mutex_lock(&con->out_mutex);
-more:
-	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
-
-	/* open the socket first? */
-	if (con->sock == NULL) {
-		/*
-		 * if we were STANDBY and are reconnecting _this_
-		 * connection, bump connect_seq now.  Always bump
-		 * global_seq.
-		 */
-		if (test_and_clear_bit(STANDBY, &con->state))
-			con->connect_seq++;
-
-		prepare_write_connect(msgr, con);
-		prepare_read_connect(con);
-		set_bit(CONNECTING, &con->state);
-
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		dout("try_write initiating connect on %p new state %lu\n",
-		     con, con->state);
-		con->sock = ceph_tcp_connect(con);
-		if (IS_ERR(con->sock)) {
-			con->sock = NULL;
-			con->error_msg = "connect error";
-			ret = -1;
-			goto out;
-		}
-	}
-
-more_kvec:
-	/* kvec data queued? */
-	if (con->out_skip) {
-		ret = write_partial_skip(con);
-		/* log before bailing; this used to sit behind the goto */
-		if (ret < 0)
-			dout("try_write write_partial_skip err %d\n", ret);
-		if (ret <= 0)
-			goto done;
-	}
-	if (con->out_kvec_left) {
-		ret = write_partial_kvec(con);
-		if (ret < 0)
-			dout("try_write write_partial_kvec err %d\n", ret);
-		if (ret <= 0)
-			goto done;
-	}
-
-	/* msg pages? */
-	if (con->out_msg) {
-		ret = write_partial_msg_pages(con);
-		if (ret == 1)
-			goto more_kvec;  /* we need to send the footer, too! */
-		if (ret == 0)
-			goto done;
-		if (ret < 0) {
-			dout("try_write write_partial_msg_pages err %d\n",
-			     ret);
-			goto done;
-		}
-	}
-
-	if (!test_bit(CONNECTING, &con->state)) {
-		/* is anything else pending? */
-		if (!list_empty(&con->out_queue)) {
-			prepare_write_message(con);
-			goto more;
-		}
-		if (con->in_seq > con->in_seq_acked) {
-			prepare_write_ack(con);
-			goto more;
-		}
-		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
-			prepare_write_keepalive(con);
-			goto more;
-		}
-	}
-
-	/* Nothing to do! */
-	clear_bit(WRITE_PENDING, &con->state);
-	dout("try_write nothing else to write.\n");
-done:
-	ret = 0;
-out:
-	mutex_unlock(&con->out_mutex);
-	dout("try_write done on %p\n", con);
-	return ret;
-}
-
-
-
-/*
- * try_read - read what we can from the socket
- * @con: connection to read on
- *
- * Does nothing (returns 0) when there is no socket or the connection is
- * in STANDBY.  Otherwise loops over the input state:
- *   - while CONNECTING, read and process the connect reply;
- *   - while con->in_base_pos is negative, read and discard that many
- *     bytes (a message that is being skipped);
- *   - with in_tag == READY, read the next tag byte and set up for a
- *     message or an ack; a CLOSE tag marks the connection CLOSED;
- *   - with in_tag == MSG or ACK, read the body and process it.
- *
- * Returns 0 normally, including when a read helper returns <= 0 and we
- * simply stop for now.  Returns -EIO for a bad crc or an over-limit
- * header length, and -1 for a connect failure or an unknown tag;
- * con->error_msg is set for all of these except a connect failure,
- * where process_connect() is responsible for it.
- */
-static int try_read(struct ceph_connection *con)
-{
-	int ret = -1;
-
-	if (!con->sock)
-		return 0;
-
-	if (test_bit(STANDBY, &con->state))
-		return 0;
-
-	dout("try_read start on %p\n", con);
-
-more:
-	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
-	     con->in_base_pos);
-	if (test_bit(CONNECTING, &con->state)) {
-		dout("try_read connecting\n");
-		ret = read_partial_connect(con);
-		if (ret <= 0)
-			goto done;
-		if (process_connect(con) < 0) {
-			ret = -1;
-			goto out;
-		}
-		goto more;
-	}
-
-	if (con->in_base_pos < 0) {
-		/*
-		 * skipping + discarding content.
-		 *
-		 * FIXME: there must be a better way to do this!
-		 *
-		 * The scratch buffer is static, so it is shared by every
-		 * caller; its contents are never looked at.
-		 */
-		static char buf[1024];
-		int skip = min((int)sizeof(buf), -con->in_base_pos);
-
-		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
-		ret = ceph_tcp_recvmsg(con->sock, buf, skip);
-		if (ret <= 0)
-			goto done;
-		con->in_base_pos += ret;
-		if (con->in_base_pos)
-			goto more;
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_READY) {
-		/*
-		 * what's next?
-		 */
-		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
-		if (ret <= 0)
-			goto done;
-		dout("try_read got tag %d\n", (int)con->in_tag);
-		switch (con->in_tag) {
-		case CEPH_MSGR_TAG_MSG:
-			prepare_read_message(con);
-			break;
-		case CEPH_MSGR_TAG_ACK:
-			prepare_read_ack(con);
-			break;
-		case CEPH_MSGR_TAG_CLOSE:
-			set_bit(CLOSED, &con->state);   /* fixme */
-			goto done;
-		default:
-			goto bad_tag;
-		}
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
-		ret = read_partial_message(con);
-		if (ret <= 0) {
-			switch (ret) {
-			case -EBADMSG:
-				con->error_msg = "bad crc";
-				ret = -EIO;
-				goto out;
-			case -EIO:
-				con->error_msg = "io error";
-				goto out;
-			default:
-				goto done;
-			}
-		}
-		/* read_partial_message() may have switched to skipping */
-		if (con->in_tag == CEPH_MSGR_TAG_READY)
-			goto more;
-		process_message(con);
-		goto more;
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_ACK) {
-		ret = read_partial_ack(con);
-		if (ret <= 0)
-			goto done;
-		process_ack(con);
-		goto more;
-	}
-
-done:
-	ret = 0;
-out:
-	dout("try_read done on %p\n", con);
-	return ret;
-
-bad_tag:
-	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
-	con->error_msg = "protocol error, garbage tag";
-	ret = -1;
-	goto out;
-}
-
-
-/*
- * queue_con - atomically queue work on a connection
- * @con: connection that has something to do
- *
- * A reference is taken through con->ops->get() before queueing, to
- * avoid races with connection teardown; it is dropped again right here
- * if no new work item was queued.  A connection marked DEAD, or one for
- * which get() fails, is left alone.
- *
- * QUEUED and BUSY exist because only a _single_ thread may operate on a
- * connection at any point in time, while we still want to use all
- * available CPUs:
- *
- *  - The worker only proceeds if it can atomically set BUSY.  It then
- *    clears QUEUED and does its thing.  When it thinks it is done it
- *    clears BUSY and rechecks QUEUED; if that is set again, it loops
- *    (tries again to set BUSY).
- *
- *  - To queue work we first set QUEUED, and only _then_, if BUSY is not
- *    set, try to queue the work item.  If that is not possible (BUSY,
- *    or the item is already queued) we give up, but leave QUEUED set so
- *    that the worker will loop if necessary.
- */
-static void queue_con(struct ceph_connection *con)
-{
-	if (test_bit(DEAD, &con->state)) {
-		dout("queue_con %p ignoring: DEAD\n",
-		     con);
-		return;
-	}
-
-	if (!con->ops->get(con)) {
-		dout("queue_con %p ref count 0\n", con);
-		return;
-	}
-
-	/* QUEUED must be visible before BUSY is sampled */
-	set_bit(QUEUED, &con->state);
-	if (test_bit(BUSY, &con->state)) {
-		dout("queue_con %p - already BUSY\n", con);
-		goto drop_ref;
-	}
-	if (!queue_work(ceph_msgr_wq, &con->work.work)) {
-		dout("queue_con %p - already queued\n", con);
-		goto drop_ref;
-	}
-	dout("queue_con %p\n", con);
-	return;		/* the queued work item now owns the reference */
-
-drop_ref:
-	con->ops->put(con);
-}
-
-/*
- * Do some work on a connection. Drop a connection ref when we're done.
- *
- * This is the work function behind con->work.  It claims the connection
- * by setting BUSY, clears QUEUED, and then closes the socket if the
- * connection is CLOSED, closes it for a reopen if OPENING was set, and
- * otherwise reads and writes.  A pending SOCK_CLOSED, or a negative
- * return from try_read() or try_write(), goes to ceph_fault().
- *
- * After BUSY is released QUEUED is sampled again: if more work was
- * requested meanwhile and we did not just fault, start over; after a
- * fault QUEUED is simply cleared.  Every exit drops one reference
- * through con->ops->put().
- */
-static void con_work(struct work_struct *work)
-{
- struct ceph_connection *con = container_of(work, struct ceph_connection,
- work.work);
- int backoff = 0;
-
-more:
- /* someone else holds BUSY: leave the connection alone, just drop the ref */
- if (test_and_set_bit(BUSY, &con->state) != 0) {
- dout("con_work %p BUSY already set\n", con);
- goto out;
- }
- dout("con_work %p start, clearing QUEUED\n", con);
- clear_bit(QUEUED, &con->state);
-
- if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
- dout("con_work CLOSED\n");
- con_close_socket(con);
- goto done;
- }
- if (test_and_clear_bit(OPENING, &con->state)) {
- /* reopen w/ new peer */
- dout("con_work OPENING\n");
- con_close_socket(con);
- }
-
- /* short-circuit order: a closed socket skips the read and the write */
- if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
- try_read(con) < 0 ||
- try_write(con) < 0) {
- backoff = 1;
- ceph_fault(con); /* error/fault path */
- }
-
-done:
- clear_bit(BUSY, &con->state);
- dout("con->state=%lu\n", con->state);
- /* QUEUED set again while we were BUSY means more work was requested */
- if (test_bit(QUEUED, &con->state)) {
- if (!backoff) {
- dout("con_work %p QUEUED reset, looping\n", con);
- goto more;
- }
- dout("con_work %p QUEUED reset, but just faulted\n", con);
- clear_bit(QUEUED, &con->state);
- }
- dout("con_work %p done\n", con);
-
-out:
- con->ops->put(con);
-}
-
-
-/*
- * ceph_fault - generic error/fault handler
- * @con: connection that hit an error
- *
- * Logs the fault and, unless the connection is LOSSYTX, closes the
- * socket, drops any partially received message, and puts everything
- * that was sent but not acked back on the out queue.  If nothing is
- * then left to send and no keepalive is pending the connection goes to
- * STANDBY (no reconnect yet); otherwise a reconnect is scheduled with
- * exponential backoff (BASE_DELAY_INTERVAL, doubling while below
- * MAX_DELAY_INTERVAL).  con->ops->fault(), if provided, is called last
- * in every case.
- */
-static void ceph_fault(struct ceph_connection *con)
-{
-	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
-	       pr_addr(&con->peer_addr.in_addr), con->error_msg);
-	dout("fault %p state %lu to peer %s\n",
-	     con, con->state, pr_addr(&con->peer_addr.in_addr));
-
-	if (test_bit(LOSSYTX, &con->state)) {
-		dout("fault on LOSSYTX channel\n");
-		goto out;
-	}
-
-	clear_bit(BUSY, &con->state);  /* to avoid an improbable race */
-
-	con_close_socket(con);
-
-	/*
-	 * A half-read message still carries the reference taken when it
-	 * was allocated (read_partial_message() drops it the same way
-	 * when it abandons a message); clearing the pointer alone leaked
-	 * the message.
-	 */
-	if (con->in_msg) {
-		ceph_msg_put(con->in_msg);
-		con->in_msg = NULL;
-	}
-
-	mutex_lock(&con->out_mutex);
-
-	/*
-	 * Requeue anything that hasn't been acked.  This has to happen
-	 * before the emptiness test below, otherwise messages that were
-	 * sent but never acked would sit in out_sent while we go STANDBY.
-	 */
-	list_splice_init(&con->out_sent, &con->out_queue);
-
-	/*
-	 * If there is nothing to send, place the connection in a STANDBY
-	 * state (i.e., don't try to reconnect just yet).  A keepalive
-	 * request is recorded by ceph_con_keepalive() as the
-	 * KEEPALIVE_PENDING state bit, so honour that as well as the
-	 * out_keepalive_pending field.
-	 */
-	if (list_empty(&con->out_queue) &&
-	    !con->out_keepalive_pending &&
-	    !test_bit(KEEPALIVE_PENDING, &con->state)) {
-		dout("fault setting STANDBY\n");
-		set_bit(STANDBY, &con->state);
-		mutex_unlock(&con->out_mutex);
-		goto out;
-	}
-	mutex_unlock(&con->out_mutex);
-
-	/* retry after a delay that doubles with each consecutive fault */
-	if (con->delay == 0)
-		con->delay = BASE_DELAY_INTERVAL;
-	else if (con->delay < MAX_DELAY_INTERVAL)
-		con->delay *= 2;
-
-	/* explicitly schedule work to try to reconnect again later. */
-	dout("fault queueing %p delay %lu\n", con, con->delay);
-	con->ops->get(con);
-	if (queue_delayed_work(ceph_msgr_wq, &con->work,
-			       round_jiffies_relative(con->delay)) == 0)
-		con->ops->put(con);	/* already queued: ref not needed */
-
-out:
-	if (con->ops->fault)
-		con->ops->fault(con);
-}
-
-
-
-/*
- * ceph_messenger_create - create a new messenger instance
- * @myaddr: our own address, copied into the messenger; may be NULL
- *
- * Allocates a zeroed messenger together with one zero-filled page that
- * stays kmap()ed until ceph_messenger_destroy(), and picks a random
- * nonce for the messenger's address.
- *
- * Returns the messenger, or ERR_PTR(-ENOMEM) if either allocation fails.
- */
-struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
-{
-	struct ceph_messenger *msgr;
-	struct page *zero;
-
-	msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
-	if (!msgr)
-		goto enomem;
-
-	spin_lock_init(&msgr->global_seq_lock);
-
-	/* the zero page is needed if a request is "canceled" while the message
-	 * is being written over the socket */
-	zero = alloc_page(GFP_KERNEL | __GFP_ZERO);
-	if (!zero)
-		goto free_msgr;
-	msgr->zero_page = zero;
-	kmap(msgr->zero_page);
-
-	if (myaddr)
-		msgr->inst.addr = *myaddr;
-
-	/* select a random nonce */
-	get_random_bytes(&msgr->inst.addr.nonce,
-			 sizeof(msgr->inst.addr.nonce));
-
-	dout("messenger_create %p\n", msgr);
-	return msgr;
-
-free_msgr:
-	kfree(msgr);
-enomem:
-	return ERR_PTR(-ENOMEM);
-}
-
-/*
- * ceph_messenger_destroy - release a messenger and its zero page
- * @msgr: messenger obtained from ceph_messenger_create()
- *
- * Unmaps and frees the zero page, then frees the messenger itself.
- */
-void ceph_messenger_destroy(struct ceph_messenger *msgr)
-{
-	struct page *zero = msgr->zero_page;
-
-	dout("destroy %p\n", msgr);
-	kunmap(zero);
-	__free_page(zero);
-	kfree(msgr);
-	/* only the pointer value is printed from here on */
-	dout("destroyed messenger %p\n", msgr);
-}
-
-/*
- * ceph_con_send - queue up an outgoing message on the given connection
- * @con: connection to send on
- * @msg: message to send; must not be on any list yet
- *
- * If the connection is CLOSED the message is dropped (ceph_msg_put) and
- * nothing is queued.  Otherwise the source and destination fields of the
- * header are filled in, the message is appended to con->out_queue under
- * out_mutex, and work is queued unless a write was already pending.
- */
-void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	struct ceph_messenger *msgr;
-
-	if (test_bit(CLOSED, &con->state)) {
-		dout("con_send %p closed, dropping %p\n", con, msg);
-		ceph_msg_put(msg);
-		return;
-	}
-
-	/* set src+dst */
-	msgr = con->msgr;
-	msg->hdr.src = msgr->inst;
-	msg->hdr.orig_src = msgr->inst;
-	msg->hdr.dst_erank = con->peer_addr.erank;
-
-	/* queue */
-	mutex_lock(&con->out_mutex);
-	BUG_ON(!list_empty(&msg->list_head));
-	list_add_tail(&msg->list_head, &con->out_queue);
-	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
-	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
-	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
-	     le32_to_cpu(msg->hdr.front_len),
-	     le32_to_cpu(msg->hdr.middle_len),
-	     le32_to_cpu(msg->hdr.data_len));
-	mutex_unlock(&con->out_mutex);
-
-	/* a write was already pending: whoever set the bit queued the work */
-	if (test_and_set_bit(WRITE_PENDING, &con->state))
-		return;
-	queue_con(con);
-}
-
-/*
- * ceph_con_revoke - revoke a message that was previously queued for send
- * @con: connection the message was queued on
- * @msg: message to take back
- *
- * If @msg is still on one of the connection's lists it is unlinked, its
- * sequence number is cleared, and the list's reference is dropped.  If
- * it was the message currently being written, out_msg is cleared.  If
- * the pending kvecs are marked as referring to a message, the bytes
- * still outstanding are turned into skip bytes.  A message that is on
- * no list is left alone.
- *
- * NOTE(review): the kvec skip is applied whenever out_kvec_is_msg is
- * set, not only when @msg is con->out_msg; the writer side is outside
- * this block, so that behaviour is kept as is - confirm it is intended.
- */
-void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	mutex_lock(&con->out_mutex);
-	if (list_empty(&msg->list_head)) {
-		dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
-		goto unlock;
-	}
-
-	dout("con_revoke %p msg %p\n", con, msg);
-	list_del_init(&msg->list_head);
-	msg->hdr.seq = 0;
-	if (con->out_msg == msg)
-		con->out_msg = NULL;
-	if (con->out_kvec_is_msg) {
-		con->out_skip = con->out_kvec_bytes;
-		con->out_kvec_is_msg = false;
-	}
-	/*
-	 * Drop the list's reference only after the last access to *msg:
-	 * this put may free the message.
-	 */
-	ceph_msg_put(msg);
-
-unlock:
-	mutex_unlock(&con->out_mutex);
-}
-
-/*
- * ceph_con_keepalive - request a keepalive byte on the connection
- * @con: connection to probe
- *
- * Queues a keepalive byte to ensure the tcp connection is alive.  Work
- * is queued only by the caller that newly sets both KEEPALIVE_PENDING
- * and WRITE_PENDING.
- */
-void ceph_con_keepalive(struct ceph_connection *con)
-{
-	/* a keepalive has been requested already */
-	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state))
-		return;
-	/* a write is pending already */
-	if (test_and_set_bit(WRITE_PENDING, &con->state))
-		return;
-	queue_con(con);
-}
-
-
-/*
- * ceph_msg_new - construct a new message with given type, size
- * @type:      message type, stored in the header
- * @front_len: size of the front buffer to allocate; 0 for none
- * @page_len:  length of the data payload carried in @pages
- * @page_off:  offset of the payload within the first page
- * @pages:     page vector for the payload (not owned by the message);
- *             may be NULL
- *
- * The front is vmalloc'ed when it is larger than PAGE_CACHE_SIZE and
- * kmalloc'ed otherwise; ceph_msg_kfree() picks the matching free.
- *
- * Returns the new message with a ref count of 1, or ERR_PTR(-ENOMEM).
- * NULL is never returned.
- */
-struct ceph_msg *ceph_msg_new(int type, int front_len,
-			      int page_len, int page_off, struct page **pages)
-{
-	struct ceph_msg *m;
-
-	/*
-	 * Zeroed allocation: the error path below goes through
-	 * ceph_msg_put(), which reads m->middle and m->pool, and the
-	 * header/footer fields that are not assigned here must not carry
-	 * stale heap contents.
-	 */
-	m = kzalloc(sizeof(*m), GFP_NOFS);
-	if (m == NULL)
-		goto out;
-	atomic_set(&m->nref, 1);
-	INIT_LIST_HEAD(&m->list_head);
-
-	m->hdr.type = cpu_to_le16(type);
-	m->hdr.front_len = cpu_to_le32(front_len);
-	m->hdr.middle_len = 0;
-	m->hdr.data_len = cpu_to_le32(page_len);
-	m->hdr.data_off = cpu_to_le16(page_off);
-	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
-	m->footer.front_crc = 0;
-	m->footer.middle_crc = 0;
-	m->footer.data_crc = 0;
-	m->front_max = front_len;
-	m->front_is_vmalloc = false;
-	m->more_to_follow = false;
-	m->pool = NULL;
-
-	/* middle and data first, so the message is complete on any exit */
-	m->middle = NULL;
-	m->nr_pages = calc_pages_for(page_off, page_len);
-	m->pages = pages;
-
-	/* front */
-	if (front_len) {
-		if (front_len > PAGE_CACHE_SIZE) {
-			m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
-						      PAGE_KERNEL);
-			m->front_is_vmalloc = true;
-		} else {
-			m->front.iov_base = kmalloc(front_len, GFP_NOFS);
-		}
-		if (m->front.iov_base == NULL) {
-			pr_err("msg_new can't allocate %d bytes\n",
-			     front_len);
-			goto out2;
-		}
-	} else {
-		m->front.iov_base = NULL;
-	}
-	m->front.iov_len = front_len;
-
-	dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
-	     m->nr_pages);
-	return m;
-
-out2:
-	ceph_msg_put(m);
-out:
-	pr_err("msg_new can't create type %d len %d\n", type, front_len);
-	return ERR_PTR(-ENOMEM);
-}
-
-/*
- * ceph_alloc_msg - generic message allocator, for incoming messages
- * @con: connection the message arrived on (not used here)
- * @hdr: header of the incoming message; supplies type and front length
- *
- * Allocates a message with a front of the advertised size and no data
- * pages.
- *
- * Returns the message, or an ERR_PTR() on failure.
- */
-struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-				struct ceph_msg_header *hdr)
-{
-	int type = le16_to_cpu(hdr->type);
-	int front_len = le32_to_cpu(hdr->front_len);
-	struct ceph_msg *msg = ceph_msg_new(type, front_len, 0, 0, NULL);
-
-	/* ceph_msg_new() reports failure with ERR_PTR(), not NULL */
-	if (!msg || IS_ERR(msg)) {
-		pr_err("unable to allocate msg type %d len %d\n",
-		       type, front_len);
-		return msg ? msg : ERR_PTR(-ENOMEM);
-	}
-	return msg;
-}
-
-/*
- * ceph_alloc_middle - allocate the "middle" portion of a message
- * @con: connection the message arrived on (not used here)
- * @msg: message whose header announces a middle section
- *
- * Used when the middle is needed and wasn't allocated by alloc_msg.
- * This allows us to read a small fixed-size per-type header in the
- * front and then gracefully fail (i.e., propagate the error to the
- * caller based on info in the front) when the middle is too large.
- *
- * The header's middle_len must be non-zero and the message must not
- * already have a middle (both are BUG_ON conditions).
- *
- * Returns 0 on success or -ENOMEM.
- */
-int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	int type = le16_to_cpu(msg->hdr.type);
-	int middle_len = le32_to_cpu(msg->hdr.middle_len);
-	struct ceph_buffer *buf;
-
-	dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
-	     ceph_msg_type_name(type), middle_len);
-	BUG_ON(!middle_len);
-	BUG_ON(msg->middle);
-
-	buf = ceph_buffer_new_alloc(middle_len, GFP_NOFS);
-	msg->middle = buf;
-	return buf ? 0 : -ENOMEM;
-}
-
-
-/*
- * ceph_msg_kfree - free a generically kmalloc'd message
- * @m: message to free
- *
- * Releases the front buffer with the free routine that matches how it
- * was allocated (see front_is_vmalloc), then the message itself.
- */
-void ceph_msg_kfree(struct ceph_msg *m)
-{
-	void *front = m->front.iov_base;
-
-	dout("msg_kfree %p\n", m);
-	if (!m->front_is_vmalloc)
-		kfree(front);
-	else
-		vfree(front);
-	kfree(m);
-}
-
-/*
- * ceph_msg_put - drop a msg ref, destroying the message as needed
- * @m: message to release
- *
- * A put on a message whose count is already <= 0 is reported and
- * triggers a WARN.  When the last reference goes away the middle buffer
- * (if any) is released, the page vector is forgotten (the message never
- * owned it), and the message is returned to its pool or freed with
- * ceph_msg_kfree().
- */
-void ceph_msg_put(struct ceph_msg *m)
-{
-	dout("ceph_msg_put %p %d -> %d\n", m, atomic_read(&m->nref),
-	     atomic_read(&m->nref)-1);
-	if (atomic_read(&m->nref) <= 0) {
-		pr_err("bad ceph_msg_put on %p %llu %d=%s %d+%d\n",
-		       m, le64_to_cpu(m->hdr.seq),
-		       le16_to_cpu(m->hdr.type),
-		       ceph_msg_type_name(le16_to_cpu(m->hdr.type)),
-		       le32_to_cpu(m->hdr.front_len),
-		       le32_to_cpu(m->hdr.data_len));
-		WARN_ON(1);
-	}
-
-	/* other holders remain: nothing more to do */
-	if (!atomic_dec_and_test(&m->nref))
-		return;
-
-	dout("ceph_msg_put last one on %p\n", m);
-	WARN_ON(!list_empty(&m->list_head));
-
-	/* drop middle, data, if any */
-	if (m->middle) {
-		ceph_buffer_put(m->middle);
-		m->middle = NULL;
-	}
-	m->nr_pages = 0;
-	m->pages = NULL;
-
-	if (m->pool)
-		ceph_msgpool_put(m->pool, m);
-	else
-		ceph_msg_kfree(m);
-}
+++ /dev/null
-#ifndef __FS_CEPH_MESSENGER_H
-#define __FS_CEPH_MESSENGER_H
-
-#include <linux/mutex.h>
-#include <linux/net.h>
-#include <linux/radix-tree.h>
-#include <linux/uio.h>
-#include <linux/version.h>
-#include <linux/workqueue.h>
-
-#include "types.h"
-#include "buffer.h"
-
-struct ceph_msg;
-struct ceph_connection;
-
-extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
-
-/*
- * Ceph defines these callbacks for handling connection events.
- *
- * The messenger calls get, put, dispatch and alloc_msg unconditionally;
- * fault, peer_reset, alloc_middle and prepare_pages are checked for NULL
- * before use.
- */
-struct ceph_connection_operations {
- /* take a connection reference; a NULL return makes queue_con() give up */
- struct ceph_connection *(*get)(struct ceph_connection *);
- /* drop a connection reference */
- void (*put)(struct ceph_connection *);
-
- /* handle an incoming message. */
- void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
-
- /* protocol version mismatch */
- void (*bad_proto) (struct ceph_connection *con);
-
- /* there was some error on the socket (disconnect, whatever) */
- void (*fault) (struct ceph_connection *con);
-
- /* a remote host has terminated a message exchange session, and messages
- * we sent (or they tried to send us) may be lost. */
- void (*peer_reset) (struct ceph_connection *con);
-
- /* provide a message for an incoming header: NULL means skip the
- * message, an ERR_PTR() value is treated as an error. */
- struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
- struct ceph_msg_header *hdr);
- /* provide msg->middle; a negative return (or no callback at all)
- * makes the reader skip the rest of the message. */
- int (*alloc_middle) (struct ceph_connection *con,
- struct ceph_msg *msg);
- /* an incoming message has a data payload; tell me what pages I
- * should read the data into. want is the number of pages needed; a
- * negative return (or no callback) makes the reader skip the payload. */
- int (*prepare_pages) (struct ceph_connection *con, struct ceph_msg *m,
- int want);
-};
-
-extern const char *ceph_name_type_str(int t);
-
-/*
- * Expands to TWO comma-separated arguments (type string, number) for a
- * printf-style call, so it is only usable inside an argument list. The
- * argument n is evaluated twice.
- * Callers in this code pair it with the format string %s%lld.
- */
-#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
-
-/*
- * Per-messenger state, created by ceph_messenger_create() and released
- * by ceph_messenger_destroy().
- */
-struct ceph_messenger {
- struct ceph_entity_inst inst; /* my name+address */
- struct page *zero_page; /* used in certain error cases */
-
- bool nocrc; /* read by the reader when deciding about data crcs */
-
- /*
- * the global_seq counts connections i (attempt to) initiate
- * in order to disambiguate certain connect race conditions.
- */
- u32 global_seq;
- spinlock_t global_seq_lock;
-};
-
-/*
- * a single message. it contains a header (src, dest, message type, etc.),
- * footer (crc values, mainly), a "front" message body, and possibly a
- * data payload (stored in some number of pages).
- *
- * Messages are reference counted through ceph_msg_get()/ceph_msg_put().
- */
-struct ceph_msg {
- struct ceph_msg_header hdr; /* header */
- struct ceph_msg_footer footer; /* footer */
- struct kvec front; /* unaligned blobs of message */
- struct ceph_buffer *middle; /* optional middle section, or NULL */
- struct page **pages; /* data payload. NOT OWNER. */
- unsigned nr_pages; /* size of page array */
- struct list_head list_head; /* links the msg into a connection's out lists */
- atomic_t nref; /* reference count */
- bool front_is_vmalloc; /* front was vmalloc'ed rather than kmalloc'ed */
- bool more_to_follow;
- int front_max; /* front length requested at allocation time */
-
- struct ceph_msgpool *pool; /* pool to return the msg to, or NULL to free it */
-};
-
-/* progress through the data pages of a message being read or written */
-struct ceph_msg_pos {
- int page, page_pos; /* which page; offset in page */
- int data_pos; /* offset in data payload */
- int did_page_crc; /* true if we've calculated crc for current page */
-};
-
-/*
- * ceph connection fault delay defaults, for exponential backoff.
- * Both are expressed in jiffies: the first retry waits
- * BASE_DELAY_INTERVAL, and the delay keeps doubling while it is below
- * MAX_DELAY_INTERVAL.
- */
-#define BASE_DELAY_INTERVAL (HZ/2)
-#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
-
-/*
- * ceph_connection state bit flags
- *
- * These are bit numbers (not masks) for the bit operations on
- * ceph_connection.state. Numbers 7 and 9 are not assigned.
- *
- * QUEUED and BUSY are used together to ensure that only a single
- * thread is currently opening, reading or writing data to the socket.
- */
-#define LOSSYTX 0 /* we can close channel or drop messages on errors */
-#define LOSSYRX 1 /* peer may reset/drop messages */
-#define CONNECTING 2 /* connect handshake in progress */
-#define KEEPALIVE_PENDING 3 /* a keepalive byte has been requested */
-#define WRITE_PENDING 4 /* we have data ready to send */
-#define QUEUED 5 /* there is work queued on this connection */
-#define BUSY 6 /* work is being done */
-#define STANDBY 8 /* no outgoing messages, socket closed. we keep
- * the ceph_connection around to maintain shared
- * state with the peer. */
-#define CLOSED 10 /* we've closed the connection */
-#define SOCK_CLOSED 11 /* socket state changed to closed */
-#define REGISTERED 12 /* connection appears in con_tree */
-#define OPENING 13 /* open connection w/ (possibly new) peer */
-#define DEAD 14 /* dead, about to kfree */
-
-/*
- * A single connection with another host.
- *
- * We maintain a queue of outgoing messages, and some session state to
- * ensure that we can preserve the lossless, ordered delivery of
- * messages in the case of a TCP disconnect.
- */
-struct ceph_connection {
- void *private;
- atomic_t nref;
-
- const struct ceph_connection_operations *ops;
-
- struct ceph_messenger *msgr;
- struct socket *sock; /* NULL while no socket is open */
- unsigned long state; /* connection state (see flags above) */
- const char *error_msg; /* error message, if any */
-
- struct ceph_entity_addr peer_addr; /* peer address */
- struct ceph_entity_name peer_name; /* peer name */
- struct ceph_entity_addr peer_addr_for_me;
- u32 connect_seq; /* identify the most recent connection
- attempt for this connection, client */
- u32 peer_global_seq; /* peer's global seq for this connection */
-
- /* out queue */
- struct mutex out_mutex; /* held around the out lists and by try_write() */
- struct list_head out_queue;
- struct list_head out_sent; /* sending or sent but unacked */
- u64 out_seq; /* last message queued for send */
- u64 out_seq_sent; /* last message sent */
- bool out_keepalive_pending; /* NOTE(review): distinct from the
- KEEPALIVE_PENDING state bit */
-
- u64 in_seq, in_seq_acked; /* last message received, acked */
-
- /* connection negotiation temps */
- char in_banner[CEPH_BANNER_MAX_LEN];
- /* the two structs below share storage: out_connect overlays
- in_connect, and in_reply overlays out_reply */
- union {
- struct { /* outgoing connection */
- struct ceph_msg_connect out_connect;
- struct ceph_msg_connect_reply in_reply;
- };
- struct { /* incoming */
- struct ceph_msg_connect in_connect;
- struct ceph_msg_connect_reply out_reply;
- };
- };
- struct ceph_entity_addr actual_peer_addr;
-
- /* message out temps */
- struct ceph_msg *out_msg; /* sending message (== tail of
- out_sent) */
- struct ceph_msg_pos out_msg_pos;
-
- struct kvec out_kvec[8], /* sending header/footer data */
- *out_kvec_cur;
- int out_kvec_left; /* kvec's left in out_kvec */
- int out_skip; /* skip this many bytes */
- int out_kvec_bytes; /* total bytes left */
- bool out_kvec_is_msg; /* kvec refers to out_msg */
- int out_more; /* there is more data after the kvecs */
- __le64 out_temp_ack; /* for writing an ack */
-
- /* message in temps */
- struct ceph_msg_header in_hdr;
- struct ceph_msg *in_msg; /* message being read, or NULL */
- struct ceph_msg_pos in_msg_pos;
- u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
-
- char in_tag; /* protocol control byte */
- int in_base_pos; /* bytes read; negative while that many
- bytes of input are still to be skipped */
- __le64 in_temp_ack; /* for reading an ack */
-
- struct delayed_work work; /* send|recv work */
- unsigned long delay; /* current delay interval */
-};
-
-
-/* address helpers */
-extern const char *pr_addr(const struct sockaddr_storage *ss);
-extern int ceph_parse_ips(const char *c, const char *end,
- struct ceph_entity_addr *addr,
- int max_count, int *count);
-
-
-/* module-wide setup and teardown */
-extern int ceph_msgr_init(void);
-extern void ceph_msgr_exit(void);
-
-/* messenger lifetime; create returns ERR_PTR(-ENOMEM) on failure */
-extern struct ceph_messenger *ceph_messenger_create(
- struct ceph_entity_addr *myaddr);
-extern void ceph_messenger_destroy(struct ceph_messenger *);
-
-/* connection lifetime, sending and reference counting */
-extern void ceph_con_init(struct ceph_messenger *msgr,
- struct ceph_connection *con);
-extern void ceph_con_shutdown(struct ceph_connection *con);
-extern void ceph_con_open(struct ceph_connection *con,
- struct ceph_entity_addr *addr);
-extern void ceph_con_close(struct ceph_connection *con);
-/* queues msg for sending; a CLOSED connection drops (puts) it instead */
-extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
-/* takes a still-listed msg back and drops the list's reference */
-extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
-extern void ceph_con_keepalive(struct ceph_connection *con);
-extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
-extern void ceph_con_put(struct ceph_connection *con);
-
-/* new message with a ref count of 1, or ERR_PTR(-ENOMEM) */
-extern struct ceph_msg *ceph_msg_new(int type, int front_len,
- int page_len, int page_off,
- struct page **pages);
-extern void ceph_msg_kfree(struct ceph_msg *m);
-
-/* generic allocators for incoming messages */
-extern struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
- struct ceph_msg_header *hdr);
-extern int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg);
-
-
-/*
- * ceph_msg_get - take an additional reference on a message
- * @msg: message to pin
- *
- * Returns @msg so that the call can be used inline; each get is paired
- * with a ceph_msg_put().
- */
-static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
-{
-	dout("ceph_msg_get %p %d -> %d\n", msg, atomic_read(&msg->nref),
-	     atomic_read(&msg->nref)+1);
-
-	atomic_inc(&msg->nref);
-
-	return msg;
-}
-
-/* drop a reference; the last put releases the message */
-extern void ceph_msg_put(struct ceph_msg *msg);
-
-#endif
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/types.h>
-#include <linux/random.h>
-#include <linux/sched.h>
-
-#include "mon_client.h"
-#include "super.h"
-#include "decode.h"
-
-/*
- * Interact with Ceph monitor cluster. Handle requests for new map
- * versions, and periodically resend as needed. Also implement
- * statfs() and umount().
- *
- * A small cluster of Ceph "monitors" is responsible for managing critical
- * cluster configuration and state information. An odd number (e.g., 3, 5)
- * of cmon daemons use a modified version of the Paxos part-time parliament
- * algorithm to manage the MDS map (mds cluster membership), OSD map, and
- * list of clients who have mounted the file system.
- *
- * We maintain an open, active session with a monitor at all times in order to
- * receive timely MDSMap updates. We periodically send a keepalive byte on the
- * TCP socket to ensure we detect a failure. If the connection does break, we
- * randomly hunt for a new monitor. Once the connection is reestablished, we
- * resend any outstanding requests.
- */
-
-const static struct ceph_connection_operations mon_con_ops;
-
-/*
- * Decode a monmap blob (e.g., during mount).
- *
- * The blob delimited by @p and @end is read as: a 16-bit version, the
- * fsid, a 32-bit epoch, a 32-bit monitor count, and then that many
- * mon_inst records.  The blob must be consumed exactly.
- *
- * Returns a kmalloc'ed monmap on success, ERR_PTR(-ENOMEM) if the
- * allocation fails, or ERR_PTR(-EINVAL) if the blob is malformed.
- */
-struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
-{
-	struct ceph_monmap *m = NULL;
-	int i, err = -EINVAL;
-	struct ceph_fsid fsid;
-	u32 epoch, num_mon;
-	u16 version;
-
-	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
-
-	ceph_decode_16_safe(&p, end, version, bad);
-
-	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
-	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	ceph_decode_32(&p, epoch);
-	ceph_decode_32(&p, num_mon);
-
-	/*
-	 * num_mon comes straight off the wire: bound it before it is
-	 * multiplied into a byte count for the length check and kmalloc.
-	 */
-	if (num_mon >= CEPH_MAX_MON)
-		goto bad;
-	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
-
-	m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
-	if (m == NULL)
-		return ERR_PTR(-ENOMEM);
-	m->fsid = fsid;
-	m->epoch = epoch;
-	m->num_mon = num_mon;
-	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
-
-	/* trailing bytes mean we misparsed the blob */
-	if (p != end)
-		goto bad;
-
-	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
-	     m->num_mon);
-	for (i = 0; i < m->num_mon; i++)
-		dout("monmap_decode mon%d is %s\n", i,
-		     pr_addr(&m->mon_inst[i].addr.in_addr));
-	return m;
-
-bad:
-	dout("monmap_decode failed with %d\n", err);
-	kfree(m);	/* m is NULL unless we failed after allocating */
-	return ERR_PTR(err);
-}
-
-/*
- * Search the monmap for an entry whose address equals *addr.
- * Returns 1 if one is found, 0 otherwise.
- */
-int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
-{
-	int idx = 0;
-
-	while (idx < m->num_mon) {
-		if (ceph_entity_addr_equal(addr, &m->mon_inst[idx].addr))
-			return 1;
-		idx++;
-	}
-	return 0;
-}
-
-/*
- * Close the monitor session, if there is a connection, and forget
- * which monitor we were talking to (cur_mon becomes -1).
- */
-static void __close_session(struct ceph_mon_client *monc)
-{
-	if (!monc->con)
-		return;
-
-	dout("__close_session closing mon%d\n", monc->cur_mon);
-	ceph_con_close(monc->con);
-	monc->cur_mon = -1;
-}
-
-/*
- * Open a session with a (new) monitor.
- *
- * If no monitor is currently selected (cur_mon < 0), pick one at random
- * from the monmap, reset the subscription state so it is treated as
- * expired, and open the connection to it.  If a monitor is already
- * selected this is a no-op.  Always returns 0.
- */
-static int __open_session(struct ceph_mon_client *monc)
-{
-	/*
-	 * Must be unsigned: plain char may be signed, in which case a
-	 * negative random byte makes r % num_mon negative and we would
-	 * index before the start of mon_inst[].
-	 */
-	unsigned char r;
-
-	if (monc->cur_mon < 0) {
-		get_random_bytes(&r, 1);
-		monc->cur_mon = r % monc->monmap->num_mon;
-		dout("open_session num=%d r=%d -> mon%d\n",
-		     monc->monmap->num_mon, r, monc->cur_mon);
-		monc->sub_sent = 0;
-		monc->sub_renew_after = jiffies; /* i.e., expired */
-		/* collapse "want+asked" (2) back to "want" (1) so we re-ask */
-		monc->want_next_osdmap = !!monc->want_next_osdmap;
-
-		dout("open_session mon%d opening\n", monc->cur_mon);
-		monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
-		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
-		ceph_con_open(monc->con,
-			      &monc->monmap->mon_inst[monc->cur_mon].addr);
-	} else {
-		dout("open_session mon%d already open\n", monc->cur_mon);
-	}
-	return 0;
-}
-
-/*
- * True once jiffies has reached sub_renew_after, i.e. the current
- * subscription is due for renewal.
- */
-static bool __sub_expired(struct ceph_mon_client *monc)
-{
- return time_after_eq(jiffies, monc->sub_renew_after);
-}
-
-/*
- * Reschedule the delayed work timer: 10 seconds out while we have no
- * monitor, are still mounting, or the subscription has expired; 20
- * seconds otherwise.
- */
-static void __schedule_delayed(struct ceph_mon_client *monc)
-{
-	unsigned delay = 20 * HZ;
-
-	if (monc->cur_mon < 0 || monc->want_mount || __sub_expired(monc))
-		delay = 10 * HZ;
-
-	dout("__schedule_delayed after %u\n", delay);
-	schedule_delayed_work(&monc->delayed_work, delay);
-}
-
-/*
- * Send subscribe request for mdsmap and/or osdmap.
- *
- * A request is sent when the subscription has expired and none is in
- * flight (sub_sent == 0), or when a new osdmap is wanted but has not
- * been asked for yet (want_next_osdmap == 1).  The mdsmap subscription
- * is always included; the one-time osdmap item only when wanted.
- * Allocation failure is silently ignored.
- */
-static void __send_subscribe(struct ceph_mon_client *monc)
-{
-	struct ceph_msg *msg;
-	struct ceph_mon_subscribe_item *i;
-	void *p, *end;
-
-	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
-	     (unsigned)monc->sub_sent, __sub_expired(monc),
-	     monc->want_next_osdmap);
-	if (!((__sub_expired(monc) && !monc->sub_sent) ||
-	      monc->want_next_osdmap == 1))
-		return;
-
-	msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 64, 0, 0, NULL);
-	/*
-	 * The other ceph_msg_new() callers in this code test the result
-	 * with IS_ERR(); a plain NULL test would let an ERR_PTR through
-	 * and dereference it below.
-	 */
-	if (IS_ERR(msg))
-		return;
-
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
-
-	dout("__send_subscribe to 'mdsmap' %u+\n",
-	     (unsigned)monc->have_mdsmap);
-	if (monc->want_next_osdmap) {
-		dout("__send_subscribe to 'osdmap' %u\n",
-		     (unsigned)monc->have_osdmap);
-		ceph_encode_32(&p, 2);	/* two subscription items */
-		ceph_encode_string(&p, end, "osdmap", 6);
-		i = p;
-		i->have = cpu_to_le64(monc->have_osdmap);
-		i->onetime = 1;
-		p += sizeof(*i);
-		monc->want_next_osdmap = 2;  /* requested */
-	} else {
-		ceph_encode_32(&p, 1);	/* mdsmap only */
-	}
-	ceph_encode_string(&p, end, "mdsmap", 6);
-	i = p;
-	i->have = cpu_to_le64(monc->have_mdsmap);
-	i->onetime = 0;
-	p += sizeof(*i);
-
-	/* trim the front to what we actually encoded */
-	msg->front.iov_len = p - msg->front.iov_base;
-	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-	ceph_con_send(monc->con, msg);
-
-	monc->sub_sent = jiffies | 1;  /* never 0 */
-}
-
-/*
- * Handle a subscribe ack: the payload is a 32-bit lifetime in seconds.
- * Ends any hunt in progress, computes the next renewal time relative
- * to when the request was sent, and marks no subscribe as in flight.
- */
-static void handle_subscribe_ack(struct ceph_mon_client *monc,
-				 struct ceph_msg *msg)
-{
-	void *p = msg->front.iov_base;
-	void *end = p + msg->front.iov_len;
-	unsigned secs;
-
-	ceph_decode_32_safe(&p, end, secs, corrupt);
-
-	mutex_lock(&monc->mutex);
-	if (monc->hunting) {
-		pr_info("mon%d %s session established\n",
-			monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
-		monc->hunting = false;
-	}
-	dout("handle_subscribe_ack after %d seconds\n", secs);
-	monc->sub_renew_after = monc->sub_sent + secs*HZ - 1;
-	monc->sub_sent = 0;
-	mutex_unlock(&monc->mutex);
-	return;
-
-corrupt:
-	pr_err("got corrupt subscribe-ack msg\n");
-}
-
-/*
- * Keep track of which maps we have
- *
- * Record, under monc->mutex, that the mdsmap we now hold is @got; this
- * value is what __send_subscribe() reports as 'have' for the mdsmap.
- * Always returns 0.
- */
-int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
-{
- mutex_lock(&monc->mutex);
- monc->have_mdsmap = got;
- mutex_unlock(&monc->mutex);
- return 0;
-}
-
-/*
- * Record, under monc->mutex, that the osdmap we now hold is @got and
- * clear the pending "want next osdmap" state. Always returns 0.
- */
-int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
-{
- mutex_lock(&monc->mutex);
- monc->have_osdmap = got;
- monc->want_next_osdmap = 0;
- mutex_unlock(&monc->mutex);
- return 0;
-}
-
-/*
- * Register interest in the next osdmap
- *
- * want_next_osdmap moves 0 -> 1 ("want"); __send_subscribe() is only
- * called while the value is below 2 ("want+asked"), so an outstanding
- * request is not repeated here.
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
-{
- /* NOTE(review): have_osdmap is read here before the mutex is taken */
- dout("request_next_osdmap have %u\n", monc->have_osdmap);
- mutex_lock(&monc->mutex);
- if (!monc->want_next_osdmap)
- monc->want_next_osdmap = 1;
- if (monc->want_next_osdmap < 2)
- __send_subscribe(monc);
- mutex_unlock(&monc->mutex);
-}
-
-
-/*
- * mount
- *
- * Make sure a monitor session is open, then send a CLIENT_MOUNT
- * message.  Failures are not reported; delayed_work() retries while
- * want_mount is set.
- */
-static void __request_mount(struct ceph_mon_client *monc)
-{
-	struct ceph_client_mount *mount_hdr;
-	struct ceph_msg *msg;
-
-	dout("__request_mount\n");
-	if (__open_session(monc))
-		return;
-
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_MOUNT, sizeof(*mount_hdr),
-			   0, 0, NULL);
-	if (IS_ERR(msg))
-		return;
-
-	mount_hdr = msg->front.iov_base;
-	mount_hdr->have_version = 0;
-	ceph_con_send(monc->con, msg);
-}
-
-/*
- * Start the mount handshake: allocate and initialise the monitor
- * connection on first use, send the mount request and arm the delayed
- * work that retries it. Returns 0, or -ENOMEM if the connection
- * cannot be allocated.
- */
-int ceph_monc_request_mount(struct ceph_mon_client *monc)
-{
- /* NOTE(review): monc->con is tested and set before the mutex is taken */
- if (!monc->con) {
- monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
- if (!monc->con)
- return -ENOMEM;
- ceph_con_init(monc->client->msgr, monc->con);
- monc->con->private = monc;
- monc->con->ops = &mon_con_ops;
- }
-
- mutex_lock(&monc->mutex);
- __request_mount(monc);
- __schedule_delayed(monc);
- mutex_unlock(&monc->mutex);
- return 0;
-}
-
-/*
- * The monitor responds with a mount ack to indicate whether the mount
- * succeeded.
- *
- * Payload: 64-bit client id, 32-bit result, a length-prefixed result
- * string, then a length-prefixed monmap blob.  On success the new
- * monmap replaces the old one, the client identity is recorded and we
- * subscribe to map updates.  In every case past the "already mounted"
- * check the outcome is stored in client->mount_err and waiters on
- * client->mount_wq are woken.
- */
-static void handle_mount_ack(struct ceph_mon_client *monc, struct ceph_msg *msg)
-{
-	struct ceph_client *client = monc->client;
-	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
-	void *p, *end;
-	s32 result;
-	u32 len;
-	s64 cnum;
-	int err = -EINVAL;
-
-	if (client->whoami >= 0) {
-		dout("handle_mount_ack - already mounted\n");
-		return;
-	}
-
-	mutex_lock(&monc->mutex);
-
-	dout("handle_mount_ack\n");
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
-
-	ceph_decode_64_safe(&p, end, cnum, bad);
-	ceph_decode_32_safe(&p, end, result, bad);
-	ceph_decode_32_safe(&p, end, len, bad);
-	/*
-	 * len is untrusted: make sure the result string really lies
-	 * inside the message before printing it or skipping over it.
-	 */
-	ceph_decode_need(&p, end, len, bad);
-	if (result) {
-		pr_err("mount denied: %.*s (%d)\n", len, (char *)p,
-		       result);
-		err = result;
-		goto out;
-	}
-	p += len;
-
-	ceph_decode_32_safe(&p, end, len, bad);
-	ceph_decode_need(&p, end, len, bad);
-	monmap = ceph_monmap_decode(p, p + len);
-	if (IS_ERR(monmap)) {
-		pr_err("problem decoding monmap, %d\n",
-		       (int)PTR_ERR(monmap));
-		err = -EINVAL;
-		goto out;
-	}
-	p += len;
-
-	/* swap in the new monmap and drop the one we mounted with */
-	client->monc.monmap = monmap;
-	kfree(old);
-
-	client->signed_ticket = NULL;
-	client->signed_ticket_len = 0;
-
-	monc->want_mount = false;
-
-	client->whoami = cnum;
-	client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
-	client->msgr->inst.name.num = cpu_to_le64(cnum);
-	pr_info("client%lld fsid " FSID_FORMAT "\n",
-		client->whoami, PR_FSID(&client->monc.monmap->fsid));
-
-	ceph_debugfs_client_init(client);
-	__send_subscribe(monc);
-
-	err = 0;
-	goto out;
-
-bad:
-	pr_err("error decoding mount_ack message\n");
-out:
-	client->mount_err = err;
-	mutex_unlock(&monc->mutex);
-	wake_up(&client->mount_wq);
-}
-
-
-
-
-/*
- * statfs
- *
- * Match a statfs reply to its pending request by tid, copy the result
- * into the caller's buffer and wake the waiter.  Replies with no
- * matching request are dropped.
- */
-static void handle_statfs_reply(struct ceph_mon_client *monc,
-				struct ceph_msg *msg)
-{
-	struct ceph_mon_statfs_request *req;
-	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
-	u64 tid;
-
-	if (msg->front.iov_len != sizeof(*reply))
-		goto bad;
-	tid = le64_to_cpu(reply->tid);
-	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
-
-	mutex_lock(&monc->mutex);
-	req = radix_tree_lookup(&monc->statfs_request_tree, tid);
-	if (req) {
-		*req->buf = reply->st;
-		req->result = 0;
-		/*
-		 * The request lives on the stack of ceph_monc_do_statfs(),
-		 * whose wait is interruptible.  Complete it while still
-		 * holding the mutex so the waiter cannot unregister and
-		 * return (invalidating req) before we are done with it.
-		 */
-		complete(&req->completion);
-	}
-	mutex_unlock(&monc->mutex);
-	return;
-
-bad:
-	pr_err("corrupt statfs reply, no tid\n");
-}
-
-/*
- * (re)send a statfs request
- *
- * Opens a monitor session if needed, builds a STATFS message carrying
- * the request tid and the monmap fsid, remembers it in req->request
- * and queues it on the monitor connection.  Returns 0 or a negative
- * errno.
- */
-static int send_statfs(struct ceph_mon_client *monc,
-		       struct ceph_mon_statfs_request *req)
-{
-	struct ceph_mon_statfs *body;
-	struct ceph_msg *msg;
-	int err;
-
-	dout("send_statfs tid %llu\n", req->tid);
-
-	err = __open_session(monc);
-	if (err)
-		return err;
-
-	msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*body), 0, 0, NULL);
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
-	req->request = msg;
-
-	body = msg->front.iov_base;
-	body->have_version = 0;
-	body->fsid = monc->monmap->fsid;
-	body->tid = cpu_to_le64(req->tid);
-
-	ceph_con_send(monc->con, msg);
-	return 0;
-}
-
-/*
- * Do a synchronous statfs().
- *
- * Reserves a reply message, registers an on-stack request under a
- * fresh tid, sends it and waits (interruptibly) for the reply handler
- * to fill @buf.  Returns 0 on success, a negative errno on failure, or
- * the error from an interrupted wait.
- */
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
-{
-	struct ceph_mon_statfs_request req;
-	int err;
-
-	req.buf = buf;
-	init_completion(&req.completion);
-
-	/* allocate memory for reply */
-	err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
-	if (err)
-		return err;
-
-	/* register request */
-	mutex_lock(&monc->mutex);
-	req.tid = ++monc->last_tid;
-	req.last_attempt = jiffies;
-	req.delay = BASE_DELAY_INTERVAL;
-	if (radix_tree_insert(&monc->statfs_request_tree, req.tid, &req) < 0) {
-		mutex_unlock(&monc->mutex);
-		/* give back the reply slot reserved above */
-		ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
-		pr_err("ENOMEM in do_statfs\n");
-		return -ENOMEM;
-	}
-	monc->num_statfs_requests++;
-	mutex_unlock(&monc->mutex);
-
-	/* send request and wait */
-	err = send_statfs(monc, &req);
-	if (!err)
-		err = wait_for_completion_interruptible(&req.completion);
-
-	/* unregister before req goes out of scope */
-	mutex_lock(&monc->mutex);
-	radix_tree_delete(&monc->statfs_request_tree, req.tid);
-	monc->num_statfs_requests--;
-	ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
-	mutex_unlock(&monc->mutex);
-
-	if (!err)
-		err = req.result;
-	return err;
-}
-
-/*
- * Resend pending statfs requests.
- *
- * Walks the request tree in tid order, one entry per lookup, and
- * re-sends each request.  send_statfs() errors are ignored.
- */
-static void __resend_statfs(struct ceph_mon_client *monc)
-{
-	u64 next_tid = 0;
-	struct ceph_mon_statfs_request *req;
-
-	while (radix_tree_gang_lookup(&monc->statfs_request_tree,
-				      (void **)&req, next_tid, 1) != 0) {
-		next_tid = req->tid + 1;
-		send_statfs(monc, req);
-	}
-}
-
-/*
- * Periodic work.  While still mounting, retry the mount request.
- * Once mounted, either restart the session (subscription expired, so
- * keep hunting for a monitor) or send a keepalive.  Then renew/retry
- * the subscription as needed and re-arm the timer.
- */
-static void delayed_work(struct work_struct *work)
-{
-	struct ceph_mon_client *monc =
-		container_of(work, struct ceph_mon_client, delayed_work.work);
-
-	dout("monc delayed_work\n");
-	mutex_lock(&monc->mutex);
-
-	if (monc->want_mount) {
-		__request_mount(monc);
-	} else if (__sub_expired(monc)) {
-		__close_session(monc);
-		__open_session(monc);  /* continue hunting */
-	} else {
-		ceph_con_keepalive(monc->con);
-	}
-
-	__send_subscribe(monc);
-	__schedule_delayed(monc);
-
-	mutex_unlock(&monc->mutex);
-}
-
-/*
- * Initialise a monitor client for @cl: zero the structure, set up the
- * mutex, the three reply message pools, the delayed work and the
- * statfs request tree, and mark a mount and an initial osdmap as
- * wanted.
- *
- * Returns 0 on success or a negative errno.  On failure every message
- * pool that was set up is torn down again.
- */
-int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
-{
-	int err;
-
-	dout("init\n");
-	memset(monc, 0, sizeof(*monc));
-	monc->client = cl;
-	monc->monmap = NULL;
-	mutex_init(&monc->mutex);
-
-	monc->con = NULL;
-
-	/* msg pools */
-	err = ceph_msgpool_init(&monc->msgpool_mount_ack, 4096, 1, false);
-	if (err < 0)
-		goto out_mount_ack;
-	err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 8, 1, false);
-	if (err < 0)
-		goto out_subscribe_ack;
-	err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
-				sizeof(struct ceph_mon_statfs_reply), 0, false);
-	if (err < 0)
-		goto out_statfs_reply;
-
-	monc->cur_mon = -1;
-	monc->hunting = false;  /* not really */
-	monc->sub_renew_after = jiffies;
-	monc->sub_sent = 0;
-
-	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
-	INIT_RADIX_TREE(&monc->statfs_request_tree, GFP_NOFS);
-	monc->num_statfs_requests = 0;
-	monc->last_tid = 0;
-
-	monc->have_mdsmap = 0;
-	monc->have_osdmap = 0;
-	monc->want_next_osdmap = 1;
-	monc->want_mount = true;
-	return 0;
-
-	/*
-	 * A pool whose init failed may still hold some preallocated
-	 * messages, so it is destroyed along with the earlier ones.
-	 */
-out_statfs_reply:
-	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
-out_subscribe_ack:
-	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
-out_mount_ack:
-	ceph_msgpool_destroy(&monc->msgpool_mount_ack);
-	return err;
-}
-
-/*
- * Shut the monitor client down: cancel the periodic work, close the
- * session, detach and drop our reference on the connection, then
- * release the message pools and the monmap.
- */
-void ceph_monc_stop(struct ceph_mon_client *monc)
-{
- dout("stop\n");
- cancel_delayed_work_sync(&monc->delayed_work);
-
- mutex_lock(&monc->mutex);
- __close_session(monc);
- if (monc->con) {
- /* dispatch() and mon_fault() bail out once private is NULL */
- monc->con->private = NULL;
- monc->con->ops->put(monc->con);
- monc->con = NULL;
- }
- mutex_unlock(&monc->mutex);
-
- ceph_msgpool_destroy(&monc->msgpool_mount_ack);
- ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
- ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
-
- kfree(monc->monmap);
-}
-
-
-/*
- * handle incoming message
- *
- * Routes the message to its handler by type and then drops the
- * reference on it.  If the monitor client has already detached from
- * the connection (con->private is NULL) the message is dropped
- * unhandled, but its reference is still released.
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	struct ceph_mon_client *monc = con->private;
-	int type = le16_to_cpu(msg->hdr.type);
-
-	if (!monc)
-		goto out;	/* every other path puts msg; do so here too */
-
-	switch (type) {
-	case CEPH_MSG_CLIENT_MOUNT_ACK:
-		handle_mount_ack(monc, msg);
-		break;
-
-	case CEPH_MSG_MON_SUBSCRIBE_ACK:
-		handle_subscribe_ack(monc, msg);
-		break;
-
-	case CEPH_MSG_STATFS_REPLY:
-		handle_statfs_reply(monc, msg);
-		break;
-
-	case CEPH_MSG_MDS_MAP:
-		ceph_mdsc_handle_map(&monc->client->mdsc, msg);
-		break;
-
-	case CEPH_MSG_OSD_MAP:
-		ceph_osdc_handle_map(&monc->client->osdc, msg);
-		break;
-
-	default:
-		pr_err("received unknown message type %d %s\n", type,
-		       ceph_msg_type_name(type));
-	}
-out:
-	ceph_msg_put(msg);
-}
-
-/*
- * Allocate memory for incoming message
- *
- * Mount acks, subscribe acks and statfs replies come out of their
- * dedicated preallocated pools; every other type is handed to the
- * generic ceph_alloc_msg().
- *
- * NOTE(review): unlike dispatch() and mon_fault(), con->private is not
- * checked for NULL here -- confirm this cannot run after
- * ceph_monc_stop() has detached the connection.
- */
-static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
-				      struct ceph_msg_header *hdr)
-{
-	struct ceph_mon_client *monc = con->private;
-	struct ceph_msgpool *pool;
-
-	switch (le16_to_cpu(hdr->type)) {
-	case CEPH_MSG_CLIENT_MOUNT_ACK:
-		pool = &monc->msgpool_mount_ack;
-		break;
-	case CEPH_MSG_MON_SUBSCRIBE_ACK:
-		pool = &monc->msgpool_subscribe_ack;
-		break;
-	case CEPH_MSG_STATFS_REPLY:
-		pool = &monc->msgpool_statfs_reply;
-		break;
-	default:
-		return ceph_alloc_msg(con, hdr);
-	}
-	return ceph_msgpool_get(pool);
-}
-
-/*
- * If the monitor connection resets, pick a new monitor and resubmit
- * any pending requests.
- *
- * On the first fault we close the session, enter "hunting" mode, open
- * a session with a randomly chosen monitor and resend the subscription
- * and pending statfs requests. If we were already hunting, we only
- * re-arm the delayed work, which continues the hunt later.
- */
-static void mon_fault(struct ceph_connection *con)
-{
- struct ceph_mon_client *monc = con->private;
-
- if (!monc)
- return;
-
- dout("mon_fault\n");
- mutex_lock(&monc->mutex);
- /* re-check under the mutex: ceph_monc_stop() clears private while holding it */
- if (!con->private)
- goto out;
-
- if (monc->con && !monc->hunting)
- pr_info("mon%d %s session lost, "
- "hunting for new mon\n", monc->cur_mon,
- pr_addr(&monc->con->peer_addr.in_addr));
-
- __close_session(monc);
- if (!monc->hunting) {
- /* start hunting */
- monc->hunting = true;
- if (__open_session(monc) == 0) {
- __send_subscribe(monc);
- __resend_statfs(monc);
- }
- } else {
- /* already hunting, let's wait a bit */
- __schedule_delayed(monc);
- }
-out:
- mutex_unlock(&monc->mutex);
-}
-
-/*
- * Connection callbacks for the monitor session.  Reference counting
- * and middle-section allocation use the generic messenger helpers;
- * dispatch, fault handling and message allocation are monitor-specific.
- */
-static const struct ceph_connection_operations mon_con_ops = {
-	.get = ceph_con_get,
-	.put = ceph_con_put,
-	.dispatch = dispatch,
-	.fault = mon_fault,
-	.alloc_msg = mon_alloc_msg,
-	.alloc_middle = ceph_alloc_middle,
-};
+++ /dev/null
-#ifndef _FS_CEPH_MON_CLIENT_H
-#define _FS_CEPH_MON_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/radix-tree.h>
-
-#include "messenger.h"
-#include "msgpool.h"
-
-struct ceph_client;
-struct ceph_mount_args;
-
-/*
- * The monitor map enumerates the set of all monitors.
- *
- * Allocated as a single block with num_mon trailing mon_inst entries.
- */
-struct ceph_monmap {
-	struct ceph_fsid fsid;		/* cluster fsid */
-	u32 epoch;			/* map epoch */
-	u32 num_mon;			/* number of entries in mon_inst[] */
-	struct ceph_entity_inst mon_inst[];	/* C99 flexible array member */
-};
-
-struct ceph_mon_client;
-struct ceph_mon_statfs_request;
-
-
-/*
- * Generic mechanism for resending monitor requests.
- *
- * NOTE(review): nothing in mon_client.c as shown uses this callback type
- * or struct ceph_mon_request; they may be leftovers -- confirm before
- * relying on them.
- */
-typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
- int newmon);
-
-/* a pending monitor request */
-struct ceph_mon_request {
- struct ceph_mon_client *monc;
- struct delayed_work delayed_work;
- unsigned long delay;
- ceph_monc_request_func_t do_request;
-};
-
-/*
- * statfs() is done a bit differently because we need to get data back
- * to the caller
- *
- * One of these lives on the stack of ceph_monc_do_statfs() and is
- * registered in the monitor client's statfs_request_tree, keyed by tid,
- * for the duration of the call.
- */
-struct ceph_mon_statfs_request {
- u64 tid; /* key in statfs_request_tree */
- int result; /* set to 0 by the reply handler */
- struct ceph_statfs *buf; /* caller's buffer, filled from the reply */
- struct completion completion; /* completed when the reply arrives */
- unsigned long last_attempt, delay; /* jiffies */
- struct ceph_msg *request; /* original request */
-};
-
-/*
- * Per-client monitor session state. The fields below the mutex are
- * modified under it by the code in mon_client.c.
- */
-struct ceph_mon_client {
- struct ceph_client *client;
- struct ceph_monmap *monmap;
-
- struct mutex mutex;
- struct delayed_work delayed_work; /* periodic retry/renew/keepalive */
-
- bool hunting; /* set on connection fault, cleared on subscribe ack */
- int cur_mon; /* last monitor i contacted */
- /*
- * sub_sent: jiffies|1 when a subscribe was sent, 0 once it is acked.
- * sub_renew_after: jiffies value from which the subscription counts
- * as expired.
- */
- unsigned long sub_sent, sub_renew_after;
- struct ceph_connection *con;
-
- /* msg pools */
- struct ceph_msgpool msgpool_mount_ack;
- struct ceph_msgpool msgpool_subscribe_ack;
- struct ceph_msgpool msgpool_statfs_reply;
-
- /* pending statfs requests */
- struct radix_tree_root statfs_request_tree;
- int num_statfs_requests;
- u64 last_tid;
-
- /* mds/osd map or mount requests */
- bool want_mount; /* true until a mount ack succeeds */
- int want_next_osdmap; /* 1 = want, 2 = want+asked */
- u32 have_osdmap, have_mdsmap; /* 'have' values sent when subscribing */
-
- struct dentry *debugfs_file;
-};
-
-extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
-extern int ceph_monmap_contains(struct ceph_monmap *m,
- struct ceph_entity_addr *addr);
-
-extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
-extern void ceph_monc_stop(struct ceph_mon_client *monc);
-
-/*
- * The model here is to indicate that we need a newer map, and also to
- * call in with the epoch we now hold (@have) whenever we receive a map.
- * We will periodically re-request the map from the monitor cluster until
- * we get what we want.
- */
-extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
-extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
-
-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
-
-extern int ceph_monc_request_mount(struct ceph_mon_client *monc);
-
-extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
- struct ceph_statfs *buf);
-
-
-
-#endif
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/vmalloc.h>
-
-#include "msgpool.h"
-
-/*
- * We use msg pools to preallocate memory for messages we expect to
- * receive over the wire, to avoid getting ourselves into OOM
- * conditions at unexpected times.  We use a few different
- * strategies:
- *
- * - for request/response type interactions, we preallocate the
- * memory needed for the response when we generate the request.
- *
- * - for messages we can receive at any time from the MDS, we preallocate
- * a pool of messages we can re-use.
- *
- * - for writeback, we preallocate some number of messages to use for
- * requests and their replies, so that we always make forward
- * progress.
- *
- * The msgpool behaves like a mempool_t, but keeps preallocated
- * ceph_msgs strung together on a list_head instead of using a pointer
- * vector. This avoids vector reallocation when we adjust the number
- * of preallocated items (which happens frequently).
- */
-
-
-/*
- * Allocate or release as necessary to meet our target pool size.
- *
- * Called with pool->lock held (see ceph_msgpool_init/_destroy/_resv).
- * The lock is dropped around each allocation and retaken afterwards, so
- * pool->num and pool->min are re-read on every iteration. Returns 0, or
- * the allocation error with the lock still held.
- */
-static int __fill_msgpool(struct ceph_msgpool *pool)
-{
- struct ceph_msg *msg;
-
- /* grow: allocate until we hold at least 'min' messages */
- while (pool->num < pool->min) {
- dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
- pool->min);
- spin_unlock(&pool->lock);
- msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
- spin_lock(&pool->lock);
- if (IS_ERR(msg))
- return PTR_ERR(msg);
- msg->pool = pool;
- list_add(&msg->list_head, &pool->msgs);
- pool->num++;
- }
- /* shrink: free surplus messages beyond 'min' */
- while (pool->num > pool->min) {
- msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
- dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
- pool->min, msg);
- list_del_init(&msg->list_head);
- pool->num--;
- ceph_msg_kfree(msg);
- }
- return 0;
-}
-
-/*
- * Set up @pool to hold messages with a @front_len byte front section
- * and preallocate @min of them.  @blocking selects whether
- * ceph_msgpool_get() waits or fails when the pool is empty.
- * Returns 0 or the error from the preallocation.
- */
-int ceph_msgpool_init(struct ceph_msgpool *pool,
-		      int front_len, int min, bool blocking)
-{
-	int err;
-
-	dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
-
-	spin_lock_init(&pool->lock);
-	INIT_LIST_HEAD(&pool->msgs);
-	init_waitqueue_head(&pool->wait);
-	pool->front_len = front_len;
-	pool->blocking = blocking;
-	pool->min = min;
-	pool->num = 0;
-
-	spin_lock(&pool->lock);
-	err = __fill_msgpool(pool);
-	spin_unlock(&pool->lock);
-
-	return err;
-}
-
-/*
- * Release every message held in the pool by dropping the target size
- * to zero and letting __fill_msgpool() free the surplus.
- */
-void ceph_msgpool_destroy(struct ceph_msgpool *pool)
-{
- dout("msgpool_destroy %p\n", pool);
- spin_lock(&pool->lock);
- pool->min = 0;
- __fill_msgpool(pool);
- spin_unlock(&pool->lock);
-}
-
-/*
- * Adjust the pool's target size by @delta (may be negative) and
- * allocate or free messages to match. Returns 0 or the allocation
- * error; note that pool->min keeps the adjusted value even when the
- * allocation fails.
- */
-int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
-{
- int ret;
-
- spin_lock(&pool->lock);
- dout("msgpool_resv %p delta %d\n", pool, delta);
- pool->min += delta;
- ret = __fill_msgpool(pool);
- spin_unlock(&pool->lock);
- return ret;
-}
-
-/*
- * Get a message from the pool.
- *
- * Blocking pools behave like a mempool_t: try a fresh allocation
- * first, then fall back to the preallocated messages, sleeping until
- * one is returned if the pool is empty.  Non-blocking pools take a
- * preallocated message if there is one; otherwise they warn, try one
- * allocation and return ERR_PTR(-ENOMEM) if that fails too.
- */
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool)
-{
-	wait_queue_t wait;
-	struct ceph_msg *msg;
-
-	if (pool->blocking) {
-		/* mempool_t behavior; first try to alloc */
-		msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
-		if (!IS_ERR(msg))
-			return msg;
-	}
-
-	init_wait(&wait);
-	while (1) {
-		/*
-		 * Queue ourselves *before* looking at the pool.  If we
-		 * checked first, a ceph_msgpool_put() slipping in between
-		 * the check and prepare_to_wait() would issue its wake_up
-		 * to an empty queue and we would sleep uninterruptibly
-		 * with a message sitting in the pool.
-		 */
-		if (pool->blocking)
-			prepare_to_wait(&pool->wait, &wait,
-					TASK_UNINTERRUPTIBLE);
-
-		spin_lock(&pool->lock);
-		if (likely(pool->num)) {
-			msg = list_entry(pool->msgs.next, struct ceph_msg,
-					 list_head);
-			list_del_init(&msg->list_head);
-			pool->num--;
-			dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
-			     pool->num, pool->min);
-			spin_unlock(&pool->lock);
-			if (pool->blocking)
-				finish_wait(&pool->wait, &wait);
-			return msg;
-		}
-		pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
-		       pool->min, pool->blocking ? "waiting" : "failing");
-		spin_unlock(&pool->lock);
-
-		if (!pool->blocking) {
-			WARN_ON(1);
-
-			/* maybe we can allocate it now? */
-			msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
-			if (!IS_ERR(msg))
-				return msg;
-
-			return ERR_PTR(-ENOMEM);
-		}
-
-		schedule();
-		finish_wait(&pool->wait, &wait);
-	}
-}
-
-/*
- * Hand @msg back to @pool.  If the pool is already at its target size
- * the message is freed; otherwise it is put back on the list with a
- * single reference and any waiter is woken.
- */
-void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
-{
-	spin_lock(&pool->lock);
-
-	if (pool->num >= pool->min) {
-		dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
-		     pool->num, pool->min);
-		spin_unlock(&pool->lock);
-		ceph_msg_kfree(msg);
-		return;
-	}
-
-	ceph_msg_get(msg);   /* retake a single ref */
-	list_add(&msg->list_head, &pool->msgs);
-	pool->num++;
-	dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
-	     pool->num, pool->min);
-	spin_unlock(&pool->lock);
-	wake_up(&pool->wait);
-}
+++ /dev/null
-#ifndef _FS_CEPH_MSGPOOL
-#define _FS_CEPH_MSGPOOL
-
-#include "messenger.h"
-
-/*
- * we use memory pools for preallocating messages we may receive, to
- * avoid unexpected OOM conditions.
- */
-struct ceph_msgpool {
- spinlock_t lock; /* protects msgs, num and min */
- int front_len; /* preallocated payload size */
- struct list_head msgs; /* msgs in the pool; each has 1 ref */
- int num, min; /* cur, min # msgs in the pool */
- bool blocking; /* ceph_msgpool_get() sleeps instead of failing when empty */
- wait_queue_head_t wait; /* woken when a msg is returned to the pool */
-};
-
-extern int ceph_msgpool_init(struct ceph_msgpool *pool,
- int front_len, int size, bool blocking);
-extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
-extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *);
-extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
-
-#endif
+++ /dev/null
-../include/msgr.h
\ No newline at end of file
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-#include "super.h"
-#include "osd_client.h"
-#include "messenger.h"
-#include "decode.h"
-
-const static struct ceph_connection_operations osd_con_ops;
-
-static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-
-/*
- * Implement client access to distributed object storage cluster.
- *
- * All data objects are stored within a cluster/cloud of OSDs, or
- * "object storage devices." (Note that Ceph OSDs have _nothing_ to
- * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
- * remote daemons serving up and coordinating consistent and safe
- * access to storage.
- *
- * Cluster membership and the mapping of data objects onto storage devices
- * are described by the osd map.
- *
- * We keep track of pending OSD requests (read, write), resubmit
- * requests to different OSDs when the cluster topology/data layout
- * change, or retry the affected requests when the communications
- * channel with an OSD is reset.
- */
-
-/*
- * Map a file extent onto a single object and fill in the request to
- * match.  *plen is shortened if the extent would cross an object
- * boundary.
- *
- * Sets the snapid in the request head, the object name and its length
- * in @req, the object-relative offset/length in the first osd op, and
- * the number of pages spanned by the (possibly shortened) file extent.
- */
-static void calc_layout(struct ceph_osd_client *osdc,
-			struct ceph_vino vino, struct ceph_file_layout *layout,
-			u64 off, u64 *plen,
-			struct ceph_osd_request *req)
-{
-	struct ceph_osd_request_head *head = req->r_request->front.iov_base;
-	struct ceph_osd_op *first_op = (void *)(head + 1);
-	u64 requested = *plen;
-	u64 obj_off, obj_len;	/* extent in object */
-	u64 objno;
-
-	head->snapid = cpu_to_le64(vino.snap);
-
-	/* object extent? */
-	ceph_calc_file_object_mapping(layout, off, plen, &objno,
-				      &obj_off, &obj_len);
-	if (*plen < requested)
-		dout(" skipping last %llu, final file extent %llu~%llu\n",
-		     requested - *plen, off, *plen);
-
-	/* sprintf() returns the number of characters written, i.e. strlen */
-	req->r_oid_len = sprintf(req->r_oid, "%llx.%08llx", vino.ino, objno);
-
-	first_op->extent.offset = cpu_to_le64(obj_off);
-	first_op->extent.length = cpu_to_le64(obj_len);
-	req->r_num_pages = calc_pages_for(off, *plen);
-
-	dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
-	     req->r_oid, req->r_oid_len, obj_off, obj_len, req->r_num_pages);
-}
-
-
-/*
- * requests
- *
- * Drop a reference on @req.  When the last one goes away, release the
- * request and reply messages, the page vector if the request owns it,
- * and the snap context, then free the request back to wherever it was
- * allocated from (mempool or slab).
- */
-void ceph_osdc_put_request(struct ceph_osd_request *req)
-{
-	dout("osdc put_request %p %d -> %d\n", req, atomic_read(&req->r_ref),
-	     atomic_read(&req->r_ref)-1);
-	BUG_ON(atomic_read(&req->r_ref) <= 0);
-
-	if (!atomic_dec_and_test(&req->r_ref))
-		return;
-
-	if (req->r_request)
-		ceph_msg_put(req->r_request);
-	if (req->r_reply)
-		ceph_msg_put(req->r_reply);
-	if (req->r_own_pages)
-		ceph_release_page_vector(req->r_pages, req->r_num_pages);
-	ceph_put_snap_context(req->r_snapc);
-
-	if (req->r_mempool)
-		mempool_free(req, req->r_osdc->req_mempool);
-	else
-		kfree(req);
-}
-
-/*
- * build new request AND message, calculate layout, and adjust file
- * extent as needed.
- *
- * if the file was recently truncated, we include information about its
- * old and new size so that the object can be updated appropriately.  (we
- * avoid synchronously deleting truncated objects because it's slow.)
- *
- * if @do_sync, include a 'startsync' command so that the osd will flush
- * data quickly.
- *
- * @num_reply reply messages are reserved in the op_reply pool.  With
- * @use_mempool the request and its message come from the preallocated
- * pools instead of the general allocator.
- *
- * Returns the new request holding one reference, or an ERR_PTR.  *plen
- * may have been shortened by the layout calculation.
- */
-struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
-					       struct ceph_file_layout *layout,
-					       struct ceph_vino vino,
-					       u64 off, u64 *plen,
-					       int opcode, int flags,
-					       struct ceph_snap_context *snapc,
-					       int do_sync,
-					       u32 truncate_seq,
-					       u64 truncate_size,
-					       struct timespec *mtime,
-					       bool use_mempool, int num_reply)
-{
-	struct ceph_osd_request *req;
-	struct ceph_msg *msg;
-	struct ceph_osd_request_head *head;
-	struct ceph_osd_op *op;
-	void *p;
-	int do_trunc = truncate_seq && (off + *plen > truncate_size);
-	int num_op = 1 + do_sync + do_trunc;
-	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
-	int err, i;
-	u64 prevofs;
-
-	if (use_mempool) {
-		req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
-		/* check before touching the memory, not after */
-		if (req)
-			memset(req, 0, sizeof(*req));
-	} else {
-		req = kzalloc(sizeof(*req), GFP_NOFS);
-	}
-	if (req == NULL)
-		return ERR_PTR(-ENOMEM);
-
-	/*
-	 * Initialise everything ceph_osdc_put_request() depends on
-	 * (refcount, owning client, allocation source) before the first
-	 * point of failure.  Putting a request whose r_ref is still 0
-	 * would trip the BUG_ON there, and r_mempool/r_osdc decide how
-	 * the memory is freed.
-	 */
-	req->r_osdc = osdc;
-	req->r_mempool = use_mempool;
-	atomic_set(&req->r_ref, 1);
-	init_completion(&req->r_completion);
-	init_completion(&req->r_safe_completion);
-	INIT_LIST_HEAD(&req->r_unsafe_item);
-	req->r_flags = flags;
-
-	err = ceph_msgpool_resv(&osdc->msgpool_op_reply, num_reply);
-	if (err) {
-		ceph_osdc_put_request(req);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
-
-	/* create message; allow space for oid */
-	msg_size += 40;
-	if (snapc)
-		msg_size += sizeof(u64) * snapc->num_snaps;
-	if (use_mempool)
-		msg = ceph_msgpool_get(&osdc->msgpool_op);
-	else
-		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
-	if (IS_ERR(msg)) {
-		/* undo the reply reservation (negative delta) */
-		ceph_msgpool_resv(&osdc->msgpool_op_reply, -num_reply);
-		ceph_osdc_put_request(req);
-		return ERR_PTR(PTR_ERR(msg));
-	}
-	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
-	memset(msg->front.iov_base, 0, msg->front.iov_len);
-	/* front layout: head, num_op ops, oid, then snap ids */
-	head = msg->front.iov_base;
-	op = (void *)(head + 1);
-	p = (void *)(op + num_op);
-
-	req->r_request = msg;
-	req->r_snapc = ceph_get_snap_context(snapc);
-
-	head->client_inc = cpu_to_le32(1); /* always, for now. */
-	head->flags = cpu_to_le32(flags);
-	if (flags & CEPH_OSD_FLAG_WRITE)
-		ceph_encode_timespec(&head->mtime, mtime);
-	head->num_ops = cpu_to_le16(num_op);
-	op->op = cpu_to_le16(opcode);
-
-	/* calculate max write size */
-	calc_layout(osdc, vino, layout, off, plen, req);
-	req->r_file_layout = *layout;  /* keep a copy */
-
-	if (flags & CEPH_OSD_FLAG_WRITE) {
-		req->r_request->hdr.data_off = cpu_to_le16(off);
-		req->r_request->hdr.data_len = cpu_to_le32(*plen);
-		op->payload_len = cpu_to_le32(*plen);
-	}
-
-	/* fill in oid */
-	head->object_len = cpu_to_le32(req->r_oid_len);
-	memcpy(p, req->r_oid, req->r_oid_len);
-	p += req->r_oid_len;
-
-	/* additional ops */
-	if (do_trunc) {
-		op++;
-		op->op = cpu_to_le16(opcode == CEPH_OSD_OP_READ ?
-			     CEPH_OSD_OP_MASKTRUNC : CEPH_OSD_OP_SETTRUNC);
-		op->trunc.truncate_seq = cpu_to_le32(truncate_seq);
-		/* express truncate_size relative to the object, not the file */
-		prevofs = le64_to_cpu((op-1)->extent.offset);
-		op->trunc.truncate_size = cpu_to_le64(truncate_size -
-						      (off-prevofs));
-	}
-	if (do_sync) {
-		op++;
-		op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
-	}
-	if (snapc) {
-		head->snap_seq = cpu_to_le64(snapc->seq);
-		head->num_snaps = cpu_to_le32(snapc->num_snaps);
-		for (i = 0; i < snapc->num_snaps; i++) {
-			put_unaligned_le64(snapc->snaps[i], p);
-			p += sizeof(u64);
-		}
-	}
-
-	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
-	return req;
-}
-
-/*
- * We keep osd requests in an rbtree, sorted by ->r_tid.
- *
- * Link @new into the tree at the position given by its tid.  A
- * duplicate tid is a BUG().
- */
-static void __insert_request(struct ceph_osd_client *osdc,
-			     struct ceph_osd_request *new)
-{
-	struct rb_node **link = &osdc->requests.rb_node;
-	struct rb_node *parent = NULL;
-
-	while (*link) {
-		struct ceph_osd_request *cur;
-
-		parent = *link;
-		cur = rb_entry(parent, struct ceph_osd_request, r_node);
-		if (new->r_tid == cur->r_tid)
-			BUG();
-		link = new->r_tid < cur->r_tid ? &parent->rb_left
-					       : &parent->rb_right;
-	}
-
-	rb_link_node(&new->r_node, parent, link);
-	rb_insert_color(&new->r_node, &osdc->requests);
-}
-
-/*
- * Find the request with exactly @tid, or return NULL.
- */
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
-						 u64 tid)
-{
-	struct rb_node *node;
-
-	for (node = osdc->requests.rb_node; node; ) {
-		struct ceph_osd_request *cur =
-			rb_entry(node, struct ceph_osd_request, r_node);
-
-		if (tid == cur->r_tid)
-			return cur;
-		node = tid < cur->r_tid ? node->rb_left : node->rb_right;
-	}
-	return NULL;
-}
-
-/*
- * Find the request with the smallest tid that is >= @tid, or return
- * NULL if every request has a smaller tid.
- */
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
-		    u64 tid)
-{
-	struct ceph_osd_request *req;
-	struct ceph_osd_request *best = NULL;
-	struct rb_node *n = osdc->requests.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_osd_request, r_node);
-		if (tid < req->r_tid) {
-			/*
-			 * req qualifies; remember it and look for a
-			 * closer match on the left.  Without this, a left
-			 * subtree holding only smaller tids would make us
-			 * fall off the tree and return NULL even though
-			 * this ancestor is the answer.
-			 */
-			best = req;
-			n = n->rb_left;
-		} else if (tid > req->r_tid) {
-			n = n->rb_right;
-		} else {
-			return req;
-		}
-	}
-	return best;
-}
-
-
-/*
- * Connection-reset hook (installed as osd_con_ops.peer_reset).
- *
- * The messaging layer reconnects to the osd on its own, but a dropped
- * session means the OSD has discarded its session state.  When we are
- * notified of that, bump the osd's incarnation and resubmit the
- * requests for that osd.  Does nothing if the connection has no osd
- * attached.
- */
-static void osd_reset(struct ceph_connection *con)
-{
-	struct ceph_osd *osd = con->private;
-
-	if (osd) {
-		struct ceph_osd_client *osdc = osd->o_osdc;
-
-		dout("osd_reset osd%d\n", osd->o_osd);
-		osd->o_incarnation++;
-		down_read(&osdc->map_sem);
-		kick_requests(osdc, osd);
-		up_read(&osdc->map_sem);
-	}
-}
-
-/*
- * Track open sessions with osds.
- *
- * Allocate a ceph_osd holding a single reference, with an empty
- * request list, incarnation 1 and an initialised connection whose
- * private pointer refers back to the osd.  Returns NULL if the
- * allocation fails.
- */
-static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
-{
-	struct ceph_osd *osd = kzalloc(sizeof(*osd), GFP_NOFS);
-
-	if (osd) {
-		osd->o_osdc = osdc;
-		osd->o_incarnation = 1;
-		atomic_set(&osd->o_ref, 1);
-		INIT_LIST_HEAD(&osd->o_requests);
-
-		ceph_con_init(osdc->client->msgr, &osd->o_con);
-		osd->o_con.ops = &osd_con_ops;
-		osd->o_con.private = osd;
-		osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
-	}
-	return osd;
-}
-
-/*
- * Take a reference on @osd unless its refcount has already reached
- * zero.  Returns @osd on success, NULL otherwise.
- */
-static struct ceph_osd *get_osd(struct ceph_osd *osd)
-{
-	if (!atomic_inc_not_zero(&osd->o_ref)) {
-		dout("get_osd %p FAIL\n", osd);
-		return NULL;
-	}
-
-	dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
-	     atomic_read(&osd->o_ref));
-	return osd;
-}
-
-/*
- * Drop a reference on @osd.  The final put shuts the connection down
- * and frees the osd.
- */
-static void put_osd(struct ceph_osd *osd)
-{
-	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
-	     atomic_read(&osd->o_ref) - 1);
-	if (!atomic_dec_and_test(&osd->o_ref))
-		return;
-
-	ceph_con_shutdown(&osd->o_con);
-	kfree(osd);
-}
-
-/*
- * remove an osd from our map
- *
- * Erases @osd from the osdc->osds rbtree, closes its connection and
- * drops one reference via put_osd().  The osd must have no requests
- * queued on it; this is enforced with BUG_ON.
- */
-static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
- dout("remove_osd %p\n", osd);
- BUG_ON(!list_empty(&osd->o_requests));
- rb_erase(&osd->o_node, &osdc->osds);
- ceph_con_close(&osd->o_con);
- put_osd(osd);
-}
-
-/*
- * reset osd connect
- *
- * An osd with no queued requests is simply removed.  Otherwise its
- * connection is closed and reopened against the address recorded in
- * the current osdmap, and the incarnation is bumped.  Always returns 0.
- */
-static int reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-	dout("reset_osd %p osd%d\n", osd, osd->o_osd);
-
-	if (!list_empty(&osd->o_requests)) {
-		ceph_con_close(&osd->o_con);
-		ceph_con_open(&osd->o_con,
-			      &osdc->osdmap->osd_addr[osd->o_osd]);
-		osd->o_incarnation++;
-		return 0;
-	}
-
-	remove_osd(osdc, osd);
-	return 0;
-}
-
-/*
- * Link @new into osdc->osds, an rbtree ordered by ->o_osd.  Inserting
- * an osd id that is already present hits BUG().
- */
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
-{
-	struct rb_node **link = &osdc->osds.rb_node;
-	struct rb_node *above = NULL;
-
-	while (*link) {
-		struct ceph_osd *cur;
-
-		above = *link;
-		cur = rb_entry(above, struct ceph_osd, o_node);
-		if (new->o_osd > cur->o_osd)
-			link = &above->rb_right;
-		else if (new->o_osd < cur->o_osd)
-			link = &above->rb_left;
-		else
-			BUG();
-	}
-
-	rb_link_node(&new->o_node, above, link);
-	rb_insert_color(&new->o_node, &osdc->osds);
-}
-
-/*
- * Find the osd with id @o in osdc->osds.  Returns NULL if we have no
- * ceph_osd for that id.
- */
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
-{
-	struct rb_node *node;
-
-	for (node = osdc->osds.rb_node; node; ) {
-		struct ceph_osd *cur = rb_entry(node, struct ceph_osd, o_node);
-
-		if (o == cur->o_osd)
-			return cur;
-		node = o < cur->o_osd ? node->rb_left : node->rb_right;
-	}
-	return NULL;
-}
-
-
-/*
- * Register request, assign tid. If this is the first request, set up
- * the timeout event.
- *
- * Takes osdc->request_mutex itself.  The request tree holds its own
- * reference on @req, taken here and dropped in __unregister_request().
- */
-static void register_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
-{
- struct ceph_osd_request_head *head = req->r_request->front.iov_base;
-
- mutex_lock(&osdc->request_mutex);
- /* tids are handed out sequentially and mirrored into the wire header */
- req->r_tid = ++osdc->last_tid;
- head->tid = cpu_to_le64(req->r_tid);
-
- dout("register_request %p tid %lld\n", req, req->r_tid);
- __insert_request(osdc, req);
- ceph_osdc_get_request(req);
- osdc->num_requests++;
-
- req->r_timeout_stamp =
- jiffies + osdc->client->mount_args.osd_timeout*HZ;
-
- /* only the first outstanding request arms the timeout work */
- if (osdc->num_requests == 1) {
- osdc->timeout_tid = req->r_tid;
- dout(" timeout on tid %llu at %lu\n", req->r_tid,
- req->r_timeout_stamp);
- schedule_delayed_work(&osdc->timeout_work,
- round_jiffies_relative(req->r_timeout_stamp - jiffies));
- }
- mutex_unlock(&osdc->request_mutex);
-}
-
-/*
- * Remove @req from the tid tree and from its osd's request list, drop
- * the reference the tree held on it, and re-arm or cancel the timeout
- * work if @req was the request the timeout was keyed on.
- *
- * called under osdc->request_mutex
- */
-static void __unregister_request(struct ceph_osd_client *osdc,
-				 struct ceph_osd_request *req)
-{
-	u64 tid = req->r_tid;
-
-	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-	rb_erase(&req->r_node, &osdc->requests);
-	osdc->num_requests--;
-
-	/*
-	 * r_osd is NULL when the request was never mapped to an osd,
-	 * e.g. when __map_osds() failed on the first send attempt and
-	 * ceph_osdc_start_request() is backing the registration out.
-	 */
-	if (req->r_osd) {
-		list_del_init(&req->r_osd_item);
-		if (list_empty(&req->r_osd->o_requests))
-			remove_osd(osdc, req->r_osd);
-		req->r_osd = NULL;
-	}
-
-	if (tid == osdc->timeout_tid) {
-		if (osdc->num_requests == 0) {
-			dout("no requests, canceling timeout\n");
-			osdc->timeout_tid = 0;
-			cancel_delayed_work(&osdc->timeout_work);
-		} else {
-			/* key the timeout on the oldest remaining request */
-			struct ceph_osd_request *first =
-				rb_entry(rb_first(&osdc->requests),
-					 struct ceph_osd_request, r_node);
-
-			osdc->timeout_tid = first->r_tid;
-			dout("rescheduled timeout on tid %llu at %lu\n",
-			     first->r_tid, first->r_timeout_stamp);
-			schedule_delayed_work(&osdc->timeout_work,
-				round_jiffies_relative(first->r_timeout_stamp -
-						       jiffies));
-		}
-	}
-
-	/* drop the tree's reference last: @req must not be touched after */
-	ceph_osdc_put_request(req);
-}
-
-/*
- * Cancel a previously queued request message
- *
- * If the request message has been handed to the messenger (r_sent is
- * non-zero), revoke it from the osd's connection and clear r_sent.
- */
-static void __cancel_request(struct ceph_osd_request *req)
-{
-	if (!req->r_sent)
-		return;
-
-	ceph_con_revoke(&req->r_osd->o_con, req->r_request);
-	req->r_sent = 0;
-}
-
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately. If there is
- * no up osd, set r_osd to NULL.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_osds(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
-{
- struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
- union ceph_pg pgid;
- int o = -1;
- int err;
- struct ceph_osd *newosd = NULL;
-
- dout("map_osds %p tid %lld\n", req, req->r_tid);
- /* recompute the object layout in the request header for the current map */
- err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
- &req->r_file_layout, osdc->osdmap);
- if (err)
- return err;
- pgid.pg64 = le64_to_cpu(reqhead->layout.ol_pgid);
- o = ceph_calc_pg_primary(osdc->osdmap, pgid);
-
- /*
- * Nothing to do if the target osd is the same and the request was
- * sent during (or after) the osd's current incarnation, or if there
- * was no osd before and there still is none.
- */
- if ((req->r_osd && req->r_osd->o_osd == o &&
- req->r_sent >= req->r_osd->o_incarnation) ||
- (req->r_osd == NULL && o == -1))
- return 0; /* no change */
-
- dout("map_osds tid %llu pgid %llx pool %d osd%d (was osd%d)\n",
- req->r_tid, pgid.pg64, pgid.pg.pool, o,
- req->r_osd ? req->r_osd->o_osd : -1);
-
- /* detach from the old osd, revoking any message already queued */
- if (req->r_osd) {
- __cancel_request(req);
- list_del_init(&req->r_osd_item);
- if (list_empty(&req->r_osd->o_requests)) {
- /* try to re-use r_osd if possible */
- /*
- * NOTE(review): get_osd() can return NULL, which
- * remove_osd() would dereference; presumably the
- * tree's reference keeps the count above zero here
- * -- confirm.
- */
- newosd = get_osd(req->r_osd);
- remove_osd(osdc, newosd);
- }
- req->r_osd = NULL;
- }
-
- req->r_osd = __lookup_osd(osdc, o);
- if (!req->r_osd && o >= 0) {
- if (newosd) {
- /* recycle the struct we just unlinked; ownership moves to r_osd */
- req->r_osd = newosd;
- newosd = NULL;
- } else {
- err = -ENOMEM;
- req->r_osd = create_osd(osdc);
- if (!req->r_osd)
- goto out;
- }
-
- dout("map_osds osd %p is osd%d\n", req->r_osd, o);
- req->r_osd->o_osd = o;
- req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
- __insert_osd(osdc, req->r_osd);
-
- ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
- }
-
- if (req->r_osd)
- list_add(&req->r_osd_item, &req->r_osd->o_requests);
- err = 1; /* osd changed */
-
-out:
- /* drop the extra reference if the old osd struct was not recycled */
- if (newosd)
- put_osd(newosd);
- return err;
-}
-
-/*
- * Map @req to an osd and queue its message on that osd's connection.
- *
- * Returns a negative error if mapping fails.  Returns 0 both when the
- * message was queued and when the pg currently has no up osd; in the
- * latter case nothing is sent and a newer osdmap is requested.
- *
- * caller should hold map_sem (for read) and request_mutex
- */
-static int __send_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
-{
- struct ceph_osd_request_head *reqhead;
- int err;
-
- err = __map_osds(osdc, req);
- if (err < 0)
- return err;
- if (req->r_osd == NULL) {
- dout("send_request %p no up osds in pg\n", req);
- ceph_monc_request_next_osdmap(&osdc->client->monc);
- return 0;
- }
-
- dout("send_request %p tid %llu to osd%d flags %d\n",
- req, req->r_tid, req->r_osd->o_osd, req->r_flags);
-
- /* refresh the header fields that depend on the current map and state */
- reqhead = req->r_request->front.iov_base;
- reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
- reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
- reqhead->reassert_version = req->r_reassert_version;
-
- req->r_timeout_stamp = jiffies+osdc->client->mount_args.osd_timeout*HZ;
-
- ceph_msg_get(req->r_request); /* send consumes a ref */
- ceph_con_send(&req->r_osd->o_con, req->r_request);
- /* remember which incarnation of the osd session this was sent on */
- req->r_sent = req->r_osd->o_incarnation;
- return 0;
-}
-
-/*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds. When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected. Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
- *
- * Runs as delayed work; takes map_sem for read and then request_mutex.
- */
-static void handle_timeout(struct work_struct *work)
-{
- struct ceph_osd_client *osdc =
- container_of(work, struct ceph_osd_client, timeout_work.work);
- struct ceph_osd_request *req;
- struct ceph_osd *osd;
- unsigned long timeout = osdc->client->mount_args.osd_timeout * HZ;
- unsigned long next_timeout = timeout + jiffies;
- struct rb_node *p;
-
- dout("timeout\n");
- down_read(&osdc->map_sem);
-
- ceph_monc_request_next_osdmap(&osdc->client->monc);
-
- mutex_lock(&osdc->request_mutex);
- /* first pass: retry requests whose earlier send attempt failed */
- for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
- req = rb_entry(p, struct ceph_osd_request, r_node);
-
- if (req->r_resend) {
- int err;
-
- dout("osdc resending prev failed %lld\n", req->r_tid);
- err = __send_request(osdc, req);
- if (err)
- dout("osdc failed again on %lld\n", req->r_tid);
- else
- req->r_resend = false;
- continue;
- }
- }
- /*
- * second pass: for each osd, check only the first entry on its
- * request list; if that one has expired, send a keepalive and push
- * its timeout stamp forward
- */
- for (p = rb_first(&osdc->osds); p; p = rb_next(p)) {
- osd = rb_entry(p, struct ceph_osd, o_node);
- if (list_empty(&osd->o_requests))
- continue;
- req = list_first_entry(&osd->o_requests,
- struct ceph_osd_request, r_osd_item);
- if (time_before(jiffies, req->r_timeout_stamp))
- continue;
-
- dout(" tid %llu (at least) timed out on osd%d\n",
- req->r_tid, osd->o_osd);
- req->r_timeout_stamp = next_timeout;
- ceph_con_keepalive(&osd->o_con);
- }
-
- /* timeout_tid is non-zero while requests are outstanding */
- if (osdc->timeout_tid)
- schedule_delayed_work(&osdc->timeout_work,
- round_jiffies_relative(timeout));
-
- mutex_unlock(&osdc->request_mutex);
-
- up_read(&osdc->map_sem);
-}
-
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- *
- * The reply front must be exactly a reply head followed by the object
- * name and num_ops ops; anything else is reported as corrupt and
- * dropped.  Replies for unknown tids are ignored.  A request is
- * unregistered once it is a read or the ONDISK ("safe") reply has
- * arrived.
- *
- * request_mutex is taken for the lookup and bookkeeping and released
- * on every path before callbacks run or the function returns.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
-{
-	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
-	struct ceph_osd_request *req;
-	u64 tid;
-	int numops, object_len, flags;
-
-	if (msg->front.iov_len < sizeof(*rhead))
-		goto bad;
-	tid = le64_to_cpu(rhead->tid);
-	numops = le32_to_cpu(rhead->num_ops);
-	object_len = le32_to_cpu(rhead->object_len);
-	if (msg->front.iov_len != sizeof(*rhead) + object_len +
-	    numops * sizeof(struct ceph_osd_op))
-		goto bad;
-	dout("handle_reply %p tid %llu\n", msg, tid);
-
-	/* lookup */
-	mutex_lock(&osdc->request_mutex);
-	req = __lookup_request(osdc, tid);
-	if (req == NULL) {
-		dout("handle_reply tid %llu dne\n", tid);
-		mutex_unlock(&osdc->request_mutex);
-		return;
-	}
-	/* hold our own ref: unregistering below drops the tree's ref */
-	ceph_osdc_get_request(req);
-	flags = le32_to_cpu(rhead->flags);
-
-	if (req->r_reply) {
-		/*
-		 * once we see the message has been received, we don't
-		 * need a ref (which is only needed for revoking
-		 * pages)
-		 */
-		ceph_msg_put(req->r_reply);
-		req->r_reply = NULL;
-	}
-
-	if (!req->r_got_reply) {
-		unsigned bytes;
-
-		req->r_result = le32_to_cpu(rhead->result);
-		bytes = le32_to_cpu(msg->hdr.data_len);
-		dout("handle_reply result %d bytes %d\n", req->r_result,
-		     bytes);
-		/* a zero result is replaced by the payload length */
-		if (req->r_result == 0)
-			req->r_result = bytes;
-
-		/* in case this is a write and we need to replay, */
-		req->r_reassert_version = rhead->reassert_version;
-
-		req->r_got_reply = 1;
-	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
-		dout("handle_reply tid %llu dup ack\n", tid);
-		/* must not leave with request_mutex held */
-		mutex_unlock(&osdc->request_mutex);
-		goto done;
-	}
-
-	dout("handle_reply tid %llu flags %d\n", tid, flags);
-
-	/* either this is a read, or we got the safe response */
-	if ((flags & CEPH_OSD_FLAG_ONDISK) ||
-	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
-		__unregister_request(osdc, req);
-
-	mutex_unlock(&osdc->request_mutex);
-
-	if (req->r_callback)
-		req->r_callback(req, msg);
-	else
-		complete(&req->r_completion);
-
-	if (flags & CEPH_OSD_FLAG_ONDISK) {
-		if (req->r_safe_callback)
-			req->r_safe_callback(req, msg);
-		complete(&req->r_safe_completion);  /* fsync waiter */
-	}
-
-done:
-	ceph_osdc_put_request(req);
-	return;
-
-bad:
-	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
-	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
-	       (int)sizeof(*rhead));
-}
-
-
-/*
- * Resubmit osd requests whose osd or osd address has changed.  Request
- * a new osd map if osds are down, or we are otherwise unable to determine
- * how to direct a request.
- *
- * Close connections to down osds.
- *
- * If @kickosd is specified, resubmit requests for that specific osd.
- *
- * Caller should hold map_sem for read.  request_mutex is taken (and
- * released) here, so the caller must NOT hold it.
- */
-static void kick_requests(struct ceph_osd_client *osdc,
-			  struct ceph_osd *kickosd)
-{
-	struct ceph_osd_request *req;
-	struct rb_node *p, *n;
-	int needmap = 0;
-	int err;
-
-	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
-	mutex_lock(&osdc->request_mutex);
-	if (!kickosd) {
-		/*
-		 * Map change: reset every osd that is no longer up or
-		 * whose address differs from the map.  reset_osd() may
-		 * erase the node, so fetch the successor first.
-		 */
-		for (p = rb_first(&osdc->osds); p; p = n) {
-			struct ceph_osd *osd =
-				rb_entry(p, struct ceph_osd, o_node);
-
-			n = rb_next(p);
-			if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
-			    !ceph_entity_addr_equal(&osd->o_con.peer_addr,
-					    ceph_osd_addr(osdc->osdmap,
-							  osd->o_osd)))
-				reset_osd(osdc, osd);
-		}
-	}
-
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-		req = rb_entry(p, struct ceph_osd_request, r_node);
-
-		if (req->r_resend) {
-			dout(" r_resend set on tid %llu\n", req->r_tid);
-			goto kick;
-		}
-		if (req->r_osd && kickosd == req->r_osd)
-			goto kick;
-
-		err = __map_osds(osdc, req);
-		if (err == 0)
-			continue;  /* no change */
-		if (err < 0) {
-			/*
-			 * FIXME: really, we should set the request
-			 * error and fail if this isn't a 'nofail'
-			 * request, but that's a fair bit more
-			 * complicated to do.  So retry!
-			 */
-			dout(" setting r_resend on %llu\n", req->r_tid);
-			req->r_resend = true;
-			continue;
-		}
-		if (req->r_osd == NULL) {
-			dout("tid %llu maps to no valid osd\n", req->r_tid);
-			needmap++;  /* request a newer map */
-			continue;
-		}
-
-kick:
-		/* r_osd may be NULL here for an r_resend request */
-		dout("kicking tid %llu osd%d\n", req->r_tid,
-		     req->r_osd ? req->r_osd->o_osd : -1);
-		req->r_flags |= CEPH_OSD_FLAG_RETRY;
-		err = __send_request(osdc, req);
-		if (err) {
-			dout(" setting r_resend on %llu\n", req->r_tid);
-			req->r_resend = true;
-		}
-	}
-	mutex_unlock(&osdc->request_mutex);
-
-	if (needmap) {
-		dout("%d requests for down osds, need new map\n", needmap);
-		ceph_monc_request_next_osdmap(&osdc->client->monc);
-	}
-}
-
-/*
- * Process updated osd map.
- *
- * The message contains any number of incremental and full maps, normally
- * indicating some sort of topology change in the cluster.  Kick requests
- * off to different OSDs as needed.
- *
- * map_sem is held for write while maps are decoded and installed, then
- * downgraded to read while requests are kicked.  A message with the
- * wrong fsid is ignored; a truncated or undecodable message is logged
- * and dropped.
- */
-void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
-{
-	void *p, *end, *next;
-	u32 nr_maps, maplen;
-	u32 epoch;
-	struct ceph_osdmap *newmap = NULL, *oldmap;
-	struct ceph_fsid fsid;
-
-	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
-
-	/* verify fsid; map_sem is not held yet, so don't use 'bad' here */
-	ceph_decode_need(&p, end, sizeof(fsid), corrupt);
-	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	if (ceph_fsid_compare(&fsid, &osdc->client->monc.monmap->fsid)) {
-		pr_err("got osdmap with wrong fsid, ignoring\n");
-		return;
-	}
-
-	down_write(&osdc->map_sem);
-
-	/* incremental maps */
-	ceph_decode_32_safe(&p, end, nr_maps, bad);
-	dout(" %d inc maps\n", nr_maps);
-	while (nr_maps > 0) {
-		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-		ceph_decode_32(&p, epoch);
-		ceph_decode_32(&p, maplen);
-		ceph_decode_need(&p, end, maplen, bad);
-		next = p + maplen;
-		/* an incremental only applies on top of the previous epoch */
-		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
-			dout("applying incremental map %u len %d\n",
-			     epoch, maplen);
-			newmap = osdmap_apply_incremental(&p, next,
-							  osdc->osdmap,
-							  osdc->client->msgr);
-			if (IS_ERR(newmap))
-				goto bad;
-			if (newmap != osdc->osdmap) {
-				ceph_osdmap_destroy(osdc->osdmap);
-				osdc->osdmap = newmap;
-			}
-		} else {
-			dout("ignoring incremental map %u len %d\n",
-			     epoch, maplen);
-		}
-		p = next;
-		nr_maps--;
-	}
-	if (newmap)
-		goto done;
-
-	/* full maps */
-	ceph_decode_32_safe(&p, end, nr_maps, bad);
-	dout(" %d full maps\n", nr_maps);
-	while (nr_maps) {
-		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-		ceph_decode_32(&p, epoch);
-		ceph_decode_32(&p, maplen);
-		ceph_decode_need(&p, end, maplen, bad);
-		if (nr_maps > 1) {
-			dout("skipping non-latest full map %u len %d\n",
-			     epoch, maplen);
-		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
-			dout("skipping full map %u len %d, "
-			     "older than our %u\n", epoch, maplen,
-			     osdc->osdmap->epoch);
-		} else {
-			dout("taking full map %u len %d\n", epoch, maplen);
-			newmap = osdmap_decode(&p, p+maplen);
-			if (IS_ERR(newmap))
-				goto bad;
-			oldmap = osdc->osdmap;
-			osdc->osdmap = newmap;
-			if (oldmap)
-				ceph_osdmap_destroy(oldmap);
-		}
-		p += maplen;
-		nr_maps--;
-	}
-
-done:
-	downgrade_write(&osdc->map_sem);
-	/* we may still have no map at all if every map was skipped */
-	if (osdc->osdmap)
-		ceph_monc_got_osdmap(&osdc->client->monc,
-				     osdc->osdmap->epoch);
-	if (newmap)
-		kick_requests(osdc, NULL);
-	up_read(&osdc->map_sem);
-	return;
-
-bad:
-	up_write(&osdc->map_sem);
-corrupt:
-	pr_err("osdc handle_map corrupt msg\n");
-	return;
-}
-
-
-/*
- * A read request prepares specific pages that data is to be read into.
- * When a message is being read off the wire, we call prepare_pages to
- * find those pages.
- * 0 = success, -1 failure.
- *
- * Fails if the connection has no osd attached, the message is not an
- * osd op reply, the tid is unknown, the request has fewer than @want
- * pages, or pages were already prepared for this request.
- */
-static int prepare_pages(struct ceph_connection *con, struct ceph_msg *m,
- int want)
-{
- struct ceph_osd *osd = con->private;
- struct ceph_osd_client *osdc;
- struct ceph_osd_reply_head *rhead = m->front.iov_base;
- struct ceph_osd_request *req;
- u64 tid;
- int ret = -1;
- int type = le16_to_cpu(m->hdr.type);
-
- if (!osd)
- return -1;
- osdc = osd->o_osdc;
-
- dout("prepare_pages on msg %p want %d\n", m, want);
- if (unlikely(type != CEPH_MSG_OSD_OPREPLY))
- return -1; /* hmm! */
-
- /*
- * NOTE(review): the front length is not checked against
- * sizeof(*rhead) before reading the tid -- confirm the messenger
- * guarantees the front has been fully read at this point.
- */
- tid = le64_to_cpu(rhead->tid);
- mutex_lock(&osdc->request_mutex);
- req = __lookup_request(osdc, tid);
- if (!req) {
- dout("prepare_pages unknown tid %llu\n", tid);
- goto out;
- }
- dout("prepare_pages tid %llu has %d pages, want %d\n",
- tid, req->r_num_pages, want);
- if (likely(req->r_num_pages >= want && !req->r_prepared_pages)) {
- /* point the incoming message at the request's page vector */
- m->pages = req->r_pages;
- m->nr_pages = req->r_num_pages;
- req->r_reply = m; /* only for duration of read over socket */
- ceph_msg_get(m);
- req->r_prepared_pages = 1;
- ret = 0; /* success */
- }
-out:
- mutex_unlock(&osdc->request_mutex);
- return ret;
-}
-
-/*
- * Register request, send initial attempt.
- *
- * If the first send fails and @nofail is set, the request stays
- * registered, is flagged r_resend for a later retry, and 0 is
- * returned.  Otherwise the request is unregistered and the error is
- * returned.
- */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail)
-{
- int rc;
-
- /* the outgoing message carries the request's page vector */
- req->r_request->pages = req->r_pages;
- req->r_request->nr_pages = req->r_num_pages;
-
- register_request(osdc, req);
-
- down_read(&osdc->map_sem);
- mutex_lock(&osdc->request_mutex);
- rc = __send_request(osdc, req);
- if (rc) {
- if (nofail) {
- dout("osdc_start_request failed send, marking %lld\n",
- req->r_tid);
- req->r_resend = true;
- rc = 0;
- } else {
- __unregister_request(osdc, req);
- }
- }
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
- return rc;
-}
-
-/*
- * wait for a request to complete
- *
- * Sleeps interruptibly on r_completion.  If the wait is interrupted,
- * the queued request message is revoked and the negative value from
- * the wait is returned; otherwise the request's r_result is returned.
- */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-			   struct ceph_osd_request *req)
-{
-	int rc;
-
-	rc = wait_for_completion_interruptible(&req->r_completion);
-	if (rc < 0) {
-		mutex_lock(&osdc->request_mutex);
-		__cancel_request(req);
-		mutex_unlock(&osdc->request_mutex);
-		/* this wait has no timeout; rc < 0 means interrupted */
-		dout("wait_request tid %llu interrupted\n", req->r_tid);
-		return rc;
-	}
-
-	dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
-	return req->r_result;
-}
-
-/*
- * sync - wait for all in-flight requests to flush. avoid starvation.
- *
- * Only requests with tid <= the last tid assigned at entry are
- * considered, so requests submitted while we wait are not waited on.
- * Requests without CEPH_OSD_FLAG_WRITE are skipped.
- */
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
-{
- struct ceph_osd_request *req;
- u64 last_tid, next_tid = 0;
-
- mutex_lock(&osdc->request_mutex);
- last_tid = osdc->last_tid;
- while (1) {
- /* re-lookup by tid each round: the tree may change while unlocked */
- req = __lookup_request_ge(osdc, next_tid);
- if (!req)
- break;
- if (req->r_tid > last_tid)
- break;
-
- next_tid = req->r_tid + 1;
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
- continue;
-
- /* pin the request so it survives while we sleep without the mutex */
- ceph_osdc_get_request(req);
- mutex_unlock(&osdc->request_mutex);
- dout("sync waiting on tid %llu (last is %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
- mutex_lock(&osdc->request_mutex);
- ceph_osdc_put_request(req);
- }
- mutex_unlock(&osdc->request_mutex);
- dout("sync done (thru tid %llu)\n", last_tid);
-}
-
-/*
- * init, shutdown
- *
- * Initialise @osdc for @client: locks, the empty osd and request
- * trees, the timeout work, the request mempool and the two message
- * pools.  Returns 0 on success or a negative errno.  On failure every
- * pool created so far is destroyed again, so the caller has nothing
- * to clean up.
- */
-int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
-{
-	int err;
-
-	dout("init\n");
-	osdc->client = client;
-	osdc->osdmap = NULL;
-	init_rwsem(&osdc->map_sem);
-	init_completion(&osdc->map_waiters);
-	osdc->last_requested_map = 0;
-	mutex_init(&osdc->request_mutex);
-	osdc->timeout_tid = 0;
-	osdc->last_tid = 0;
-	osdc->osds = RB_ROOT;
-	osdc->requests = RB_ROOT;
-	osdc->num_requests = 0;
-	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
-
-	err = -ENOMEM;
-	osdc->req_mempool = mempool_create_kmalloc_pool(10,
-					sizeof(struct ceph_osd_request));
-	if (!osdc->req_mempool)
-		goto out;
-
-	err = ceph_msgpool_init(&osdc->msgpool_op, 4096, 10, true);
-	if (err < 0)
-		goto out_mempool;
-	err = ceph_msgpool_init(&osdc->msgpool_op_reply, 512, 0, false);
-	if (err < 0)
-		goto out_msgpool;
-
-	return 0;
-
-out_msgpool:
-	ceph_msgpool_destroy(&osdc->msgpool_op);
-out_mempool:
-	mempool_destroy(osdc->req_mempool);
-out:
-	return err;
-}
-
-/*
- * Tear down what ceph_osdc_init() set up: stop the timeout work, free
- * the current osdmap (if any) and destroy the request mempool and both
- * message pools.
- */
-void ceph_osdc_stop(struct ceph_osd_client *osdc)
-{
- /* wait for a running timeout handler before freeing what it uses */
- cancel_delayed_work_sync(&osdc->timeout_work);
- if (osdc->osdmap) {
- ceph_osdmap_destroy(osdc->osdmap);
- osdc->osdmap = NULL;
- }
- mempool_destroy(osdc->req_mempool);
- ceph_msgpool_destroy(&osdc->msgpool_op);
- ceph_msgpool_destroy(&osdc->msgpool_op_reply);
-}
-
-/*
- * Read some contiguous pages. If we cross a stripe boundary, shorten
- * *plen. Return number of bytes read, or error.
- *
- * Synchronous: builds a READ request, starts it and waits for the
- * reply.  The caller's @num_pages is not used; the page count is
- * recomputed from @off and the (possibly shortened) *plen.
- */
-int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct ceph_vino vino, struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int num_pages)
-{
- struct ceph_osd_request *req;
- int rc = 0;
-
- dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
- vino.snap, off, *plen);
- req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
- NULL, 0, truncate_seq, truncate_size, NULL,
- false, 1);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short read due to an object boundary */
- req->r_pages = pages;
- num_pages = calc_pages_for(off, *plen);
- req->r_num_pages = num_pages;
-
- dout("readpages final extent is %llu~%llu (%d pages)\n",
- off, *plen, req->r_num_pages);
-
- rc = ceph_osdc_start_request(osdc, req, false);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- /* drop the reference returned by ceph_osdc_new_request() */
- ceph_osdc_put_request(req);
- dout("readpages result %d\n", rc);
- return rc;
-}
-
-/*
- * do a synchronous write on N pages
- *
- * Writes to snapshots are not allowed (BUG_ON).  The request is always
- * flagged ONDISK|WRITE in addition to @flags.  Returns the number of
- * bytes written (which may be shorter than requested if the extent
- * crosses an object boundary) or a negative error.
- */
-int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *snapc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec *mtime,
- struct page **pages, int num_pages,
- int flags, int do_sync, bool nofail)
-{
- struct ceph_osd_request *req;
- int rc = 0;
-
- BUG_ON(vino.snap != CEPH_NOSNAP);
- /* len is passed by address, so new_request may shorten it */
- req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
- CEPH_OSD_OP_WRITE,
- flags | CEPH_OSD_FLAG_ONDISK |
- CEPH_OSD_FLAG_WRITE,
- snapc, do_sync,
- truncate_seq, truncate_size, mtime,
- nofail, 1);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short write due to an object boundary */
- req->r_pages = pages;
- req->r_num_pages = calc_pages_for(off, len);
- dout("writepages %llu~%llu (%d pages)\n", off, len,
- req->r_num_pages);
-
- rc = ceph_osdc_start_request(osdc, req, nofail);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_osdc_put_request(req);
- /* a zero result means success: report the byte count instead */
- if (rc == 0)
- rc = len;
- dout("writepages result %d\n", rc);
- return rc;
-}
-
-/*
- * handle incoming message
- *
- * Routes osd maps and op replies to their handlers and logs anything
- * else.  The reference on @msg is always dropped before returning,
- * including when the connection has no osd attached.
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
-	int type = le16_to_cpu(msg->hdr.type);
-
-	/* check before touching osd->o_osdc */
-	if (!osd)
-		goto out;
-	osdc = osd->o_osdc;
-
-	switch (type) {
-	case CEPH_MSG_OSD_MAP:
-		ceph_osdc_handle_map(osdc, msg);
-		break;
-	case CEPH_MSG_OSD_OPREPLY:
-		handle_reply(osdc, msg);
-		break;
-
-	default:
-		pr_err("received unknown message type %d %s\n", type,
-		       ceph_msg_type_name(type));
-	}
-out:
-	ceph_msg_put(msg);
-}
-
-/*
- * Allocate a message for an incoming header.  Osd op replies come from
- * the dedicated msgpool_op_reply pool; every other type is allocated
- * with ceph_alloc_msg().
- */
-static struct ceph_msg *alloc_msg(struct ceph_connection *con,
-				  struct ceph_msg_header *hdr)
-{
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc = osd->o_osdc;
-
-	if (le16_to_cpu(hdr->type) == CEPH_MSG_OSD_OPREPLY)
-		return ceph_msgpool_get(&osdc->msgpool_op_reply);
-
-	return ceph_alloc_msg(con, hdr);
-}
-
-/*
- * Wrappers to refcount containing ceph_osd struct
- *
- * Taking a reference on the connection means taking one on the osd
- * that embeds it.  Returns @con, or NULL if get_osd() fails.
- */
-static struct ceph_connection *get_osd_con(struct ceph_connection *con)
-{
-	return get_osd(con->private) ? con : NULL;
-}
-
-/* Drop the reference on the osd that embeds @con. */
-static void put_osd_con(struct ceph_connection *con)
-{
-	put_osd(con->private);
-}
-
-/*
- * Connection callbacks installed on every osd connection by
- * create_osd().
- */
-static const struct ceph_connection_operations osd_con_ops = {
-	.get = get_osd_con,
-	.put = put_osd_con,
-	.dispatch = dispatch,
-	.alloc_msg = alloc_msg,
-	.peer_reset = osd_reset,
-	.alloc_middle = ceph_alloc_middle,
-	.prepare_pages = prepare_pages,
-};
+++ /dev/null
-#ifndef _FS_CEPH_OSD_CLIENT_H
-#define _FS_CEPH_OSD_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/mempool.h>
-#include <linux/rbtree.h>
-
-#include "types.h"
-#include "osdmap.h"
-#include "messenger.h"
-
-struct ceph_msg;
-struct ceph_snap_context;
-struct ceph_osd_request;
-struct ceph_osd_client;
-
-/*
- * completion callback for async writepages
- */
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
- struct ceph_msg *);
-
-/* a given osd we're communicating with */
-struct ceph_osd {
- atomic_t o_ref; /* refcount; freed on final put */
- struct ceph_osd_client *o_osdc; /* owning osd client */
- int o_osd; /* osd id; key in osdc->osds */
- int o_incarnation; /* bumped on each session reset */
- struct rb_node o_node; /* link in osdc->osds */
- struct ceph_connection o_con; /* connection; ->private points back here */
- struct list_head o_requests; /* requests mapped here (r_osd_item) */
-};
-
-/* an in-flight request */
-struct ceph_osd_request {
- u64 r_tid; /* unique for this client */
- struct rb_node r_node; /* link in osdc->requests, keyed by r_tid */
- struct list_head r_osd_item; /* link in r_osd->o_requests */
- struct ceph_osd *r_osd; /* osd we are mapped to, or NULL */
-
- struct ceph_msg *r_request, *r_reply;
- int r_result; /* reply result; on success, data length */
- int r_flags; /* any additional flags for the osd */
- u32 r_sent; /* >0 if r_request is sending/sent */
- /* (holds r_osd's incarnation at send time) */
- int r_prepared_pages, r_got_reply;
-
- struct ceph_osd_client *r_osdc;
- atomic_t r_ref;
- bool r_mempool;
- /* r_completion: first reply; r_safe_completion: ONDISK reply */
- struct completion r_completion, r_safe_completion;
- ceph_osdc_callback_t r_callback, r_safe_callback;
- struct ceph_eversion r_reassert_version; /* from reply; resent on retry */
- struct list_head r_unsafe_item;
-
- struct inode *r_inode; /* for use by callbacks */
- struct writeback_control *r_wbc; /* ditto */
-
- char r_oid[40]; /* object name */
- int r_oid_len;
- unsigned long r_timeout_stamp; /* jiffies at which the request times out */
- bool r_resend; /* msg send failed, needs retry */
-
- struct ceph_file_layout r_file_layout;
- struct ceph_snap_context *r_snapc; /* snap context for writes */
- unsigned r_num_pages; /* size of page array (follows) */
- struct page **r_pages; /* pages for data payload */
- int r_pages_from_pool;
- int r_own_pages; /* if true, i own page list */
-};
-
-/* per-client state for talking to osds */
-struct ceph_osd_client {
- struct ceph_client *client;
-
- struct ceph_osdmap *osdmap; /* current map */
- struct rw_semaphore map_sem; /* write: installing a map; read: using it */
- struct completion map_waiters;
- u64 last_requested_map;
-
- struct mutex request_mutex; /* protects the osd and request trees */
- struct rb_root osds; /* osds */
- u64 timeout_tid; /* tid of timeout triggering rq */
- u64 last_tid; /* tid of last request */
- struct rb_root requests; /* pending requests */
- int num_requests; /* number of entries in 'requests' */
- struct delayed_work timeout_work; /* runs handle_timeout() */
- struct dentry *debugfs_file;
-
- mempool_t *req_mempool; /* pool of struct ceph_osd_request */
-
- struct ceph_msgpool msgpool_op;
- struct ceph_msgpool msgpool_op_reply; /* incoming osd op replies */
-};
-
-extern int ceph_osdc_init(struct ceph_osd_client *osdc,
- struct ceph_client *client);
-extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
-
-extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
- struct ceph_msg *msg);
-extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
- struct ceph_msg *msg);
-
-extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
- struct ceph_file_layout *layout,
- struct ceph_vino vino,
- u64 offset, u64 *len, int op, int flags,
- struct ceph_snap_context *snapc,
- int do_sync, u32 truncate_seq,
- u64 truncate_size,
- struct timespec *mtime,
- bool use_mempool, int num_reply);
-
-/* Take an additional reference on @req; pair with ceph_osdc_put_request(). */
-static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
-{
- atomic_inc(&req->r_ref);
-}
-extern void ceph_osdc_put_request(struct ceph_osd_request *req);
-
-extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail);
-extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
-
-extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int nr_pages);
-
-extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *sc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec *mtime,
- struct page **pages, int nr_pages,
- int flags, int do_sync, bool nofail);
-
-#endif
-
+++ /dev/null
-
-#include <asm/div64.h>
-
-#include "super.h"
-#include "osdmap.h"
-#include "crush/hash.h"
-#include "crush/mapper.h"
-#include "decode.h"
-#include "ceph_debug.h"
-
-/*
- * Format an osd state bitmask into @str (at most @len bytes, always
- * NUL-terminated when @len is non-zero) and return @str.
- *
- *   state == 0              -> "doesn't exist"
- *   EXISTS and UP both set  -> "exists, up"
- *   only EXISTS set         -> "exists"
- *   only UP set             -> "up"
- *   any other non-zero bits -> ""
- *
- * The text is chosen first and written with a single snprintf(); the
- * destination buffer is never passed as a format argument, which
- * would be undefined behaviour.
- */
-char *ceph_osdmap_state_str(char *str, int len, int state)
-{
-	const char *desc;
-
-	if (!len)
-		return str;
-
-	if (!state)
-		desc = "doesn't exist";
-	else if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
-		desc = "exists, up";
-	else if (state & CEPH_OSD_EXISTS)
-		desc = "exists";
-	else if (state & CEPH_OSD_UP)
-		desc = "up";
-	else
-		desc = "";
-
-	*str = '\0';
-	snprintf(str, len, "%s", desc);
-	return str;
-}
-
-/* maps */
-
-/*
- * Number of bits needed to represent @t: the position of its highest
- * set bit, counting from 1.  Returns 0 for t == 0.
- */
-static int calc_bits_of(unsigned t)
-{
-	int bits;
-
-	for (bits = 0; t; t >>= 1)
-		bits++;
-	return bits;
-}
-
-/*
- * the foo_mask is the smallest value 2^n-1 that is >= foo.
- *
- * (As coded, each mask is computed from foo-1, i.e. it is the smallest
- * 2^n-1 that is >= foo-1.)
- *
- * NOTE(review): a count of 0 makes foo-1 wrap to 0xffffffff, so the
- * shift count becomes 32, which is undefined for a 32-bit int --
- * presumably the counts are validated elsewhere; confirm.
- */
-static void calc_pg_masks(struct ceph_pg_pool_info *pi)
-{
- pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
- pi->pgp_num_mask =
- (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
- pi->lpg_num_mask =
- (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
- pi->lpgp_num_mask =
- (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
-}
-
-/*
- * decode crush map
- */
-/*
- * Decode the type-specific part of a uniform bucket: a single
- * item_weight.  Returns 0, or -EINVAL if the buffer is too short.
- *
- * NOTE(review): the bounds check asks for (1 + size) u32s although
- * only one is consumed here -- confirm against the encoder.
- */
-static int crush_decode_uniform_bucket(void **p, void *end,
- struct crush_bucket_uniform *b)
-{
- dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
- ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
- ceph_decode_32(p, b->item_weight);
- return 0;
-bad:
- return -EINVAL;
-}
-
-/*
- * Decode the type-specific part of a list bucket: h.size pairs of
- * (item weight, running sum weight).  Allocates both arrays.
- * Returns 0, -ENOMEM on allocation failure, or -EINVAL if the buffer
- * is too short.
- *
- * Arrays allocated before a failure are not freed here; presumably
- * the caller destroys the partially built bucket -- confirm.
- */
-static int crush_decode_list_bucket(void **p, void *end,
- struct crush_bucket_list *b)
-{
- int j;
- dout("crush_decode_list_bucket %p to %p\n", *p, end);
- b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
- if (b->item_weights == NULL)
- return -ENOMEM;
- b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
- if (b->sum_weights == NULL)
- return -ENOMEM;
- /* one bounds check covers the whole interleaved array */
- ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
- for (j = 0; j < b->h.size; j++) {
- ceph_decode_32(p, b->item_weights[j]);
- ceph_decode_32(p, b->sum_weights[j]);
- }
- return 0;
-bad:
- return -EINVAL;
-}
-
-static int crush_decode_tree_bucket(void **p, void *end,
- struct crush_bucket_tree *b)
-{
- int j;
- dout("crush_decode_tree_bucket %p to %p\n", *p, end);
- ceph_decode_32_safe(p, end, b->num_nodes, bad);
- b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
- if (b->node_weights == NULL)
- return -ENOMEM;
- ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
- for (j = 0; j < b->num_nodes; j++)
- ceph_decode_32(p, b->node_weights[j]);
- return 0;
-bad:
- return -EINVAL;
-}
-
-static int crush_decode_straw_bucket(void **p, void *end,
- struct crush_bucket_straw *b)
-{
- int j;
- dout("crush_decode_straw_bucket %p to %p\n", *p, end);
- b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
- if (b->item_weights == NULL)
- return -ENOMEM;
- b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
- if (b->straws == NULL)
- return -ENOMEM;
- ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
- for (j = 0; j < b->h.size; j++) {
- ceph_decode_32(p, b->item_weights[j]);
- ceph_decode_32(p, b->straws[j]);
- }
- return 0;
-bad:
- return -EINVAL;
-}
-
-static struct crush_map *crush_decode(void *pbyval, void *end)
-{
- struct crush_map *c;
- int err = -EINVAL;
- int i, j;
- void **p = &pbyval;
- void *start = pbyval;
- u32 magic;
-
- dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-
- c = kzalloc(sizeof(*c), GFP_NOFS);
- if (c == NULL)
- return ERR_PTR(-ENOMEM);
-
- ceph_decode_need(p, end, 4*sizeof(u32), bad);
- ceph_decode_32(p, magic);
- if (magic != CRUSH_MAGIC) {
- pr_err("crush_decode magic %x != current %x\n",
- (unsigned)magic, (unsigned)CRUSH_MAGIC);
- goto bad;
- }
- ceph_decode_32(p, c->max_buckets);
- ceph_decode_32(p, c->max_rules);
- ceph_decode_32(p, c->max_devices);
-
- c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
- if (c->device_parents == NULL)
- goto badmem;
- c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
- if (c->bucket_parents == NULL)
- goto badmem;
-
- c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
- if (c->buckets == NULL)
- goto badmem;
- c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
- if (c->rules == NULL)
- goto badmem;
-
- /* buckets */
- for (i = 0; i < c->max_buckets; i++) {
- int size = 0;
- u32 alg;
- struct crush_bucket *b;
-
- ceph_decode_32_safe(p, end, alg, bad);
- if (alg == 0) {
- c->buckets[i] = NULL;
- continue;
- }
- dout("crush_decode bucket %d off %x %p to %p\n",
- i, (int)(*p-start), *p, end);
-
- switch (alg) {
- case CRUSH_BUCKET_UNIFORM:
- size = sizeof(struct crush_bucket_uniform);
- break;
- case CRUSH_BUCKET_LIST:
- size = sizeof(struct crush_bucket_list);
- break;
- case CRUSH_BUCKET_TREE:
- size = sizeof(struct crush_bucket_tree);
- break;
- case CRUSH_BUCKET_STRAW:
- size = sizeof(struct crush_bucket_straw);
- break;
- default:
- goto bad;
- }
- BUG_ON(size == 0);
- b = c->buckets[i] = kzalloc(size, GFP_NOFS);
- if (b == NULL)
- goto badmem;
-
- ceph_decode_need(p, end, 4*sizeof(u32), bad);
- ceph_decode_32(p, b->id);
- ceph_decode_16(p, b->type);
- ceph_decode_16(p, b->alg);
- ceph_decode_32(p, b->weight);
- ceph_decode_32(p, b->size);
-
- dout("crush_decode bucket size %d off %x %p to %p\n",
- b->size, (int)(*p-start), *p, end);
-
- b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
- if (b->items == NULL)
- goto badmem;
- b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
- if (b->perm == NULL)
- goto badmem;
- b->perm_n = 0;
-
- ceph_decode_need(p, end, b->size*sizeof(u32), bad);
- for (j = 0; j < b->size; j++)
- ceph_decode_32(p, b->items[j]);
-
- switch (b->alg) {
- case CRUSH_BUCKET_UNIFORM:
- err = crush_decode_uniform_bucket(p, end,
- (struct crush_bucket_uniform *)b);
- if (err < 0)
- goto bad;
- break;
- case CRUSH_BUCKET_LIST:
- err = crush_decode_list_bucket(p, end,
- (struct crush_bucket_list *)b);
- if (err < 0)
- goto bad;
- break;
- case CRUSH_BUCKET_TREE:
- err = crush_decode_tree_bucket(p, end,
- (struct crush_bucket_tree *)b);
- if (err < 0)
- goto bad;
- break;
- case CRUSH_BUCKET_STRAW:
- err = crush_decode_straw_bucket(p, end,
- (struct crush_bucket_straw *)b);
- if (err < 0)
- goto bad;
- break;
- }
- }
-
- /* rules */
- dout("rule vec is %p\n", c->rules);
- for (i = 0; i < c->max_rules; i++) {
- u32 yes;
- struct crush_rule *r;
-
- ceph_decode_32_safe(p, end, yes, bad);
- if (!yes) {
- dout("crush_decode NO rule %d off %x %p to %p\n",
- i, (int)(*p-start), *p, end);
- c->rules[i] = NULL;
- continue;
- }
-
- dout("crush_decode rule %d off %x %p to %p\n",
- i, (int)(*p-start), *p, end);
-
- /* len */
- ceph_decode_32_safe(p, end, yes, bad);
-#if BITS_PER_LONG == 32
- if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
- goto bad;
-#endif
- r = c->rules[i] = kmalloc(sizeof(*r) +
- yes*sizeof(struct crush_rule_step),
- GFP_NOFS);
- if (r == NULL)
- goto badmem;
- dout(" rule %d is at %p\n", i, r);
- r->len = yes;
- ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
- ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
- for (j = 0; j < r->len; j++) {
- ceph_decode_32(p, r->steps[j].op);
- ceph_decode_32(p, r->steps[j].arg1);
- ceph_decode_32(p, r->steps[j].arg2);
- }
- }
-
- /* ignore trailing name maps. */
-
- dout("crush_decode success\n");
- return c;
-
-badmem:
- err = -ENOMEM;
-bad:
- dout("crush_decode fail %d\n", err);
- crush_destroy(c);
- return ERR_PTR(err);
-}
-
-
-/*
- * osd map
- */
-void ceph_osdmap_destroy(struct ceph_osdmap *map)
-{
- dout("osdmap_destroy %p\n", map);
- if (map->crush)
- crush_destroy(map->crush);
- while (!RB_EMPTY_ROOT(&map->pg_temp))
- rb_erase(rb_first(&map->pg_temp), &map->pg_temp);
- kfree(map->osd_state);
- kfree(map->osd_weight);
- kfree(map->pg_pool);
- kfree(map->osd_addr);
- kfree(map);
-}
-
-/*
- * adjust max osd value. reallocate arrays.
- */
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
-{
- u8 *state;
- struct ceph_entity_addr *addr;
- u32 *weight;
-
- state = kcalloc(max, sizeof(*state), GFP_NOFS);
- addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
- weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
- if (state == NULL || addr == NULL || weight == NULL) {
- kfree(state);
- kfree(addr);
- kfree(weight);
- return -ENOMEM;
- }
-
- /* copy old? */
- if (map->osd_state) {
- memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
- memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
- memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
- kfree(map->osd_state);
- kfree(map->osd_addr);
- kfree(map->osd_weight);
- }
-
- map->osd_state = state;
- map->osd_weight = weight;
- map->osd_addr = addr;
- map->max_osd = max;
- return 0;
-}
-
-/*
- * Insert a new pg_temp mapping
- */
-static void __insert_pg_mapping(struct ceph_pg_mapping *new,
- struct rb_root *root)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct ceph_pg_mapping *pg = NULL;
-
- while (*p) {
- parent = *p;
- pg = rb_entry(parent, struct ceph_pg_mapping, node);
- if (new->pgid < pg->pgid)
- p = &(*p)->rb_left;
- else if (new->pgid > pg->pgid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, root);
-}
-
-/*
- * decode a full map.
- */
-struct ceph_osdmap *osdmap_decode(void **p, void *end)
-{
- struct ceph_osdmap *map;
- u16 version;
- u32 len, max, i;
- int err = -EINVAL;
- void *start = *p;
-
- dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-
- map = kzalloc(sizeof(*map), GFP_NOFS);
- if (map == NULL)
- return ERR_PTR(-ENOMEM);
- map->pg_temp = RB_ROOT;
-
- ceph_decode_16_safe(p, end, version, bad);
-
- ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
- ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
- ceph_decode_32(p, map->epoch);
- ceph_decode_copy(p, &map->created, sizeof(map->created));
- ceph_decode_copy(p, &map->modified, sizeof(map->modified));
-
- ceph_decode_32(p, map->num_pools);
- map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool),
- GFP_NOFS);
- if (!map->pg_pool) {
- err = -ENOMEM;
- goto bad;
- }
- ceph_decode_32_safe(p, end, max, bad);
- while (max--) {
- ceph_decode_need(p, end, 4+sizeof(map->pg_pool->v), bad);
- ceph_decode_32(p, i);
- if (i >= map->num_pools)
- goto bad;
- ceph_decode_copy(p, &map->pg_pool[i].v,
- sizeof(map->pg_pool->v));
- calc_pg_masks(&map->pg_pool[i]);
- p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64);
- p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals)
- * sizeof(u64) * 2;
- }
-
- ceph_decode_32_safe(p, end, map->flags, bad);
-
- ceph_decode_32(p, max);
-
- /* (re)alloc osd arrays */
- err = osdmap_set_max_osd(map, max);
- if (err < 0)
- goto bad;
- dout("osdmap_decode max_osd = %d\n", map->max_osd);
-
- /* osds */
- err = -EINVAL;
- ceph_decode_need(p, end, 3*sizeof(u32) +
- map->max_osd*(1 + sizeof(*map->osd_weight) +
- sizeof(*map->osd_addr)), bad);
- *p += 4; /* skip length field (should match max) */
- ceph_decode_copy(p, map->osd_state, map->max_osd);
-
- *p += 4; /* skip length field (should match max) */
- for (i = 0; i < map->max_osd; i++)
- ceph_decode_32(p, map->osd_weight[i]);
-
- *p += 4; /* skip length field (should match max) */
- ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
-
- /* pg_temp */
- ceph_decode_32_safe(p, end, len, bad);
- for (i = 0; i < len; i++) {
- int n, j;
- u64 pgid;
- struct ceph_pg_mapping *pg;
-
- ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
- ceph_decode_64(p, pgid);
- ceph_decode_32(p, n);
- ceph_decode_need(p, end, n * sizeof(u32), bad);
- pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
- if (!pg) {
- err = -ENOMEM;
- goto bad;
- }
- pg->pgid = pgid;
- pg->len = n;
- for (j = 0; j < n; j++)
- ceph_decode_32(p, pg->osds[j]);
-
- __insert_pg_mapping(pg, &map->pg_temp);
- dout(" added pg_temp %llx len %d\n", pgid, len);
- }
-
- /* crush */
- ceph_decode_32_safe(p, end, len, bad);
- dout("osdmap_decode crush len %d from off 0x%x\n", len,
- (int)(*p - start));
- ceph_decode_need(p, end, len, bad);
- map->crush = crush_decode(*p, end);
- *p += len;
- if (IS_ERR(map->crush)) {
- err = PTR_ERR(map->crush);
- map->crush = NULL;
- goto bad;
- }
-
- /* ignore the rest of the map */
- *p = end;
-
- dout("osdmap_decode done %p %p\n", *p, end);
- return map;
-
-bad:
- dout("osdmap_decode fail\n");
- ceph_osdmap_destroy(map);
- return ERR_PTR(err);
-}
-
-/*
- * decode and apply an incremental map update.
- */
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
- struct ceph_osdmap *map,
- struct ceph_messenger *msgr)
-{
- struct ceph_osdmap *newmap = map;
- struct crush_map *newcrush = NULL;
- struct ceph_fsid fsid;
- u32 epoch = 0;
- struct ceph_timespec modified;
- u32 len, pool;
- __s32 new_flags, max;
- void *start = *p;
- int err = -EINVAL;
- u16 version;
- struct rb_node *rbp;
-
- ceph_decode_16_safe(p, end, version, bad);
-
- ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
- bad);
- ceph_decode_copy(p, &fsid, sizeof(fsid));
- ceph_decode_32(p, epoch);
- BUG_ON(epoch != map->epoch+1);
- ceph_decode_copy(p, &modified, sizeof(modified));
- ceph_decode_32(p, new_flags);
-
- /* full map? */
- ceph_decode_32_safe(p, end, len, bad);
- if (len > 0) {
- dout("apply_incremental full map len %d, %p to %p\n",
- len, *p, end);
- newmap = osdmap_decode(p, min(*p+len, end));
- return newmap; /* error or not */
- }
-
- /* new crush? */
- ceph_decode_32_safe(p, end, len, bad);
- if (len > 0) {
- dout("apply_incremental new crush map len %d, %p to %p\n",
- len, *p, end);
- newcrush = crush_decode(*p, min(*p+len, end));
- if (IS_ERR(newcrush))
- return ERR_PTR(PTR_ERR(newcrush));
- }
-
- /* new flags? */
- if (new_flags >= 0)
- map->flags = new_flags;
-
- ceph_decode_need(p, end, 5*sizeof(u32), bad);
-
- /* new max? */
- ceph_decode_32(p, max);
- if (max >= 0) {
- err = osdmap_set_max_osd(map, max);
- if (err < 0)
- goto bad;
- }
-
- map->epoch++;
- map->modified = map->modified;
- if (newcrush) {
- if (map->crush)
- crush_destroy(map->crush);
- map->crush = newcrush;
- newcrush = NULL;
- }
-
- /* new_pool */
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- ceph_decode_32_safe(p, end, pool, bad);
- if (pool >= map->num_pools) {
- void *pg_pool = kcalloc(pool + 1,
- sizeof(*map->pg_pool),
- GFP_NOFS);
- if (!pg_pool) {
- err = -ENOMEM;
- goto bad;
- }
- memcpy(pg_pool, map->pg_pool,
- map->num_pools * sizeof(*map->pg_pool));
- kfree(map->pg_pool);
- map->pg_pool = pg_pool;
- map->num_pools = pool+1;
- }
- ceph_decode_copy(p, &map->pg_pool[pool].v,
- sizeof(map->pg_pool->v));
- calc_pg_masks(&map->pg_pool[pool]);
- }
-
- /* old_pool (ignore) */
- ceph_decode_32_safe(p, end, len, bad);
- *p += len * sizeof(u32);
-
- /* new_up */
- err = -EINVAL;
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- u32 osd;
- struct ceph_entity_addr addr;
- ceph_decode_32_safe(p, end, osd, bad);
- ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
- pr_info("osd%d up\n", osd);
- BUG_ON(osd >= map->max_osd);
- map->osd_state[osd] |= CEPH_OSD_UP;
- map->osd_addr[osd] = addr;
- }
-
- /* new_down */
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- u32 osd;
- ceph_decode_32_safe(p, end, osd, bad);
- (*p)++; /* clean flag */
- pr_info("ceph osd%d down\n", osd);
- if (osd < map->max_osd)
- map->osd_state[osd] &= ~CEPH_OSD_UP;
- }
-
- /* new_weight */
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- u32 osd, off;
- ceph_decode_need(p, end, sizeof(u32)*2, bad);
- ceph_decode_32(p, osd);
- ceph_decode_32(p, off);
- pr_info("osd%d weight 0x%x %s\n", osd, off,
- off == CEPH_OSD_IN ? "(in)" :
- (off == CEPH_OSD_OUT ? "(out)" : ""));
- if (osd < map->max_osd)
- map->osd_weight[osd] = off;
- }
-
- /* new_pg_temp */
- rbp = rb_first(&map->pg_temp);
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- struct ceph_pg_mapping *pg;
- int j;
- u64 pgid;
- u32 pglen;
- ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
- ceph_decode_64(p, pgid);
- ceph_decode_32(p, pglen);
-
- /* remove any? */
- while (rbp && rb_entry(rbp, struct ceph_pg_mapping,
- node)->pgid <= pgid) {
- struct rb_node *cur = rbp;
- rbp = rb_next(rbp);
- dout(" removed pg_temp %llx\n",
- rb_entry(cur, struct ceph_pg_mapping, node)->pgid);
- rb_erase(cur, &map->pg_temp);
- }
-
- if (pglen) {
- /* insert */
- ceph_decode_need(p, end, pglen*sizeof(u32), bad);
- pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
- if (!pg) {
- err = -ENOMEM;
- goto bad;
- }
- pg->pgid = pgid;
- pg->len = pglen;
- for (j = 0; j < len; j++)
- ceph_decode_32(p, pg->osds[j]);
- __insert_pg_mapping(pg, &map->pg_temp);
- dout(" added pg_temp %llx len %d\n", pgid, pglen);
- }
- }
- while (rbp) {
- struct rb_node *cur = rbp;
- rbp = rb_next(rbp);
- dout(" removed pg_temp %llx\n",
- rb_entry(cur, struct ceph_pg_mapping, node)->pgid);
- rb_erase(cur, &map->pg_temp);
- }
-
- /* ignore the rest */
- *p = end;
- return map;
-
-bad:
- pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
- epoch, (int)(*p - start), *p, start, end);
- if (newcrush)
- crush_destroy(newcrush);
- return ERR_PTR(err);
-}
-
-
-
-
-/*
- * calculate file layout from given offset, length.
- * fill in correct oid, logical length, and object extent
- * offset, length.
- *
- * for now, we write only a single su, until we can
- * pass a stride back to the caller.
- */
-void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u64 *bno,
- u64 *oxoff, u64 *oxlen)
-{
- u32 osize = le32_to_cpu(layout->fl_object_size);
- u32 su = le32_to_cpu(layout->fl_stripe_unit);
- u32 sc = le32_to_cpu(layout->fl_stripe_count);
- u32 bl, stripeno, stripepos, objsetno;
- u32 su_per_object;
- u64 t;
-
- dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
- osize, su);
- su_per_object = osize / le32_to_cpu(layout->fl_stripe_unit);
- dout("osize %u / su %u = su_per_object %u\n", osize, su,
- su_per_object);
-
- BUG_ON((su & ~PAGE_MASK) != 0);
- /* bl = *off / su; */
- t = off;
- do_div(t, su);
- bl = t;
- dout("off %llu / su %u = bl %u\n", off, su, bl);
-
- stripeno = bl / sc;
- stripepos = bl % sc;
- objsetno = stripeno / su_per_object;
-
- *bno = objsetno * sc + stripepos;
- dout("objset %u * sc %u = bno %u\n", objsetno, sc, (unsigned)*bno);
- /* *oxoff = *off / layout->fl_stripe_unit; */
- t = off;
- *oxoff = do_div(t, su);
- *oxlen = min_t(u64, *plen, su - *oxoff);
- *plen = *oxlen;
-
- dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
-}
-
-/*
- * calculate an object layout (i.e. pgid) from an oid,
- * file_layout, and osdmap
- */
-int ceph_calc_object_layout(struct ceph_object_layout *ol,
- const char *oid,
- struct ceph_file_layout *fl,
- struct ceph_osdmap *osdmap)
-{
- unsigned num, num_mask;
- union ceph_pg pgid;
- s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
- int poolid = le32_to_cpu(fl->fl_pg_pool);
- struct ceph_pg_pool_info *pool;
-
- if (poolid >= osdmap->num_pools)
- return -EIO;
- pool = &osdmap->pg_pool[poolid];
-
- if (preferred >= 0) {
- num = le32_to_cpu(pool->v.lpg_num);
- num_mask = pool->lpg_num_mask;
- } else {
- num = le32_to_cpu(pool->v.pg_num);
- num_mask = pool->pg_num_mask;
- }
-
- pgid.pg64 = 0; /* start with it zeroed out */
- pgid.pg.ps = ceph_full_name_hash(oid, strlen(oid));
- pgid.pg.preferred = preferred;
- pgid.pg.pool = le32_to_cpu(fl->fl_pg_pool);
- if (preferred >= 0)
- dout("calc_object_layout '%s' pgid %d.%xp%d (%llx)\n", oid,
- pgid.pg.pool, pgid.pg.ps, (int)preferred, pgid.pg64);
- else
- dout("calc_object_layout '%s' pgid %d.%x (%llx)\n", oid,
- pgid.pg.pool, pgid.pg.ps, pgid.pg64);
-
- ol->ol_pgid = cpu_to_le64(pgid.pg64);
- ol->ol_stripe_unit = fl->fl_object_stripe_unit;
-
- return 0;
-}
-
-/*
- * Calculate raw osd vector for the given pgid. Return pointer to osd
- * array, or NULL on failure.
- */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, union ceph_pg pgid,
- int *osds, int *num)
-{
- struct rb_node *n = osdmap->pg_temp.rb_node;
- struct ceph_pg_mapping *pg;
- struct ceph_pg_pool_info *pool;
- int ruleno;
- unsigned pps; /* placement ps */
-
- /* pg_temp? */
- while (n) {
- pg = rb_entry(n, struct ceph_pg_mapping, node);
- if (pgid.pg64 < pg->pgid)
- n = n->rb_left;
- else if (pgid.pg64 > pg->pgid)
- n = n->rb_right;
- else {
- *num = pg->len;
- return pg->osds;
- }
- }
-
- /* crush */
- if (pgid.pg.pool >= osdmap->num_pools)
- return NULL;
- pool = &osdmap->pg_pool[pgid.pg.pool];
- ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
- pool->v.type, pool->v.size);
- if (ruleno < 0) {
- pr_err("no crush rule pool %d type %d size %d\n",
- pgid.pg.pool, pool->v.type, pool->v.size);
- return NULL;
- }
-
- if (pgid.pg.preferred >= 0)
- pps = ceph_stable_mod(pgid.pg.ps,
- le32_to_cpu(pool->v.lpgp_num),
- pool->lpgp_num_mask);
- else
- pps = ceph_stable_mod(pgid.pg.ps,
- le32_to_cpu(pool->v.pgp_num),
- pool->pgp_num_mask);
- pps += pgid.pg.pool;
- *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
- min_t(int, pool->v.size, *num),
- pgid.pg.preferred, osdmap->osd_weight);
- return osds;
-}
-
-/*
- * Return primary osd for given pgid, or -1 if none.
- */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, union ceph_pg pgid)
-{
- int rawosds[10], *osds;
- int i, num = ARRAY_SIZE(rawosds);
-
- osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
- if (!osds)
- return -1;
-
- /* primary is first up osd */
- for (i = 0; i < num; i++)
- if (ceph_osd_is_up(osdmap, osds[i])) {
- return osds[i];
- break;
- }
- return -1;
-}
+++ /dev/null
-#ifndef _FS_CEPH_OSDMAP_H
-#define _FS_CEPH_OSDMAP_H
-
-#include <linux/rbtree.h>
-#include "types.h"
-#include "ceph_fs.h"
-#include "crush/crush.h"
-
-/*
- * The osd map describes the current membership of the osd cluster and
- * specifies the mapping of objects to placement groups and placement
- * groups to (sets of) osds. That is, it completely specifies the
- * (desired) distribution of all data objects in the system at some
- * point in time.
- *
- * Each map version is identified by an epoch, which increases monotonically.
- *
- * The map can be updated either via an incremental map (diff) describing
- * the change between two successive epochs, or as a fully encoded map.
- */
-struct ceph_pg_pool_info {
- struct ceph_pg_pool v;
- int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
-};
-
-struct ceph_pg_mapping {
- struct rb_node node;
- u64 pgid;
- int len;
- int osds[];
-};
-
-struct ceph_osdmap {
- struct ceph_fsid fsid;
- u32 epoch;
- u32 mkfs_epoch;
- struct ceph_timespec created, modified;
-
- u32 flags; /* CEPH_OSDMAP_* */
-
- u32 max_osd; /* size of osd_state, _offload, _addr arrays */
- u8 *osd_state; /* CEPH_OSD_* */
- u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
- struct ceph_entity_addr *osd_addr;
-
- struct rb_root pg_temp;
-
- u32 num_pools;
- struct ceph_pg_pool_info *pg_pool;
-
- /* the CRUSH map specifies the mapping of placement groups to
- * the list of osds that store+replicate them. */
- struct crush_map *crush;
-};
-
-/*
- * file layout helpers
- */
-#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
-#define ceph_file_layout_stripe_count(l) \
- ((__s32)le32_to_cpu((l).fl_stripe_count))
-#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
-#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
-#define ceph_file_layout_object_su(l) \
- ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_preferred(l) \
- ((__s32)le32_to_cpu((l).fl_pg_preferred))
-#define ceph_file_layout_pg_pool(l) \
- ((__s32)le32_to_cpu((l).fl_pg_pool))
-
-static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
-{
- return le32_to_cpu(l->fl_stripe_unit) *
- le32_to_cpu(l->fl_stripe_count);
-}
-
-/* "period" == bytes before i start on a new set of objects */
-static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
-{
- return le32_to_cpu(l->fl_object_size) *
- le32_to_cpu(l->fl_stripe_count);
-}
-
-
-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
-{
- return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
-}
-
-static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
-{
- return map && (map->flags & flag);
-}
-
-extern char *ceph_osdmap_state_str(char *str, int len, int state);
-
-static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
- int osd)
-{
- if (osd >= map->max_osd)
- return NULL;
- return &map->osd_addr[osd];
-}
-
-extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
- struct ceph_osdmap *map,
- struct ceph_messenger *msgr);
-extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
-
-/* calculate mapping of a file extent to an object */
-extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u64 *bno, u64 *oxoff, u64 *oxlen);
-
-/* calculate mapping of object to a placement group */
-extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
- const char *oid,
- struct ceph_file_layout *fl,
- struct ceph_osdmap *osdmap);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, union ceph_pg pgid);
-
-#endif
+++ /dev/null
-../include/rados.h
\ No newline at end of file
+++ /dev/null
-#
-# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.23
-# Thu Nov 29 10:07:10 2007
-#
-CONFIG_DEFCONFIG_LIST="arch/$ARCH/defconfig"
-CONFIG_GENERIC_HARDIRQS=y
-CONFIG_UML=y
-CONFIG_MMU=y
-CONFIG_NO_IOMEM=y
-# CONFIG_TRACE_IRQFLAGS_SUPPORT is not set
-CONFIG_LOCKDEP_SUPPORT=y
-# CONFIG_STACKTRACE_SUPPORT is not set
-CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_GENERIC_BUG=y
-CONFIG_IRQ_RELEASE_METHOD=y
-
-#
-# UML-specific options
-#
-# CONFIG_STATIC_LINK is not set
-CONFIG_MODE_SKAS=y
-CONFIG_UML_X86=y
-CONFIG_64BIT=y
-CONFIG_RWSEM_GENERIC_SPINLOCK=y
-CONFIG_SEMAPHORE_SLEEPERS=y
-CONFIG_TOP_ADDR=0x80000000
-CONFIG_3_LEVEL_PGTABLES=y
-CONFIG_STUB_CODE=0x7fbfffe000
-CONFIG_STUB_DATA=0x7fbffff000
-CONFIG_STUB_START=0x7fbfffe000
-# CONFIG_ARCH_HAS_SC_SIGNALS is not set
-# CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA is not set
-CONFIG_SMP_BROKEN=y
-CONFIG_GENERIC_HWEIGHT=y
-CONFIG_SELECT_MEMORY_MODEL=y
-CONFIG_FLATMEM_MANUAL=y
-# CONFIG_DISCONTIGMEM_MANUAL is not set
-# CONFIG_SPARSEMEM_MANUAL is not set
-CONFIG_FLATMEM=y
-CONFIG_FLAT_NODE_MEM_MAP=y
-# CONFIG_SPARSEMEM_STATIC is not set
-CONFIG_SPLIT_PTLOCK_CPUS=4
-CONFIG_RESOURCES_64BIT=y
-CONFIG_ZONE_DMA_FLAG=0
-CONFIG_VIRT_TO_BUS=y
-CONFIG_LD_SCRIPT_DYN=y
-CONFIG_NET=y
-CONFIG_BINFMT_ELF=y
-CONFIG_BINFMT_MISC=m
-CONFIG_HOSTFS=y
-# CONFIG_HPPFS is not set
-CONFIG_MCONSOLE=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_NEST_LEVEL=0
-CONFIG_KERNEL_STACK_ORDER=1
-CONFIG_UML_REAL_TIME_CLOCK=y
-
-#
-# General setup
-#
-CONFIG_EXPERIMENTAL=y
-CONFIG_BROKEN_ON_SMP=y
-CONFIG_INIT_ENV_ARG_LIMIT=128
-CONFIG_LOCALVERSION=""
-CONFIG_LOCALVERSION_AUTO=y
-CONFIG_SWAP=y
-CONFIG_SYSVIPC=y
-CONFIG_SYSVIPC_SYSCTL=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-# CONFIG_BSD_PROCESS_ACCT_V3 is not set
-# CONFIG_TASKSTATS is not set
-# CONFIG_USER_NS is not set
-# CONFIG_AUDIT is not set
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_SYSFS_DEPRECATED=y
-# CONFIG_RELAY is not set
-# CONFIG_BLK_DEV_INITRD is not set
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_SYSCTL=y
-# CONFIG_EMBEDDED is not set
-CONFIG_UID16=y
-CONFIG_SYSCTL_SYSCALL=y
-CONFIG_KALLSYMS=y
-# CONFIG_KALLSYMS_ALL is not set
-CONFIG_KALLSYMS_EXTRA_PASS=y
-CONFIG_HOTPLUG=y
-CONFIG_PRINTK=y
-CONFIG_BUG=y
-CONFIG_ELF_CORE=y
-CONFIG_BASE_FULL=y
-CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
-CONFIG_EPOLL=y
-CONFIG_SIGNALFD=y
-CONFIG_EVENTFD=y
-CONFIG_SHMEM=y
-CONFIG_VM_EVENT_COUNTERS=y
-CONFIG_SLAB=y
-# CONFIG_SLUB is not set
-# CONFIG_SLOB is not set
-CONFIG_RT_MUTEXES=y
-# CONFIG_TINY_SHMEM is not set
-CONFIG_BASE_SMALL=0
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_MODULE_FORCE_UNLOAD is not set
-# CONFIG_MODVERSIONS is not set
-# CONFIG_MODULE_SRCVERSION_ALL is not set
-CONFIG_KMOD=y
-CONFIG_BLOCK=y
-# CONFIG_BLK_DEV_IO_TRACE is not set
-# CONFIG_BLK_DEV_BSG is not set
-
-#
-# IO Schedulers
-#
-CONFIG_IOSCHED_NOOP=y
-CONFIG_IOSCHED_AS=y
-CONFIG_IOSCHED_DEADLINE=y
-CONFIG_IOSCHED_CFQ=y
-CONFIG_DEFAULT_AS=y
-# CONFIG_DEFAULT_DEADLINE is not set
-# CONFIG_DEFAULT_CFQ is not set
-# CONFIG_DEFAULT_NOOP is not set
-CONFIG_DEFAULT_IOSCHED="anticipatory"
-CONFIG_BLK_DEV=y
-CONFIG_BLK_DEV_UBD=y
-# CONFIG_BLK_DEV_UBD_SYNC is not set
-CONFIG_BLK_DEV_COW_COMMON=y
-# CONFIG_MMAPPER is not set
-CONFIG_BLK_DEV_LOOP=m
-# CONFIG_BLK_DEV_CRYPTOLOOP is not set
-CONFIG_BLK_DEV_NBD=m
-# CONFIG_BLK_DEV_RAM is not set
-# CONFIG_ATA_OVER_ETH is not set
-
-#
-# Character Devices
-#
-CONFIG_STDERR_CONSOLE=y
-CONFIG_STDIO_CONSOLE=y
-CONFIG_SSL=y
-CONFIG_NULL_CHAN=y
-CONFIG_PORT_CHAN=y
-CONFIG_PTY_CHAN=y
-CONFIG_TTY_CHAN=y
-CONFIG_XTERM_CHAN=y
-# CONFIG_NOCONFIG_CHAN is not set
-CONFIG_CON_ZERO_CHAN="fd:0,fd:1"
-CONFIG_CON_CHAN="xterm"
-CONFIG_SSL_CHAN="pts"
-CONFIG_UNIX98_PTYS=y
-CONFIG_LEGACY_PTYS=y
-# CONFIG_RAW_DRIVER is not set
-CONFIG_LEGACY_PTY_COUNT=256
-# CONFIG_WATCHDOG is not set
-CONFIG_UML_SOUND=m
-CONFIG_SOUND=m
-CONFIG_HOSTAUDIO=m
-# CONFIG_HW_RANDOM is not set
-CONFIG_UML_RANDOM=y
-
-#
-# Generic Driver Options
-#
-CONFIG_STANDALONE=y
-CONFIG_PREVENT_FIRMWARE_BUILD=y
-# CONFIG_FW_LOADER is not set
-# CONFIG_DEBUG_DRIVER is not set
-# CONFIG_DEBUG_DEVRES is not set
-# CONFIG_SYS_HYPERVISOR is not set
-
-#
-# Networking
-#
-
-#
-# Networking options
-#
-CONFIG_PACKET=y
-CONFIG_PACKET_MMAP=y
-CONFIG_UNIX=y
-CONFIG_XFRM=y
-# CONFIG_XFRM_USER is not set
-# CONFIG_XFRM_SUB_POLICY is not set
-# CONFIG_XFRM_MIGRATE is not set
-# CONFIG_NET_KEY is not set
-CONFIG_INET=y
-# CONFIG_IP_MULTICAST is not set
-# CONFIG_IP_ADVANCED_ROUTER is not set
-CONFIG_IP_FIB_HASH=y
-# CONFIG_IP_PNP is not set
-# CONFIG_NET_IPIP is not set
-# CONFIG_NET_IPGRE is not set
-# CONFIG_ARPD is not set
-# CONFIG_SYN_COOKIES is not set
-# CONFIG_INET_AH is not set
-# CONFIG_INET_ESP is not set
-# CONFIG_INET_IPCOMP is not set
-# CONFIG_INET_XFRM_TUNNEL is not set
-# CONFIG_INET_TUNNEL is not set
-CONFIG_INET_XFRM_MODE_TRANSPORT=y
-CONFIG_INET_XFRM_MODE_TUNNEL=y
-CONFIG_INET_XFRM_MODE_BEET=y
-CONFIG_INET_DIAG=y
-CONFIG_INET_TCP_DIAG=y
-# CONFIG_TCP_CONG_ADVANCED is not set
-CONFIG_TCP_CONG_CUBIC=y
-CONFIG_DEFAULT_TCP_CONG="cubic"
-# CONFIG_TCP_MD5SIG is not set
-# CONFIG_IPV6 is not set
-# CONFIG_INET6_XFRM_TUNNEL is not set
-# CONFIG_INET6_TUNNEL is not set
-# CONFIG_NETWORK_SECMARK is not set
-# CONFIG_NETFILTER is not set
-# CONFIG_IP_DCCP is not set
-# CONFIG_IP_SCTP is not set
-# CONFIG_TIPC is not set
-# CONFIG_ATM is not set
-# CONFIG_BRIDGE is not set
-# CONFIG_VLAN_8021Q is not set
-# CONFIG_DECNET is not set
-# CONFIG_LLC2 is not set
-# CONFIG_IPX is not set
-# CONFIG_ATALK is not set
-# CONFIG_X25 is not set
-# CONFIG_LAPB is not set
-# CONFIG_ECONET is not set
-# CONFIG_WAN_ROUTER is not set
-
-#
-# QoS and/or fair queueing
-#
-# CONFIG_NET_SCHED is not set
-
-#
-# Network testing
-#
-# CONFIG_NET_PKTGEN is not set
-# CONFIG_HAMRADIO is not set
-# CONFIG_IRDA is not set
-# CONFIG_BT is not set
-# CONFIG_AF_RXRPC is not set
-
-#
-# Wireless
-#
-# CONFIG_CFG80211 is not set
-# CONFIG_WIRELESS_EXT is not set
-# CONFIG_MAC80211 is not set
-# CONFIG_IEEE80211 is not set
-# CONFIG_RFKILL is not set
-# CONFIG_NET_9P is not set
-
-#
-# UML Network Devices
-#
-CONFIG_UML_NET=y
-CONFIG_UML_NET_ETHERTAP=y
-CONFIG_UML_NET_TUNTAP=y
-CONFIG_UML_NET_SLIP=y
-CONFIG_UML_NET_DAEMON=y
-CONFIG_UML_NET_MCAST=y
-# CONFIG_UML_NET_PCAP is not set
-CONFIG_UML_NET_SLIRP=y
-CONFIG_NETDEVICES=y
-# CONFIG_NETDEVICES_MULTIQUEUE is not set
-CONFIG_DUMMY=m
-# CONFIG_BONDING is not set
-# CONFIG_MACVLAN is not set
-# CONFIG_EQUALIZER is not set
-CONFIG_TUN=m
-
-#
-# Wireless LAN
-#
-# CONFIG_WLAN_PRE80211 is not set
-# CONFIG_WLAN_80211 is not set
-# CONFIG_WAN is not set
-CONFIG_PPP=m
-# CONFIG_PPP_MULTILINK is not set
-# CONFIG_PPP_FILTER is not set
-# CONFIG_PPP_ASYNC is not set
-# CONFIG_PPP_SYNC_TTY is not set
-# CONFIG_PPP_DEFLATE is not set
-# CONFIG_PPP_BSDCOMP is not set
-# CONFIG_PPP_MPPE is not set
-# CONFIG_PPPOE is not set
-# CONFIG_PPPOL2TP is not set
-CONFIG_SLIP=m
-# CONFIG_SLIP_COMPRESSED is not set
-CONFIG_SLHC=m
-# CONFIG_SLIP_SMART is not set
-# CONFIG_SLIP_MODE_SLIP6 is not set
-# CONFIG_SHAPER is not set
-# CONFIG_NETCONSOLE is not set
-# CONFIG_NETPOLL is not set
-# CONFIG_NET_POLL_CONTROLLER is not set
-# CONFIG_CONNECTOR is not set
-
-#
-# File systems
-#
-CONFIG_EXT2_FS=y
-# CONFIG_EXT2_FS_XATTR is not set
-# CONFIG_EXT2_FS_XIP is not set
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_FS_XATTR is not set
-# CONFIG_EXT4DEV_FS is not set
-CONFIG_JBD=y
-# CONFIG_JBD_DEBUG is not set
-CONFIG_REISERFS_FS=y
-# CONFIG_REISERFS_CHECK is not set
-# CONFIG_REISERFS_PROC_INFO is not set
-# CONFIG_REISERFS_FS_XATTR is not set
-# CONFIG_JFS_FS is not set
-# CONFIG_FS_POSIX_ACL is not set
-# CONFIG_XFS_FS is not set
-# CONFIG_GFS2_FS is not set
-# CONFIG_OCFS2_FS is not set
-# CONFIG_MINIX_FS is not set
-# CONFIG_ROMFS_FS is not set
-CONFIG_INOTIFY=y
-CONFIG_INOTIFY_USER=y
-CONFIG_QUOTA=y
-# CONFIG_QFMT_V1 is not set
-# CONFIG_QFMT_V2 is not set
-CONFIG_QUOTACTL=y
-CONFIG_DNOTIFY=y
-CONFIG_AUTOFS_FS=m
-CONFIG_AUTOFS4_FS=m
-# CONFIG_FUSE_FS is not set
-
-#
-# CD-ROM/DVD Filesystems
-#
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-# CONFIG_ZISOFS is not set
-# CONFIG_UDF_FS is not set
-
-#
-# DOS/FAT/NT Filesystems
-#
-# CONFIG_MSDOS_FS is not set
-# CONFIG_VFAT_FS is not set
-# CONFIG_NTFS_FS is not set
-
-#
-# Pseudo filesystems
-#
-CONFIG_PROC_FS=y
-CONFIG_PROC_KCORE=y
-CONFIG_PROC_SYSCTL=y
-CONFIG_SYSFS=y
-CONFIG_TMPFS=y
-# CONFIG_TMPFS_POSIX_ACL is not set
-# CONFIG_HUGETLB_PAGE is not set
-CONFIG_RAMFS=y
-# CONFIG_CONFIGFS_FS is not set
-
-#
-# Miscellaneous filesystems
-#
-# CONFIG_ADFS_FS is not set
-# CONFIG_AFFS_FS is not set
-# CONFIG_HFS_FS is not set
-# CONFIG_HFSPLUS_FS is not set
-# CONFIG_BEFS_FS is not set
-# CONFIG_BFS_FS is not set
-# CONFIG_EFS_FS is not set
-# CONFIG_CRAMFS is not set
-# CONFIG_VXFS_FS is not set
-# CONFIG_HPFS_FS is not set
-# CONFIG_QNX4FS_FS is not set
-# CONFIG_SYSV_FS is not set
-# CONFIG_UFS_FS is not set
-
-#
-# Network File Systems
-#
-CONFIG_CEPH_FS=m
-# CONFIG_NFS_FS is not set
-# CONFIG_NFSD is not set
-# CONFIG_SMB_FS is not set
-# CONFIG_CIFS is not set
-# CONFIG_NCP_FS is not set
-# CONFIG_CODA_FS is not set
-# CONFIG_AFS_FS is not set
-
-#
-# Partition Types
-#
-# CONFIG_PARTITION_ADVANCED is not set
-CONFIG_MSDOS_PARTITION=y
-
-#
-# Native Language Support
-#
-CONFIG_NLS=y
-CONFIG_NLS_DEFAULT="iso8859-1"
-# CONFIG_NLS_CODEPAGE_437 is not set
-# CONFIG_NLS_CODEPAGE_737 is not set
-# CONFIG_NLS_CODEPAGE_775 is not set
-# CONFIG_NLS_CODEPAGE_850 is not set
-# CONFIG_NLS_CODEPAGE_852 is not set
-# CONFIG_NLS_CODEPAGE_855 is not set
-# CONFIG_NLS_CODEPAGE_857 is not set
-# CONFIG_NLS_CODEPAGE_860 is not set
-# CONFIG_NLS_CODEPAGE_861 is not set
-# CONFIG_NLS_CODEPAGE_862 is not set
-# CONFIG_NLS_CODEPAGE_863 is not set
-# CONFIG_NLS_CODEPAGE_864 is not set
-# CONFIG_NLS_CODEPAGE_865 is not set
-# CONFIG_NLS_CODEPAGE_866 is not set
-# CONFIG_NLS_CODEPAGE_869 is not set
-# CONFIG_NLS_CODEPAGE_936 is not set
-# CONFIG_NLS_CODEPAGE_950 is not set
-# CONFIG_NLS_CODEPAGE_932 is not set
-# CONFIG_NLS_CODEPAGE_949 is not set
-# CONFIG_NLS_CODEPAGE_874 is not set
-# CONFIG_NLS_ISO8859_8 is not set
-# CONFIG_NLS_CODEPAGE_1250 is not set
-# CONFIG_NLS_CODEPAGE_1251 is not set
-# CONFIG_NLS_ASCII is not set
-# CONFIG_NLS_ISO8859_1 is not set
-# CONFIG_NLS_ISO8859_2 is not set
-# CONFIG_NLS_ISO8859_3 is not set
-# CONFIG_NLS_ISO8859_4 is not set
-# CONFIG_NLS_ISO8859_5 is not set
-# CONFIG_NLS_ISO8859_6 is not set
-# CONFIG_NLS_ISO8859_7 is not set
-# CONFIG_NLS_ISO8859_9 is not set
-# CONFIG_NLS_ISO8859_13 is not set
-# CONFIG_NLS_ISO8859_14 is not set
-# CONFIG_NLS_ISO8859_15 is not set
-# CONFIG_NLS_KOI8_R is not set
-# CONFIG_NLS_KOI8_U is not set
-# CONFIG_NLS_UTF8 is not set
-
-#
-# Distributed Lock Manager
-#
-# CONFIG_DLM is not set
-
-#
-# Security options
-#
-# CONFIG_KEYS is not set
-# CONFIG_SECURITY is not set
-# CONFIG_CRYPTO is not set
-
-#
-# Library routines
-#
-CONFIG_BITREVERSE=m
-# CONFIG_CRC_CCITT is not set
-# CONFIG_CRC16 is not set
-# CONFIG_CRC_ITU_T is not set
-CONFIG_CRC32=m
-# CONFIG_CRC7 is not set
-# CONFIG_LIBCRC32C is not set
-CONFIG_PLIST=y
-CONFIG_HAS_DMA=y
-
-#
-# SCSI device support
-#
-# CONFIG_RAID_ATTRS is not set
-# CONFIG_SCSI is not set
-# CONFIG_SCSI_DMA is not set
-# CONFIG_SCSI_NETLINK is not set
-# CONFIG_MD is not set
-# CONFIG_INPUT is not set
-
-#
-# Kernel hacking
-#
-# CONFIG_PRINTK_TIME is not set
-CONFIG_ENABLE_MUST_CHECK=y
-# CONFIG_UNUSED_SYMBOLS is not set
-# CONFIG_DEBUG_FS is not set
-CONFIG_DEBUG_KERNEL=y
-# CONFIG_DEBUG_SHIRQ is not set
-CONFIG_DETECT_SOFTLOCKUP=y
-CONFIG_SCHED_DEBUG=y
-# CONFIG_SCHEDSTATS is not set
-# CONFIG_TIMER_STATS is not set
-CONFIG_DEBUG_SLAB=y
-# CONFIG_DEBUG_SLAB_LEAK is not set
-# CONFIG_DEBUG_RT_MUTEXES is not set
-# CONFIG_RT_MUTEX_TESTER is not set
-CONFIG_DEBUG_SPINLOCK=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_DEBUG_SPINLOCK_SLEEP=y
-# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
-# CONFIG_DEBUG_KOBJECT is not set
-CONFIG_DEBUG_BUGVERBOSE=y
-CONFIG_DEBUG_INFO=y
-# CONFIG_DEBUG_VM is not set
-CONFIG_DEBUG_LIST=y
-CONFIG_FRAME_POINTER=y
-CONFIG_FORCED_INLINING=y
-# CONFIG_RCU_TORTURE_TEST is not set
-# CONFIG_FAULT_INJECTION is not set
-# CONFIG_GPROF is not set
-# CONFIG_GCOV is not set
-# CONFIG_DEBUG_STACK_USAGE is not set
+++ /dev/null
-#include "ceph_debug.h"
-
-#include <linux/radix-tree.h>
-#include <linux/sort.h>
-
-#include "super.h"
-#include "decode.h"
-
-/*
- * Snapshots in ceph are driven in large part by cooperation from the
- * client. In contrast to local file systems or file servers that
- * implement snapshots at a single point in the system, ceph's
- * distributed access to storage requires clients to help decide
- * whether a write logically occurs before or after a recently created
- * snapshot.
- *
- * This provides a perfect instantaneous client-wide snapshot.  Between
- * clients, however, snapshots may appear to be applied at slightly
- * different points in time, depending on delays in delivering the
- * snapshot notification.
- *
- * Snapshots are _not_ file system-wide.  Instead, each snapshot
- * applies to the subdirectory nested beneath some directory.  This
- * effectively divides the hierarchy into multiple "realms," where all
- * of the files contained by each realm share the same set of
- * snapshots.  An individual realm's snap set contains snapshots
- * explicitly created on that realm, as well as any snaps in its
- * parent's snap set _after_ the point at which the parent became its
- * parent (due to, say, a rename).  Similarly, snaps from prior parents
- * during the time intervals during which they were the parent are included.
- *
- * The client is spared most of this detail, fortunately... it need only
- * maintain a hierarchy of realms reflecting the current parent/child
- * realm relationship, and for each realm keep an explicit list of snaps
- * inherited from prior parents.
- *
- * A snap_realm struct is maintained for realms containing every inode
- * with an open cap in the system. (The needed snap realm information is
- * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
- * version number is used to ensure that as realm parameters change (new
- * snapshot, new parent, etc.) the client's realm hierarchy is updated.
- *
- * The realm hierarchy drives the generation of a 'snap context' for each
- * realm, which simply lists the resulting set of snaps for the realm. This
- * is attached to any writes sent to OSDs.
- */
-/*
- * Unfortunately error handling is a bit mixed here. If we get a snap
- * update, but don't have enough memory to update our realm hierarchy,
- * it's not clear what we can do about it (besides complaining to the
- * console).
- */
-
-
-/*
- * ceph_get_snap_realm - take a reference on a snap realm
- * @mdsc: mds client owning the realm tree and the empty-realm list
- * @realm: realm to reference
- *
- * A realm whose count is currently zero may be parked on
- * mdsc->snap_empty waiting to be reaped, so it is unlinked from that
- * list (under snap_empty_lock) before the count is raised.
- *
- * Caller must hold snap_rwsem for write.
- */
-void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
-			 struct ceph_snap_realm *realm)
-{
-	dout("get_realm %p %d -> %d\n", realm,
-	     atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
-
-	if (atomic_read(&realm->nref) != 0)
-		goto take_ref;
-
-	/*
-	 * 0 -> 1 transition.  Refs are only taken, and the empty list is
-	 * only drained, with snap_rwsem held for write, so nobody can be
-	 * reaping this realm right now.  ceph_put_snap_realm() may still
-	 * be adding other realms to the list concurrently, hence the
-	 * spinlock.
-	 */
-	spin_lock(&mdsc->snap_empty_lock);
-	list_del_init(&realm->empty_item);
-	spin_unlock(&mdsc->snap_empty_lock);
-
-take_ref:
-	atomic_inc(&realm->nref);
-}
-
-/*
- * ceph_create_snap_realm - allocate the realm rooted at @ino and index it
- * @mdsc: mds client owning the snap_realms radix tree
- * @ino: inode number the realm is rooted at
- *
- * The new realm starts with nref == 0: the radix tree does not hold a
- * reference, so the caller must ceph_get_snap_realm() it if it wants
- * to keep it.
- *
- * The realm is fully initialised _before_ it is inserted into the tree,
- * and an insertion failure is reported instead of being ignored (an
- * un-indexed realm could never be looked up or destroyed again).
- *
- * Returns the realm, or ERR_PTR(-ENOMEM) / ERR_PTR(error from
- * radix_tree_insert()) on failure.
- *
- * caller must hold snap_rwsem for write.
- */
-static struct ceph_snap_realm *ceph_create_snap_realm(
-	struct ceph_mds_client *mdsc,
-	u64 ino)
-{
-	struct ceph_snap_realm *realm;
-	int err;
-
-	realm = kzalloc(sizeof(*realm), GFP_NOFS);
-	if (!realm)
-		return ERR_PTR(-ENOMEM);
-
-	atomic_set(&realm->nref, 0);    /* tree does not take a ref */
-	realm->ino = ino;
-	INIT_LIST_HEAD(&realm->children);
-	INIT_LIST_HEAD(&realm->child_item);
-	INIT_LIST_HEAD(&realm->empty_item);
-	INIT_LIST_HEAD(&realm->inodes_with_caps);
-	spin_lock_init(&realm->inodes_with_caps_lock);
-
-	err = radix_tree_insert(&mdsc->snap_realms, ino, realm);
-	if (err) {
-		kfree(realm);
-		return ERR_PTR(err);
-	}
-
-	dout("create_snap_realm %llx %p\n", realm->ino, realm);
-	return realm;
-}
-
-/*
- * ceph_lookup_snap_realm - find the realm rooted at @ino
- * @mdsc: mds client owning the snap_realms radix tree
- * @ino: inode number the realm is rooted at
- *
- * Returns the realm, or NULL if none is indexed under @ino.  No
- * reference is taken here; use ceph_get_snap_realm() on the result if
- * one is needed.
- *
- * caller must hold snap_rwsem for write.
- */
-struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
-					       u64 ino)
-{
-	struct ceph_snap_realm *found;
-
-	found = radix_tree_lookup(&mdsc->snap_realms, ino);
-	if (!found)
-		return NULL;
-
-	dout("lookup_snap_realm %llx %p\n", found->ino, found);
-	return found;
-}
-
-static void __put_snap_realm(struct ceph_mds_client *mdsc,
- struct ceph_snap_realm *realm);
-
-/*
- * __destroy_snap_realm - free a realm nobody references any more
- * @mdsc: mds client owning the snap_realms radix tree
- * @realm: realm to destroy
- *
- * Removes the realm from the radix tree, unlinks it from its parent's
- * child list and drops the reference it held on the parent (which may
- * in turn destroy the parent), then releases the snap arrays, the
- * cached snap context reference and the realm itself.
- *
- * called with snap_rwsem (write)
- */
-static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
-				 struct ceph_snap_realm *realm)
-{
-	struct ceph_snap_realm *parent = realm->parent;
-
-	dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
-
-	radix_tree_delete(&mdsc->snap_realms, realm->ino);
-
-	if (parent) {
-		list_del_init(&realm->child_item);
-		__put_snap_realm(mdsc, parent);
-	}
-
-	ceph_put_snap_context(realm->cached_context);
-	kfree(realm->snaps);
-	kfree(realm->prior_parent_snaps);
-	kfree(realm);
-}
-
-/*
- * __put_snap_realm - drop a realm reference, destroying it on the last put
- * @mdsc: mds client owning the realm
- * @realm: realm to release
- *
- * caller holds snap_rwsem (write), so the realm can be destroyed
- * immediately rather than being deferred to the empty list.
- */
-static void __put_snap_realm(struct ceph_mds_client *mdsc,
-			     struct ceph_snap_realm *realm)
-{
-	dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
-	     atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
-
-	if (!atomic_dec_and_test(&realm->nref))
-		return;
-
-	__destroy_snap_realm(mdsc, realm);
-}
-
-/*
- * ceph_put_snap_realm - drop a realm reference from any context
- * @mdsc: mds client owning the realm
- * @realm: realm to release
- *
- * If this was the last reference and snap_rwsem can be taken for write
- * without blocking, the realm is destroyed right away.  Otherwise
- * (for instance when the caller already holds snap_rwsem) the realm is
- * queued on mdsc->snap_empty and reaped later by
- * __cleanup_empty_realms().
- *
- * caller needn't hold any locks
- */
-void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
-			 struct ceph_snap_realm *realm)
-{
-	dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
-	     atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
-	if (!atomic_dec_and_test(&realm->nref))
-		return;
-
-	if (down_write_trylock(&mdsc->snap_rwsem)) {
-		__destroy_snap_realm(mdsc, realm);
-		up_write(&mdsc->snap_rwsem);
-	} else {
-		spin_lock(&mdsc->snap_empty_lock);
-		/*
-		 * list_add(new, head): the realm is the new entry and
-		 * snap_empty is the list.  With the arguments the other
-		 * way round the list head gets re-linked onto the realm
-		 * and every realm already queued is lost.
-		 */
-		list_add(&realm->empty_item, &mdsc->snap_empty);
-		spin_unlock(&mdsc->snap_empty_lock);
-	}
-}
-
-/*
- * __cleanup_empty_realms - reap every realm queued on mdsc->snap_empty
- * @mdsc: mds client whose empty list is drained
- *
- * Only realms whose ref count dropped to zero are on that list; realms
- * that were created but never referenced are not covered.
- *
- * snap_empty_lock is released around each __destroy_snap_realm() call
- * and the list head is re-examined afterwards, so entries added in the
- * meantime are picked up as well.
- *
- * Called under snap_rwsem (write)
- */
-static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
-{
-	struct list_head *empty = &mdsc->snap_empty;
-	struct ceph_snap_realm *victim;
-
-	spin_lock(&mdsc->snap_empty_lock);
-	for (;;) {
-		if (list_empty(empty))
-			break;
-
-		victim = list_first_entry(empty, struct ceph_snap_realm,
-					  empty_item);
-		list_del(&victim->empty_item);
-
-		spin_unlock(&mdsc->snap_empty_lock);
-		__destroy_snap_realm(mdsc, victim);
-		spin_lock(&mdsc->snap_empty_lock);
-	}
-	spin_unlock(&mdsc->snap_empty_lock);
-}
-
-/*
- * ceph_cleanup_empty_realms - locked wrapper around __cleanup_empty_realms()
- * @mdsc: mds client whose empty list is drained
- *
- * Takes snap_rwsem for write for the duration of the cleanup, so the
- * caller must not already hold it.
- */
-void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
-{
-	struct rw_semaphore *sem = &mdsc->snap_rwsem;
-
-	down_write(sem);
-	__cleanup_empty_realms(mdsc);
-	up_write(sem);
-}
-
-/*
- * adjust_snap_realm_parent - re-parent @realm under the realm @parentino
- * @mdsc: mds client owning the realm tree
- * @realm: realm whose parent is being set
- * @parentino: inode number of the (possibly not yet known) parent realm
- *
- * The parent realm is looked up and created if missing.  @realm is
- * unlinked from its previous parent's child list (dropping the ref it
- * held on it), then linked under the new parent with a fresh ref.
- *
- * Returns 1 if the parent changed, 0 if @realm already had this parent,
- * <0 if the parent realm could not be created.
- *
- * caller must hold snap_rwsem for write.
- */
-static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
-				    struct ceph_snap_realm *realm,
-				    u64 parentino)
-{
-	struct ceph_snap_realm *old_parent = realm->parent;
-	struct ceph_snap_realm *new_parent;
-
-	if (realm->parent_ino == parentino)
-		return 0;
-
-	new_parent = ceph_lookup_snap_realm(mdsc, parentino);
-	if (!new_parent)
-		new_parent = ceph_create_snap_realm(mdsc, parentino);
-	if (IS_ERR(new_parent))
-		return PTR_ERR(new_parent);
-
-	dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
-	     realm->ino, realm, realm->parent_ino, old_parent,
-	     parentino, new_parent);
-
-	/* leave the old parent, if any */
-	if (old_parent) {
-		list_del_init(&realm->child_item);
-		ceph_put_snap_realm(mdsc, old_parent);
-	}
-
-	/* join the new one */
-	realm->parent_ino = parentino;
-	realm->parent = new_parent;
-	ceph_get_snap_realm(mdsc, new_parent);
-	list_add(&realm->child_item, &new_parent->children);
-	return 1;
-}
-
-
-/*
- * sort() comparator ordering u64 values from largest to smallest:
- * returns 1 when *a < *b, -1 when *a > *b, 0 when equal.
- */
-static int cmpu64_rev(const void *a, const void *b)
-{
-	const u64 lhs = *(const u64 *)a;
-	const u64 rhs = *(const u64 *)b;
-
-	if (lhs == rhs)
-		return 0;
-	return lhs < rhs ? 1 : -1;
-}
-
-/*
- * build_snap_context - (re)build the cached snap context of @realm
- * @realm: realm whose context is wanted
- *
- * The context is the reverse-sorted union of
- *   - the parent's context entries that are >= realm->parent_since,
- *   - the realm's own snaps, and
- *   - the snaps inherited from prior parents.
- * Its seq is the larger of realm->seq and the parent context's seq.
- * The parent's context is built first if it does not exist yet.
- *
- * Returns 0 on success (including "nothing to do"), or a negative errno.
- * On failure any existing cached context is dropped so that a later
- * call starts from scratch.
- */
-static int build_snap_context(struct ceph_snap_realm *realm)
-{
-	struct ceph_snap_realm *parent = realm->parent;
-	struct ceph_snap_context *snapc;
-	int err = 0;
-	int i;
-	int num = realm->num_prior_parent_snaps + realm->num_snaps;
-
-	/*
-	 * build parent context, if it hasn't been built.
-	 * conservatively estimate that all parent snaps might be
-	 * included by us.
-	 */
-	if (parent) {
-		if (!parent->cached_context) {
-			err = build_snap_context(parent);
-			if (err)
-				goto fail;
-		}
-		num += parent->cached_context->num_snaps;
-	}
-
-	/*
-	 * do i actually need to update?  not if my context was built
-	 * for the current realm seq and my parent's context is no newer
-	 * than mine.  (a context's seq is max(realm seq, parent context
-	 * seq) at build time, see below, so an older cached seq means
-	 * the realm changed and a larger parent seq means the parent
-	 * was rebuilt after us.  this works because
-	 * rebuild_snap_realms() works _downward_ in the hierarchy after
-	 * each update.)
-	 */
-	if (realm->cached_context &&
-	    realm->cached_context->seq == realm->seq &&
-	    (!parent ||
-	     realm->cached_context->seq >= parent->cached_context->seq)) {
-		dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
-		     " (unchanged)\n",
-		     realm->ino, realm, realm->cached_context,
-		     realm->cached_context->seq,
-		     realm->cached_context->num_snaps);
-		return 0;
-	}
-
-	/*
-	 * alloc new snap context.  the size is
-	 * sizeof(*snapc) + num * sizeof(u64); reject any num for which
-	 * that sum would not fit in an unsigned long.
-	 */
-	err = -ENOMEM;
-	if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64))
-		goto fail;
-	snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
-	if (!snapc)
-		goto fail;
-	atomic_set(&snapc->nref, 1);
-
-	/* build (reverse sorted) snap vector */
-	num = 0;
-	snapc->seq = realm->seq;
-	if (parent) {
-		/* include any of parent's snaps occurring _after_ my
-		   parent became my parent */
-		for (i = 0; i < parent->cached_context->num_snaps; i++)
-			if (parent->cached_context->snaps[i] >=
-			    realm->parent_since)
-				snapc->snaps[num++] =
-					parent->cached_context->snaps[i];
-		if (parent->cached_context->seq > snapc->seq)
-			snapc->seq = parent->cached_context->seq;
-	}
-	memcpy(snapc->snaps + num, realm->snaps,
-	       sizeof(u64)*realm->num_snaps);
-	num += realm->num_snaps;
-	memcpy(snapc->snaps + num, realm->prior_parent_snaps,
-	       sizeof(u64)*realm->num_prior_parent_snaps);
-	num += realm->num_prior_parent_snaps;
-
-	sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
-	snapc->num_snaps = num;
-	dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
-	     realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
-
-	/* swap in the new context, dropping our ref on the old one */
-	if (realm->cached_context)
-		ceph_put_snap_context(realm->cached_context);
-	realm->cached_context = snapc;
-	return 0;
-
-fail:
-	/*
-	 * if we fail, clear old (incorrect) cached_context... hopefully
-	 * we'll have better luck building it later
-	 */
-	if (realm->cached_context) {
-		ceph_put_snap_context(realm->cached_context);
-		realm->cached_context = NULL;
-	}
-	pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
-	       realm, err);
-	return err;
-}
-
-/*
- * rebuild_snap_realms - refresh the snap context of a realm subtree
- * @realm: root of the subtree
- *
- * The realm itself is rebuilt first, then each child recursively, so a
- * child always sees its parent's up-to-date context.  Errors from
- * build_snap_context() are not propagated.
- */
-static void rebuild_snap_realms(struct ceph_snap_realm *realm)
-{
-	struct ceph_snap_realm *kid;
-
-	dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
-	build_snap_context(realm);
-
-	list_for_each_entry(kid, &realm->children, child_item) {
-		rebuild_snap_realms(kid);
-	}
-}
-
-
-/*
- * dup_array - replace *dst with a host-endian copy of an encoded snapid array
- * @dst: array pointer to replace; the previous array is always freed
- * @src: @num little-endian 64-bit values (may be unaligned)
- * @num: number of elements
- *
- * On return *dst is the new array, or NULL when @num is 0 or the
- * allocation failed.  Returns 0 on success, -ENOMEM on allocation
- * failure.
- */
-static int dup_array(u64 **dst, __le64 *src, int num)
-{
-	u64 *copy;
-	int i;
-
-	kfree(*dst);
-	*dst = NULL;
-
-	if (!num)
-		return 0;
-
-	copy = kcalloc(num, sizeof(u64), GFP_NOFS);
-	if (!copy)
-		return -ENOMEM;
-
-	for (i = 0; i < num; i++)
-		copy[i] = get_unaligned_le64(src + i);
-
-	*dst = copy;
-	return 0;
-}
-
-
-/*
- * ceph_queue_cap_snap - record inode state for a newly applied snapshot
- * @ci: inode the snapshot applies to
- * @snapc: snap context in effect _before_ the snapshot
- *
- * When a snapshot is applied, the size/mtime inode metadata is queued
- * in a ceph_cap_snap (one for each snapshot) until writeback
- * completes and the metadata can be flushed back to the MDS.
- *
- * However, if a (sync) write is currently in-progress when we apply
- * the snapshot, we have to wait until the write succeeds or fails
- * (and a final size/mtime is known).  In this case the
- * cap_snap->writing = 1, and is said to be "pending."  When the write
- * finishes, we __ceph_finish_cap_snap().
- *
- * Nothing is queued when a pending cap_snap already exists or when the
- * inode has neither dirty head pages nor the FILE_WR cap in use.  An
- * allocation failure is only logged.
- *
- * Caller must hold snap_rwsem for read (i.e., the realm topology won't
- * change).
- */
-void ceph_queue_cap_snap(struct ceph_inode_info *ci,
-			 struct ceph_snap_context *snapc)
-{
-	struct inode *inode = &ci->vfs_inode;
-	struct ceph_cap_snap *capsnap;
-	int used;
-
-	/* allocate up front; we cannot sleep under i_lock */
-	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
-	if (!capsnap) {
-		pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
-		return;
-	}
-
-	spin_lock(&inode->i_lock);
-	used = __ceph_caps_used(ci);
-
-	if (__ceph_have_pending_cap_snap(ci)) {
-		/* there is no point in queuing multiple "pending" cap_snaps,
-		   as no new writes are allowed to start when pending, so any
-		   writes in progress now were started before the previous
-		   cap_snap.  lucky us. */
-		dout("queue_cap_snap %p snapc %p seq %llu used %d"
-		     " already pending\n", inode, snapc, snapc->seq, used);
-		goto out;
-	}
-
-	if (!ci->i_wrbuffer_ref_head && !(used & CEPH_CAP_FILE_WR)) {
-		dout("queue_cap_snap %p nothing dirty|writing\n", inode);
-		goto out;
-	}
-
-	/* NOTE(review): igrab() result is not checked here - confirm
-	   callers always hold an inode reference */
-	igrab(inode);
-
-	atomic_set(&capsnap->nref, 1);
-	capsnap->ci = ci;
-	INIT_LIST_HEAD(&capsnap->ci_item);
-	INIT_LIST_HEAD(&capsnap->flushing_item);
-
-	capsnap->follows = snapc->seq - 1;
-	capsnap->context = ceph_get_snap_context(snapc);
-	capsnap->issued = __ceph_caps_issued(ci, NULL);
-	capsnap->dirty = __ceph_caps_dirty(ci);
-
-	capsnap->mode = inode->i_mode;
-	capsnap->uid = inode->i_uid;
-	capsnap->gid = inode->i_gid;
-
-	/* fixme? */
-	capsnap->xattr_blob = NULL;
-	capsnap->xattr_len = 0;
-
-	/* dirty page count moved from _head to this cap_snap;
-	   all subsequent writes page dirties occur _after_ this
-	   snapshot. */
-	capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
-	ci->i_wrbuffer_ref_head = 0;
-	ceph_put_snap_context(ci->i_head_snapc);
-	ci->i_head_snapc = NULL;
-	list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
-
-	if (used & CEPH_CAP_FILE_WR) {
-		dout("queue_cap_snap %p cap_snap %p snapc %p"
-		     " seq %llu used WR, now pending\n", inode,
-		     capsnap, snapc, snapc->seq);
-		capsnap->writing = 1;
-	} else {
-		/* note mtime, size NOW. */
-		__ceph_finish_cap_snap(ci, capsnap);
-	}
-
-	/* now owned by ci->i_cap_snaps; don't free it below */
-	capsnap = NULL;
-
-out:
-	kfree(capsnap);
-	spin_unlock(&inode->i_lock);
-}
-
-/*
- * __ceph_finish_cap_snap - settle the final metadata of a cap_snap
- * @ci: inode the cap_snap belongs to
- * @capsnap: cap_snap to finalize; must not be marked ->writing
- *
- * Copies the inode's current size, mtime, atime, ctime and
- * time_warp_seq into @capsnap, i.e. the values that will be flushed
- * back to the mds for this snapshot.
- *
- * Returns 0 if the cap_snap still has dirty pages and cannot be flushed
- * yet.  Otherwise the inode is appended to mdsc->snap_flush_list and 1
- * is returned.
- *
- * Caller must hold i_lock.
- */
-int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
-			    struct ceph_cap_snap *capsnap)
-{
-	struct inode *inode = &ci->vfs_inode;
-	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
-
-	BUG_ON(capsnap->writing);
-
-	capsnap->size = inode->i_size;
-	capsnap->mtime = inode->i_mtime;
-	capsnap->atime = inode->i_atime;
-	capsnap->ctime = inode->i_ctime;
-	capsnap->time_warp_seq = ci->i_time_warp_seq;
-
-	if (!capsnap->dirty_pages) {
-		dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
-		     inode, capsnap, capsnap->context,
-		     capsnap->context->seq, capsnap->size);
-
-		/*
-		 * NOTE(review): the list node is per inode, not per
-		 * cap_snap; presumably it is never already queued when
-		 * we get here - confirm against __ceph_flush_snaps().
-		 */
-		spin_lock(&mdsc->snap_flush_lock);
-		list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
-		spin_unlock(&mdsc->snap_flush_lock);
-		return 1;  /* caller may want to ceph_flush_snaps */
-	}
-
-	dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
-	     "still has %d dirty pages\n", inode, capsnap,
-	     capsnap->context, capsnap->context->seq,
-	     capsnap->size, capsnap->dirty_pages);
-	return 0;
-}
-
-
-/*
- * ceph_update_snap_trace - parse and apply a "snap trace" from the MDS
- * @mdsc: mds client owning the realm tree
- * @p: start of the encoded trace
- * @e: end of the encoded trace
- * @deletion: true if the update is a snap deletion (no cap_snaps queued)
- *
- * The trace is a sequence of ceph_mds_snap_realm records, each followed
- * by its snaps[] and prior_parent_snaps[] arrays, describing a realm
- * and then each of its ancestors up to the root.  For every record the
- * realm is looked up (or created), cap_snaps are queued for its inodes
- * if its seq advanced, its parent link is adjusted and its parameters
- * are updated.  Once the last record has been applied, snap contexts
- * are rebuilt from that realm downward if anything changed.
- *
- * A realm's seq and snap counts are only stored after the matching
- * arrays were duplicated successfully, so that an allocation failure
- * can never leave a non-zero count next to a NULL array and the same
- * trace can be re-applied later.
- *
- * Returns 0 on success, -EINVAL on a truncated trace, or another
- * negative errno (e.g. -ENOMEM).
- *
- * Caller must hold snap_rwsem for write.
- */
-int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
-			   void *p, void *e, bool deletion)
-{
-	struct ceph_mds_snap_realm *ri;    /* encoded */
-	__le64 *snaps;                     /* encoded */
-	__le64 *prior_parent_snaps;        /* encoded */
-	struct ceph_snap_realm *realm;
-	int invalidate = 0;
-	int err = -ENOMEM;
-
-	dout("update_snap_trace deletion=%d\n", deletion);
-more:
-	/* NOTE(review): the two 32-bit counts are summed before being
-	   scaled - confirm ceph_decode_need() copes with a wrapped sum */
-	ceph_decode_need(&p, e, sizeof(*ri), bad);
-	ri = p;
-	p += sizeof(*ri);
-	ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
-			    le32_to_cpu(ri->num_prior_parent_snaps)), bad);
-	snaps = p;
-	p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
-	prior_parent_snaps = p;
-	p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
-
-	realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
-	if (IS_ERR(realm)) {
-		err = PTR_ERR(realm);
-		goto fail;
-	}
-	if (!realm) {
-		realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
-		if (IS_ERR(realm)) {
-			err = PTR_ERR(realm);
-			goto fail;
-		}
-	}
-
-	if (le64_to_cpu(ri->seq) > realm->seq) {
-		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
-		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
-		/*
-		 * if the realm seq has changed, queue a cap_snap for every
-		 * inode with open caps.  we do this _before_ we update
-		 * the realm info so that we prepare for writeback under the
-		 * _previous_ snap context.
-		 *
-		 * ...unless it's a snap deletion!
-		 */
-		if (!deletion) {
-			struct ceph_inode_info *ci;
-			struct inode *lastinode = NULL;
-
-			/*
-			 * the list lock is dropped around each
-			 * ceph_queue_cap_snap() call; the reference held
-			 * on the current inode (released one iteration
-			 * late via lastinode) keeps our list position
-			 * valid meanwhile.
-			 */
-			spin_lock(&realm->inodes_with_caps_lock);
-			list_for_each_entry(ci, &realm->inodes_with_caps,
-					    i_snap_realm_item) {
-				struct inode *inode = igrab(&ci->vfs_inode);
-				if (!inode)
-					continue;
-				spin_unlock(&realm->inodes_with_caps_lock);
-				if (lastinode)
-					iput(lastinode);
-				lastinode = inode;
-				ceph_queue_cap_snap(ci, realm->cached_context);
-				spin_lock(&realm->inodes_with_caps_lock);
-			}
-			spin_unlock(&realm->inodes_with_caps_lock);
-			if (lastinode)
-				iput(lastinode);
-			dout("update_snap_trace cap_snaps queued\n");
-		}
-
-	} else {
-		dout("update_snap_trace %llx %p seq %lld unchanged\n",
-		     realm->ino, realm, realm->seq);
-	}
-
-	/* ensure the parent is correct */
-	err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
-	if (err < 0)
-		goto fail;
-	invalidate += err;
-
-	if (le64_to_cpu(ri->seq) > realm->seq) {
-		int num_snaps = le32_to_cpu(ri->num_snaps);
-		int num_prior = le32_to_cpu(ri->num_prior_parent_snaps);
-
-		/*
-		 * update snap lists.  dup_array() frees the old array
-		 * first and leaves NULL behind on failure, so the count
-		 * is zeroed before the call and only set once the new
-		 * array exists.
-		 */
-		realm->num_snaps = 0;
-		err = dup_array(&realm->snaps, snaps, num_snaps);
-		if (err < 0)
-			goto fail;
-		realm->num_snaps = num_snaps;
-
-		realm->num_prior_parent_snaps = 0;
-		err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
-				num_prior);
-		if (err < 0)
-			goto fail;
-		realm->num_prior_parent_snaps = num_prior;
-
-		/* arrays are in place: now record the new realm parameters */
-		realm->seq = le64_to_cpu(ri->seq);
-		realm->created = le64_to_cpu(ri->created);
-		realm->parent_since = le64_to_cpu(ri->parent_since);
-
-		invalidate = 1;
-	} else if (!realm->cached_context) {
-		invalidate = 1;
-	}
-
-	dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
-	     realm, invalidate, p, e);
-
-	if (p < e)
-		goto more;
-
-	/* invalidate when we reach the _end_ (root) of the trace */
-	if (invalidate)
-		rebuild_snap_realms(realm);
-
-	__cleanup_empty_realms(mdsc);
-	return 0;
-
-bad:
-	err = -EINVAL;
-fail:
-	pr_err("update_snap_trace error %d\n", err);
-	return err;
-}
-
-
-/*
- * flush_snaps - send every cap_snap that is queued for flush
- * @mdsc: mds client whose snap_flush_list is drained
- *
- * Each inode at the head of the list is handed to __ceph_flush_snaps()
- * under its i_lock, with snap_flush_lock released meanwhile.  The
- * session returned through @session by __ceph_flush_snaps() is carried
- * from one inode to the next to avoid re-taking s_mutex each time, and
- * is unlocked and released once the list is empty.
- *
- * NOTE(review): the loop only terminates if __ceph_flush_snaps()
- * removes the inode from snap_flush_list - confirm.
- *
- * Caller holds no locks.
- */
-static void flush_snaps(struct ceph_mds_client *mdsc)
-{
-	struct list_head *queue = &mdsc->snap_flush_list;
-	struct ceph_mds_session *session = NULL;
-	struct ceph_inode_info *ci;
-	struct inode *inode;
-
-	dout("flush_snaps\n");
-
-	spin_lock(&mdsc->snap_flush_lock);
-	for (;;) {
-		if (list_empty(queue))
-			break;
-
-		ci = list_first_entry(queue, struct ceph_inode_info,
-				      i_snap_flush_item);
-		inode = &ci->vfs_inode;
-		igrab(inode);
-		spin_unlock(&mdsc->snap_flush_lock);
-
-		spin_lock(&inode->i_lock);
-		__ceph_flush_snaps(ci, &session);
-		spin_unlock(&inode->i_lock);
-		iput(inode);
-
-		spin_lock(&mdsc->snap_flush_lock);
-	}
-	spin_unlock(&mdsc->snap_flush_lock);
-
-	if (session) {
-		mutex_unlock(&session->s_mutex);
-		ceph_put_mds_session(session);
-	}
-
-	dout("flush_snaps done\n");
-}
-
-
-/*
- * ceph_handle_snap - handle a snap notification from the MDS
- * @mdsc: mds client the message arrived on
- * @msg: the notification; ignored unless its source is an MDS
- *
- * This can take two basic forms: the simplest is just a snap creation
- * or deletion notification on an existing realm.  This should update the
- * realm and its children.
- *
- * The more difficult case is realm creation (CEPH_SNAP_OP_SPLIT), due
- * to snap creation at a new point in the file hierarchy, or due to a
- * rename that moves a file or directory into another realm.  The
- * message then also lists the inodes and child realms that move into
- * the new realm.
- *
- * Message layout: ceph_mds_snap_head, then (for a split)
- * num_split_inos + num_split_realms 64-bit ids, then the snap trace
- * consumed by ceph_update_snap_trace().  The split counts come off the
- * wire and are validated against the remaining message length before
- * they are used.
- *
- * Takes snap_rwsem for write while the realm hierarchy is updated, then
- * flushes any cap_snaps that became ready.
- */
-void ceph_handle_snap(struct ceph_mds_client *mdsc,
-		      struct ceph_msg *msg)
-{
-	struct super_block *sb = mdsc->client->sb;
-	struct ceph_mds_session *session;
-	int mds;
-	u64 split;
-	int op;
-	int trace_len;
-	struct ceph_snap_realm *realm = NULL;
-	void *p = msg->front.iov_base;
-	void *e = p + msg->front.iov_len;
-	struct ceph_mds_snap_head *h;
-	int num_split_inos, num_split_realms;
-	__le64 *split_inos = NULL, *split_realms = NULL;
-	int i;
-	int locked_rwsem = 0;
-
-	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
-		return;
-	mds = le64_to_cpu(msg->hdr.src.name.num);
-
-	/* decode */
-	if (msg->front.iov_len < sizeof(*h))
-		goto bad;
-	h = p;
-	op = le32_to_cpu(h->op);
-	split = le64_to_cpu(h->split);   /* non-zero if we are splitting an
-					  * existing realm */
-	num_split_inos = le32_to_cpu(h->num_split_inos);
-	num_split_realms = le32_to_cpu(h->num_split_realms);
-	trace_len = le32_to_cpu(h->trace_len);
-	p += sizeof(*h);
-
-	dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
-	     ceph_snap_op_name(op), split, trace_len);
-
-	/* find session */
-	mutex_lock(&mdsc->mutex);
-	session = __ceph_lookup_mds_session(mdsc, mds);
-	mutex_unlock(&mdsc->mutex);
-	if (!session) {
-		dout("WTF, got snap but no session for mds%d\n", mds);
-		return;
-	}
-
-	/* NOTE(review): if __ceph_lookup_mds_session() returns a counted
-	   reference it is never dropped in this function - confirm */
-	mutex_lock(&session->s_mutex);
-	session->s_seq++;
-	mutex_unlock(&session->s_mutex);
-
-	down_write(&mdsc->snap_rwsem);
-	locked_rwsem = 1;
-
-	if (op == CEPH_SNAP_OP_SPLIT) {
-		struct ceph_mds_snap_realm *ri;
-		size_t avail = (size_t)(e - p) / sizeof(u64);
-
-		/*
-		 * A "split" breaks part of an existing realm off into
-		 * a new realm.  The MDS provides a list of inodes
-		 * (with caps) and child realms that belong to the new
-		 * child.
-		 *
-		 * Both counts were read into ints from 32-bit wire
-		 * fields: refuse negative values and anything that does
-		 * not fit in what is left of the message before moving
-		 * p past the arrays.
-		 */
-		if (num_split_inos < 0 || num_split_realms < 0 ||
-		    num_split_inos > avail ||
-		    num_split_realms > avail - num_split_inos)
-			goto bad;
-		split_inos = p;
-		p += sizeof(u64) * num_split_inos;
-		split_realms = p;
-		p += sizeof(u64) * num_split_realms;
-		ceph_decode_need(&p, e, sizeof(*ri), bad);
-		/* we will peek at realm info here, but will _not_
-		 * advance p, as the realm update will occur below in
-		 * ceph_update_snap_trace. */
-		ri = p;
-
-		realm = ceph_lookup_snap_realm(mdsc, split);
-		if (IS_ERR(realm))
-			goto out;
-		if (!realm) {
-			realm = ceph_create_snap_realm(mdsc, split);
-			if (IS_ERR(realm))
-				goto out;
-		}
-		/* hold the new realm until the inodes have been moved */
-		ceph_get_snap_realm(mdsc, realm);
-
-		dout("splitting snap_realm %llx %p\n", realm->ino, realm);
-		for (i = 0; i < num_split_inos; i++) {
-			struct ceph_vino vino = {
-				.ino = le64_to_cpu(split_inos[i]),
-				.snap = CEPH_NOSNAP,
-			};
-			struct inode *inode = ceph_find_inode(sb, vino);
-			struct ceph_inode_info *ci;
-
-			if (!inode)
-				continue;
-			ci = ceph_inode(inode);
-
-			spin_lock(&inode->i_lock);
-			if (!ci->i_snap_realm)
-				goto skip_inode;
-			/*
-			 * If this inode belongs to a realm that was
-			 * created after our new realm, we experienced
-			 * a race (due to another split notifications
-			 * arriving from a different MDS).  So skip
-			 * this inode.
-			 */
-			if (ci->i_snap_realm->created >
-			    le64_to_cpu(ri->created)) {
-				dout(" leaving %p in newer realm %llx %p\n",
-				     inode, ci->i_snap_realm->ino,
-				     ci->i_snap_realm);
-				goto skip_inode;
-			}
-			dout(" will move %p to split realm %llx %p\n",
-			     inode, realm->ino, realm);
-			/*
-			 * Remove the inode from the realm's inode
-			 * list, but don't add it to the new realm
-			 * yet.  We don't want the cap_snap to be
-			 * queued (again) by ceph_update_snap_trace()
-			 * below.  Queue it _now_, under the old context.
-			 *
-			 * The list belongs to the inode's current realm,
-			 * so unlink under that realm's list lock, in the
-			 * same i_lock -> inodes_with_caps_lock order
-			 * used when the inode is added to the new realm
-			 * further down.
-			 */
-			spin_lock(&ci->i_snap_realm->inodes_with_caps_lock);
-			list_del_init(&ci->i_snap_realm_item);
-			spin_unlock(&ci->i_snap_realm->inodes_with_caps_lock);
-			spin_unlock(&inode->i_lock);
-
-			ceph_queue_cap_snap(ci,
-					    ci->i_snap_realm->cached_context);
-
-			iput(inode);
-			continue;
-
-skip_inode:
-			spin_unlock(&inode->i_lock);
-			iput(inode);
-		}
-
-		/* we may have taken some of the old realm's children. */
-		for (i = 0; i < num_split_realms; i++) {
-			struct ceph_snap_realm *child =
-				ceph_lookup_snap_realm(mdsc,
-					   le64_to_cpu(split_realms[i]));
-			if (IS_ERR(child))
-				continue;
-			if (!child)
-				continue;
-			adjust_snap_realm_parent(mdsc, child, realm->ino);
-		}
-	}
-
-	/*
-	 * update using the provided snap trace.  if we are deleting a
-	 * snap, we can avoid queueing cap_snaps.
-	 */
-	ceph_update_snap_trace(mdsc, p, e,
-			       op == CEPH_SNAP_OP_DESTROY);
-
-	if (op == CEPH_SNAP_OP_SPLIT) {
-		/*
-		 * ok, _now_ add the inodes into the new realm.
-		 */
-		for (i = 0; i < num_split_inos; i++) {
-			struct ceph_vino vino = {
-				.ino = le64_to_cpu(split_inos[i]),
-				.snap = CEPH_NOSNAP,
-			};
-			struct inode *inode = ceph_find_inode(sb, vino);
-			struct ceph_inode_info *ci;
-
-			if (!inode)
-				continue;
-			ci = ceph_inode(inode);
-			spin_lock(&inode->i_lock);
-			if (!ci->i_snap_realm)
-				goto split_skip_inode;
-			/* swap the inode's realm reference: old -> new */
-			ceph_put_snap_realm(mdsc, ci->i_snap_realm);
-			spin_lock(&realm->inodes_with_caps_lock);
-			list_add(&ci->i_snap_realm_item,
-				 &realm->inodes_with_caps);
-			ci->i_snap_realm = realm;
-			spin_unlock(&realm->inodes_with_caps_lock);
-			ceph_get_snap_realm(mdsc, realm);
-split_skip_inode:
-			spin_unlock(&inode->i_lock);
-			iput(inode);
-		}
-
-		/* we took a reference when we created the realm, above */
-		ceph_put_snap_realm(mdsc, realm);
-	}
-
-	__cleanup_empty_realms(mdsc);
-
-	up_write(&mdsc->snap_rwsem);
-
-	flush_snaps(mdsc);
-	return;
-
-bad:
-	pr_err("corrupt snap message from mds%d\n", mds);
-out:
-	if (locked_rwsem)
-		up_write(&mdsc->snap_rwsem);
-	return;
-}
-
-
-
+++ /dev/null
-
-#include "ceph_debug.h"
-
-#include <linux/backing-dev.h>
-#include <linux/fs.h>
-#include <linux/inet.h>
-#include <linux/in6.h>
-#include <linux/module.h>
-#include <linux/mount.h>
-#include <linux/parser.h>
-#include <linux/rwsem.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-#include <linux/statfs.h>
-#include <linux/string.h>
-#include <linux/version.h>
-#include <linux/vmalloc.h>
-
-#include "ceph_ver.h"
-#include "decode.h"
-#include "super.h"
-#include "mon_client.h"
-
-/*
- * Ceph superblock operations
- *
- * Handle the basics of mounting, unmounting.
- */
-
-
-/*
- * ceph_file_part - locate the final component of a path
- * @s: path (need not be NUL terminated)
- * @len: number of bytes of @s to consider
- *
- * Returns a pointer just past the last '/' within the first @len bytes
- * of @s, or @s itself if there is none (/foo/bar/baz -> baz).  A path
- * ending in '/' yields @s + @len.
- */
-const char *ceph_file_part(const char *s, int len)
-{
-	int pos = len;
-
-	/* walk back from the end until the byte before pos is a slash */
-	while (pos != 0 && s[pos - 1] != '/')
-		pos--;
-
-	return s + pos;
-}
-
-
-/*
- * super ops
- */
-static void ceph_put_super(struct super_block *s)
-{
- struct ceph_client *cl = ceph_client(s);
-
- dout("put_super\n");
- ceph_mdsc_close_sessions(&cl->mdsc);
- return;
-}
-
-/*
- * ->statfs: fill @buf from the usage figures reported by the monitor.
- *
- * Returns 0 on success or the negative error from ceph_monc_do_statfs().
- */
-static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
-	struct ceph_monmap *monmap = client->monc.monmap;
-	/* the monitor reports KB (2^10 bytes); we report 2^CEPH_BLOCK_SHIFT */
-	const int kb_to_blocks = CEPH_BLOCK_SHIFT - 10;
-	struct ceph_statfs st;
-	u64 *fsid_words;
-	u64 fsid;
-	int err;
-
-	dout("statfs\n");
-	err = ceph_monc_do_statfs(&client->monc, &st);
-	if (err < 0)
-		return err;
-
-	/* fill in kstatfs */
-	buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */
-
-	/*
-	 * express utilization in terms of large blocks to avoid
-	 * overflow on 32-bit machines.
-	 */
-	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
-	buf->f_blocks = le64_to_cpu(st.kb) >> kb_to_blocks;
-	buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
-		kb_to_blocks;
-	buf->f_bavail = le64_to_cpu(st.kb_avail) >> kb_to_blocks;
-
-	buf->f_files = le64_to_cpu(st.num_objects);
-	buf->f_ffree = -1;
-	buf->f_namelen = PATH_MAX;
-	buf->f_frsize = PAGE_CACHE_SIZE;
-
-	/*
-	 * fold the fsid into 64 bits by xor-ing its first two 64-bit
-	 * words, read without byte swapping (leave fsid little-endian,
-	 * regardless of host endianness), then split that into the two
-	 * 32-bit halves of f_fsid.
-	 */
-	fsid_words = (u64 *)&monmap->fsid;
-	fsid = fsid_words[0] ^ fsid_words[1];
-	buf->f_fsid.val[0] = fsid & 0xffffffff;
-	buf->f_fsid.val[1] = fsid >> 32;
-
-	return 0;
-}
-
-
-/*
- * ->sync_fs: sync the osd client, then the mds client.  @wait is only
- * logged.  Always returns 0.
- */
-static int ceph_syncfs(struct super_block *sb, int wait)
-{
-	struct ceph_client *client = ceph_client(sb);
-
-	dout("sync_fs %d\n", wait);
-	ceph_osdc_sync(&client->osdc);
-	ceph_mdsc_sync(&client->mdsc);
-	return 0;
-}
-
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- *
- * Emits one ",name" or ",name=value" entry for each option recorded in
- * the client's mount args.  "norbytes" is printed when CEPH_OPT_RBYTES
- * is clear, the snapdir name only when it differs from
- * CEPH_SNAPDIRNAME_DEFAULT, and the secret is never printed, only its
- * presence.
- *
- * The fsid is printed as two 64-bit halves, each as name=value; the
- * values are cast because u64 is not unsigned long long on every
- * architecture.
- *
- * Always returns 0.
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
-	struct ceph_mount_args *args = &client->mount_args;
-
-	if (args->flags & CEPH_OPT_FSID)
-		seq_printf(m, ",fsidmajor=%llu,fsidminor=%llu",
-			   (unsigned long long)
-			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
-			   (unsigned long long)
-			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
-	if (args->flags & CEPH_OPT_NOSHARE)
-		seq_puts(m, ",noshare");
-	if (args->flags & CEPH_OPT_DIRSTAT)
-		seq_puts(m, ",dirstat");
-	if ((args->flags & CEPH_OPT_RBYTES) == 0)
-		seq_puts(m, ",norbytes");
-	if (args->flags & CEPH_OPT_NOCRC)
-		seq_puts(m, ",nocrc");
-	if (args->flags & CEPH_OPT_NOASYNCREADDIR)
-		seq_puts(m, ",noasyncreaddir");
-	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
-	if (args->secret)
-		seq_puts(m, ",secret=<hidden>");
-	return 0;
-}
-
-/*
- * caches
- *
- * Slab caches for the per-inode, per-cap, per-dentry and per-file
- * structures.  Created in init_caches(), destroyed in destroy_caches().
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-/*
- * Constructor passed to kmem_cache_create() for ceph_inode_cachep: runs
- * inode_init_once() on the vfs_inode embedded in each ceph_inode_info.
- * The constructor prototype depends on the kernel version, hence the #if.
- */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27)
-static void ceph_inode_init_once(void *foo)
-#else
-static void ceph_inode_init_once(struct kmem_cache *cachep, void *foo)
-#endif
-{
- struct ceph_inode_info *ci = foo;
- inode_init_once(&ci->vfs_inode);
-}
-
-/*
- * Create the four slab caches.  Returns 0 on success.  If any creation
- * fails, the caches created before it are destroyed again and -ENOMEM
- * is returned.
- */
-static int __init init_caches(void)
-{
-	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
-				      sizeof(struct ceph_inode_info),
-				      __alignof__(struct ceph_inode_info),
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      ceph_inode_init_once);
-	if (!ceph_inode_cachep)
-		goto err_none;
-
-	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
-				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (!ceph_cap_cachep)
-		goto err_inode;
-
-	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
-					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (!ceph_dentry_cachep)
-		goto err_cap;
-
-	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
-				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-	if (!ceph_file_cachep)
-		goto err_dentry;
-
-	return 0;
-
-	/* unwind in reverse order of creation */
-err_dentry:
-	kmem_cache_destroy(ceph_dentry_cachep);
-err_cap:
-	kmem_cache_destroy(ceph_cap_cachep);
-err_inode:
-	kmem_cache_destroy(ceph_inode_cachep);
-err_none:
-	return -ENOMEM;
-}
-
-/* Destroy the slab caches set up by init_caches(). */
-static void destroy_caches(void)
-{
-	struct kmem_cache *caches[] = {
-		ceph_inode_cachep,
-		ceph_cap_cachep,
-		ceph_dentry_cachep,
-		ceph_file_cachep,
-	};
-	unsigned int i;
-
-	for (i = 0; i < sizeof(caches) / sizeof(caches[0]); i++)
-		kmem_cache_destroy(caches[i]);
-}
-
-
-/*
- * ceph_umount_begin - initiate forced umount. Tear down the
- * mount, skipping steps that may hang while waiting for server(s).
- *
- * The only state change made here is marking the client
- * CEPH_MOUNT_SHUTDOWN.  The hook's prototype changed in 2.6.26: older
- * kernels pass the vfsmount plus umount flags (and we must check
- * MNT_FORCE ourselves), newer ones pass just the super_block.
- */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 26)
-static void ceph_umount_begin(struct vfsmount *vfsmnt, int flags)
-#else
-static void ceph_umount_begin(struct super_block *sb)
-#endif
-{
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 26)
- struct ceph_client *client = ceph_sb_to_client(vfsmnt->mnt_sb);
-#else
- struct ceph_client *client = ceph_sb_to_client(sb);
-#endif
-
- dout("ceph_umount_begin - starting forced umount\n");
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 26)
- /* old prototype: called for every umount, only act on forced ones */
- if (!(flags & MNT_FORCE))
- return;
-#endif
-
- /* nothing to mark if the superblock has no client attached */
- if (!client)
- return;
-
- client->mount_state = CEPH_MOUNT_SHUTDOWN;
- return;
-}
-
-/*
- * Superblock operations, installed on each superblock by
- * ceph_set_super().  The inode alloc/destroy/write handlers are not
- * defined in this part of the file.
- */
-static const struct super_operations ceph_super_ops = {
- .alloc_inode = ceph_alloc_inode,
- .destroy_inode = ceph_destroy_inode,
- .write_inode = ceph_write_inode,
- .sync_fs = ceph_syncfs,
- .put_super = ceph_put_super,
- .show_options = ceph_show_options,
- .statfs = ceph_statfs,
- .umount_begin = ceph_umount_begin,
-};
-
-
-/*
- * Map a CEPH_MSG_* message type to a short human-readable name.
- * Types not listed here yield "unknown".
- */
-const char *ceph_msg_type_name(int type)
-{
-	const char *name = "unknown";
-
-	switch (type) {
-	case CEPH_MSG_SHUTDOWN:
-		name = "shutdown";
-		break;
-	case CEPH_MSG_PING:
-		name = "ping";
-		break;
-	case CEPH_MSG_MON_MAP:
-		name = "mon_map";
-		break;
-	case CEPH_MSG_MON_GET_MAP:
-		name = "mon_get_map";
-		break;
-	case CEPH_MSG_MON_SUBSCRIBE:
-		name = "mon_subscribe";
-		break;
-	case CEPH_MSG_MON_SUBSCRIBE_ACK:
-		name = "mon_subscribe_ack";
-		break;
-	case CEPH_MSG_CLIENT_MOUNT:
-		name = "client_mount";
-		break;
-	case CEPH_MSG_CLIENT_MOUNT_ACK:
-		name = "client_mount_ack";
-		break;
-	case CEPH_MSG_STATFS:
-		name = "statfs";
-		break;
-	case CEPH_MSG_STATFS_REPLY:
-		name = "statfs_reply";
-		break;
-	case CEPH_MSG_MDS_GETMAP:
-		name = "mds_getmap";
-		break;
-	case CEPH_MSG_MDS_MAP:
-		name = "mds_map";
-		break;
-	case CEPH_MSG_CLIENT_SESSION:
-		name = "client_session";
-		break;
-	case CEPH_MSG_CLIENT_RECONNECT:
-		name = "client_reconnect";
-		break;
-	case CEPH_MSG_CLIENT_REQUEST:
-		name = "client_request";
-		break;
-	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
-		name = "client_request_forward";
-		break;
-	case CEPH_MSG_CLIENT_REPLY:
-		name = "client_reply";
-		break;
-	case CEPH_MSG_CLIENT_CAPS:
-		name = "client_caps";
-		break;
-	case CEPH_MSG_CLIENT_CAPRELEASE:
-		name = "client_cap_release";
-		break;
-	case CEPH_MSG_CLIENT_SNAP:
-		name = "client_snap";
-		break;
-	case CEPH_MSG_CLIENT_LEASE:
-		name = "client_lease";
-		break;
-	case CEPH_MSG_OSD_GETMAP:
-		name = "osd_getmap";
-		break;
-	case CEPH_MSG_OSD_MAP:
-		name = "osd_map";
-		break;
-	case CEPH_MSG_OSD_OP:
-		name = "osd_op";
-		break;
-	case CEPH_MSG_OSD_OPREPLY:
-		name = "osd_opreply";
-		break;
-	default:
-		break;
-	}
-	return name;
-}
-
-
-/*
- * mount options
- *
- * Token ids for arg_tokens.  The order is significant:
- * parse_mount_args() compares a matched token against entries of this
- * enum to decide whether its argument is parsed as an integer, so new
- * options must be added to the group matching their argument type.
- */
-enum {
- Opt_fsidmajor,
- Opt_fsidminor,
- Opt_monport,
- Opt_wsize,
- Opt_rsize,
- Opt_osdtimeout,
- Opt_mount_timeout,
- Opt_caps_wanted_delay_min,
- Opt_caps_wanted_delay_max,
- Opt_readdir_max_entries,
- /* int args above */
- Opt_snapdirname,
- Opt_secret,
- /* string args above */
- Opt_ip,
- Opt_noshare,
- Opt_dirstat,
- Opt_nodirstat,
- Opt_rbytes,
- Opt_norbytes,
- Opt_nocrc,
- Opt_noasyncreaddir,
-};
-
-/*
- * Pattern table handed to match_token().  Only the conversions that
- * match_token() understands may be used here (%s, %d, %u, %o, %x);
- * a length-modified form such as "%ld" never matches, which would make
- * the option unusable.  The integer options are read back with
- * match_int() in parse_mount_args(), so "%d" is the matching pattern.
- */
-static match_table_t arg_tokens = {
-	{Opt_fsidmajor, "fsidmajor=%d"},
-	{Opt_fsidminor, "fsidminor=%d"},
-	{Opt_monport, "monport=%d"},
-	{Opt_wsize, "wsize=%d"},
-	{Opt_rsize, "rsize=%d"},
-	{Opt_osdtimeout, "osdtimeout=%d"},
-	{Opt_mount_timeout, "mount_timeout=%d"},
-	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
-	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
-	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
-	/* int args above */
-	{Opt_snapdirname, "snapdirname=%s"},
-	{Opt_secret, "secret=%s"},
-	/* string args above */
-	{Opt_ip, "ip=%s"},
-	{Opt_noshare, "noshare"},
-	{Opt_dirstat, "dirstat"},
-	{Opt_nodirstat, "nodirstat"},
-	{Opt_rbytes, "rbytes"},
-	{Opt_norbytes, "norbytes"},
-	{Opt_nocrc, "nocrc"},
-	{Opt_noasyncreaddir, "noasyncreaddir"},
-	{-1, NULL}
-};
-
-
-/*
- * Parse the mount device name and option string into
- * client->mount_args and build the initial monmap.
- *
- * @client:   client whose mount_args and monc.monmap are filled in
- * @flags:    superblock flags, stored in mount_args.sb_flags
- * @options:  comma-separated option string (modified in place by
- *            strsep(); may be NULL)
- * @dev_name: "ip1[:port1][,ip2[:port2]...]:/subdir/in/fs"
- * @path:     on success, points into @dev_name just past the ":/"
- *
- * Returns 0 on success or a negative errno.  Nothing is freed here on
- * failure: whatever was allocated stays attached to @client and is
- * released when the caller destroys the client.
- */
-static int parse_mount_args(struct ceph_client *client,
-			    int flags, char *options, const char *dev_name,
-			    const char **path)
-{
-	struct ceph_mount_args *args = &client->mount_args;
-	const char *c;
-	int err;
-	substring_t argstr[MAX_OPT_ARGS];
-	int num_mon;
-	struct ceph_entity_addr mon_addr[CEPH_MAX_MON_MOUNT_ADDR];
-	int i;
-
-	dout("parse_mount_args dev_name '%s'\n", dev_name);
-	memset(args, 0, sizeof(*args));
-
-	/* start with defaults */
-	args->sb_flags = flags;
-	args->flags = CEPH_OPT_DEFAULT;
-	args->osd_timeout = 5;  /* seconds */
-	args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
-	args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
-	args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
-	args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
-	args->max_readdir = 1024;
-	args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-	/* snapdir_name is dereferenced unconditionally by other code */
-	if (!args->snapdir_name)
-		return -ENOMEM;
-
-	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
-	if (!dev_name)
-		return -EINVAL;
-	*path = strstr(dev_name, ":/");
-	if (*path == NULL) {
-		pr_err("device name is missing path (no :/ in %s)\n",
-		       dev_name);
-		return -EINVAL;
-	}
-
-	/* get mon ip(s) */
-	err = ceph_parse_ips(dev_name, *path, mon_addr,
-			     CEPH_MAX_MON_MOUNT_ADDR, &num_mon);
-	if (err < 0)
-		return err;
-
-	/* build initial monmap */
-	client->monc.monmap = kzalloc(sizeof(*client->monc.monmap) +
-			num_mon*sizeof(client->monc.monmap->mon_inst[0]),
-			GFP_KERNEL);
-	if (!client->monc.monmap)
-		return -ENOMEM;
-	for (i = 0; i < num_mon; i++) {
-		client->monc.monmap->mon_inst[i].addr = mon_addr[i];
-		client->monc.monmap->mon_inst[i].addr.erank = 0;
-		client->monc.monmap->mon_inst[i].addr.nonce = 0;
-		client->monc.monmap->mon_inst[i].name.type =
-			CEPH_ENTITY_TYPE_MON;
-		client->monc.monmap->mon_inst[i].name.num = cpu_to_le64(i);
-	}
-	client->monc.monmap->num_mon = num_mon;
-	memset(&args->my_addr.in_addr, 0, sizeof(args->my_addr.in_addr));
-
-	/* path on server */
-	*path += 2;
-	dout("server path '%s'\n", *path);
-
-	/* parse mount options */
-	while ((c = strsep(&options, ",")) != NULL) {
-		int token, intval = 0, ret;
-
-		if (!*c)
-			continue;
-		token = match_token((char *)c, arg_tokens, argstr);
-		if (token < 0) {
-			pr_err("bad mount option at '%s'\n", c);
-			return -EINVAL;
-		}
-
-		/*
-		 * Only the tokens listed before Opt_snapdirname carry an
-		 * integer.  Running match_int() on the string options
-		 * (snapdirname=, secret=) fails and would skip them.
-		 */
-		if (token < Opt_snapdirname) {
-			ret = match_int(&argstr[0], &intval);
-			if (ret < 0) {
-				pr_err("bad mount option arg (not int) "
-				       "at '%s'\n", c);
-				continue;
-			}
-			dout("got token %d intval %d\n", token, intval);
-		}
-
-		switch (token) {
-		/*
-		 * NOTE(review): neither fsid option sets CEPH_OPT_FSID, so
-		 * code testing that flag never sees these values -- confirm
-		 * whether the flag should be set here.
-		 */
-		case Opt_fsidmajor:
-			*(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
-			break;
-		case Opt_fsidminor:
-			*(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
-			break;
-		case Opt_ip:
-			err = ceph_parse_ips(argstr[0].from,
-					     argstr[0].to,
-					     &args->my_addr,
-					     1, NULL);
-			if (err < 0)
-				return err;
-			args->flags |= CEPH_OPT_MYIP;
-			break;
-
-		case Opt_snapdirname:
-			/* replace the default (or an earlier snapdirname=) */
-			kfree(args->snapdir_name);
-			args->snapdir_name = kstrndup(argstr[0].from,
-					      argstr[0].to-argstr[0].from,
-					      GFP_KERNEL);
-			if (!args->snapdir_name)
-				return -ENOMEM;
-			break;
-		case Opt_secret:
-			/* a repeated secret= must not leak the earlier copy */
-			kfree(args->secret);
-			args->secret = kstrndup(argstr[0].from,
-						argstr[0].to-argstr[0].from,
-						GFP_KERNEL);
-			if (!args->secret)
-				return -ENOMEM;
-			break;
-
-			/* misc */
-		case Opt_wsize:
-			args->wsize = intval;
-			break;
-		case Opt_rsize:
-			args->rsize = intval;
-			break;
-		case Opt_osdtimeout:
-			args->osd_timeout = intval;
-			break;
-		case Opt_mount_timeout:
-			args->mount_timeout = intval;
-			break;
-		case Opt_caps_wanted_delay_min:
-			args->caps_wanted_delay_min = intval;
-			break;
-		case Opt_caps_wanted_delay_max:
-			args->caps_wanted_delay_max = intval;
-			break;
-		case Opt_readdir_max_entries:
-			args->max_readdir = intval;
-			break;
-
-		case Opt_noshare:
-			args->flags |= CEPH_OPT_NOSHARE;
-			break;
-
-		case Opt_dirstat:
-			args->flags |= CEPH_OPT_DIRSTAT;
-			break;
-		case Opt_nodirstat:
-			args->flags &= ~CEPH_OPT_DIRSTAT;
-			break;
-		case Opt_rbytes:
-			args->flags |= CEPH_OPT_RBYTES;
-			break;
-		case Opt_norbytes:
-			args->flags &= ~CEPH_OPT_RBYTES;
-			break;
-		case Opt_nocrc:
-			args->flags |= CEPH_OPT_NOCRC;
-			break;
-		case Opt_noasyncreaddir:
-			args->flags |= CEPH_OPT_NOASYNCREADDIR;
-			break;
-
-		default:
-			/* a token in arg_tokens with no case above */
-			BUG_ON(token);
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Free the strings owned by @args and clear the pointers so that a
- * second call is harmless.
- */
-static void release_mount_args(struct ceph_mount_args *args)
-{
-	char *snapdir = args->snapdir_name;
-	char *secret = args->secret;
-
-	args->snapdir_name = NULL;
-	args->secret = NULL;
-	kfree(snapdir);
-	kfree(secret);
-}
-
-/*
- * create a fresh client instance
- *
- * Allocates a zeroed ceph_client, creates its three workqueues and
- * initializes the mon, osd and mds subsystems.  Returns the client, or
- * an ERR_PTR() after undoing whatever had been set up.
- */
-static struct ceph_client *ceph_create_client(void)
-{
-	struct ceph_client *client;
-	int err = -ENOMEM;
-
-	client = kzalloc(sizeof(*client), GFP_KERNEL);
-	if (!client)
-		return ERR_PTR(-ENOMEM);
-
-	mutex_init(&client->mount_mutex);
-	init_waitqueue_head(&client->mount_wq);
-
-	/*
-	 * kzalloc() already left sb, msgr, mount_err and the signed
-	 * ticket fields zero/NULL; only non-zero state is set here.
-	 */
-	client->mount_state = CEPH_MOUNT_MOUNTING;
-	client->whoami = -1;
-
-	/* workqueues: err is still -ENOMEM for these failures */
-	client->wb_wq = create_workqueue("ceph-writeback");
-	if (!client->wb_wq)
-		goto err_free;
-	client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
-	if (!client->pg_inv_wq)
-		goto err_wb;
-	client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
-	if (!client->trunc_wq)
-		goto err_pg_inv;
-
-	/* subsystems */
-	err = ceph_monc_init(&client->monc, client);
-	if (err < 0)
-		goto err_trunc;
-	err = ceph_osdc_init(&client->osdc, client);
-	if (err < 0)
-		goto err_monc;
-	ceph_mdsc_init(&client->mdsc, client);
-	return client;
-
-err_monc:
-	ceph_monc_stop(&client->monc);
-err_trunc:
-	destroy_workqueue(client->trunc_wq);
-err_pg_inv:
-	destroy_workqueue(client->pg_inv_wq);
-err_wb:
-	destroy_workqueue(client->wb_wq);
-err_free:
-	kfree(client);
-	return ERR_PTR(err);
-}
-
-/*
- * Tear down a client created by ceph_create_client(): stop the mds,
- * mon and osd subsystems, then release the ticket, debugfs entries,
- * workqueues, messenger, writeback mempool and mount args, and finally
- * free the client itself.  msgr and wb_pagevec_pool are optional
- * because they are only set up later in the mount path.
- */
-static void ceph_destroy_client(struct ceph_client *client)
-{
- dout("destroy_client %p\n", client);
-
- /* unmount */
- ceph_mdsc_stop(&client->mdsc);
- ceph_monc_stop(&client->monc);
- ceph_osdc_stop(&client->osdc);
-
- kfree(client->signed_ticket);
-
- ceph_debugfs_client_cleanup(client);
- destroy_workqueue(client->wb_wq);
- destroy_workqueue(client->pg_inv_wq);
- destroy_workqueue(client->trunc_wq);
-
- if (client->msgr)
- ceph_messenger_destroy(client->msgr);
- if (client->wb_pagevec_pool)
- mempool_destroy(client->wb_pagevec_pool);
-
- release_mount_args(&client->mount_args);
-
- kfree(client);
- /* only the pointer value is printed; client is not dereferenced */
- dout("destroy_client %p done\n", client);
-}
-
-/*
- * true if we have the mon map (and have thus joined the cluster),
- * i.e. a monmap is attached and its epoch is non-zero.
- */
-static int have_mon_map(struct ceph_client *client)
-{
-	struct ceph_monmap *map = client->monc.monmap;
-
-	if (!map)
-		return 0;
-	return map->epoch != 0;
-}
-
-/*
- * Bootstrap mount by opening the root directory.  Note the mount
- * @started time from caller, and time out if this takes too long.
- *
- * Issues a GETATTR for @path (relative to the filesystem root) and
- * wraps the resulting inode in a dentry.
- *
- * Returns the dentry, or an ERR_PTR() on failure; NULL is never
- * returned.
- */
-static struct dentry *open_root_dentry(struct ceph_client *client,
-				       const char *path,
-				       unsigned long started)
-{
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	struct ceph_mds_request *req = NULL;
-	struct inode *inode;
-	int err;
-	struct dentry *root;
-
-	/* open dir */
-	dout("open_root_inode opening '%s'\n", path);
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
-	if (IS_ERR(req))
-		return ERR_PTR(PTR_ERR(req));
-	req->r_path1 = kstrdup(path, GFP_NOFS);
-	if (!req->r_path1) {
-		/* do not send a request that silently lost its path */
-		root = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-	req->r_ino1.ino = CEPH_INO_ROOT;
-	req->r_ino1.snap = CEPH_NOSNAP;
-	req->r_started = started;
-	req->r_timeout = client->mount_args.mount_timeout * HZ;
-	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
-	req->r_num_caps = 2;
-	err = ceph_mdsc_do_request(mdsc, NULL, req);
-	if (err != 0) {
-		root = ERR_PTR(err);
-		goto out;
-	}
-
-	dout("open_root_inode success\n");
-	/* take over the request's inode reference */
-	inode = req->r_target_inode;
-	req->r_target_inode = NULL;
-
-	if (ceph_ino(inode) == CEPH_INO_ROOT &&
-	    client->sb->s_root == NULL) {
-		root = d_alloc_root(inode);
-		if (!root) {
-			/* d_alloc_root() leaves the inode ref with us */
-			iput(inode);
-			root = ERR_PTR(-ENOMEM);
-		}
-	} else {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
-		/* consumes the inode ref and returns ERR_PTR on failure */
-		root = d_obtain_alias(inode);
-#else
-		root = d_alloc_anon(inode);
-		if (!root) {
-			/* d_alloc_anon() leaves the inode ref with us */
-			iput(inode);
-			root = ERR_PTR(-ENOMEM);
-		}
-#endif
-	}
-	dout("open_root_inode success, root dentry is %p\n", root);
-out:
-	ceph_mdsc_put_request(req);
-	return root;
-}
-
-/*
- * mount: join the ceph cluster, and open root directory.
- *
- * @client: client to mount; may already be in use by another mount
- * @mnt:    vfsmount whose mnt_root and mnt_sb are filled in on success
- * @path:   path below the filesystem root to mount ("" for the root)
- *
- * Runs under client->mount_mutex.  Returns 0 on success or a negative
- * error; the mount timeout covers both the wait for the monmap and the
- * root-open requests (via @started).
- */
-static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
- const char *path)
-{
- struct ceph_entity_addr *myaddr = NULL;
- int err;
- unsigned long timeout = client->mount_args.mount_timeout * HZ;
- unsigned long started = jiffies; /* note the start time */
- struct dentry *root;
-
- dout("mount start\n");
- mutex_lock(&client->mount_mutex);
-
- /* initialize the messenger */
- /* (only once per client; a reused client already has one) */
- if (client->msgr == NULL) {
- if (ceph_test_opt(client, MYIP))
- myaddr = &client->mount_args.my_addr;
- client->msgr = ceph_messenger_create(myaddr);
- if (IS_ERR(client->msgr)) {
- err = PTR_ERR(client->msgr);
- client->msgr = NULL;
- goto out;
- }
- client->msgr->nocrc = ceph_test_opt(client, NOCRC);
- }
-
- /* send mount request, and wait for mon, mds, and osd maps */
- err = ceph_monc_request_mount(&client->monc);
- if (err < 0)
- goto out;
-
- /*
- * NOTE(review): a mount_timeout of 0 disables the deadline check
- * below but is also passed as the wait timeout -- confirm this
- * cannot turn the loop into a busy wait.
- */
- while (!have_mon_map(client) && !client->mount_err) {
- err = -EIO;
- if (timeout && time_after_eq(jiffies, started + timeout))
- goto out;
-
- /* wait */
- dout("mount waiting for mount\n");
- err = wait_event_interruptible_timeout(client->mount_wq,
- client->mount_err || have_mon_map(client),
- timeout);
- if (err == -EINTR || err == -ERESTARTSYS)
- goto out;
- if (client->mount_err) {
- err = client->mount_err;
- goto out;
- }
- }
-
- dout("mount opening root\n");
- root = open_root_dentry(client, "", started);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto out;
- }
- /* first mount of this sb installs s_root; later ones drop their ref */
- if (client->sb->s_root)
- dput(root);
- else
- client->sb->s_root = root;
-
- if (path[0] == 0) {
- /* mounting the fs root: mnt_root needs its own reference */
- dget(root);
- } else {
- dout("mount opening base mountpoint\n");
- root = open_root_dentry(client, path, started);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- /*
- * NOTE(review): s_root is dropped even when it was
- * installed by an earlier mount sharing this sb --
- * confirm that is intended.
- */
- dput(client->sb->s_root);
- client->sb->s_root = NULL;
- goto out;
- }
- }
-
- mnt->mnt_root = root;
- mnt->mnt_sb = client->sb;
-
- client->mount_state = CEPH_MOUNT_MOUNTED;
- dout("mount success\n");
- err = 0;
-
-out:
- mutex_unlock(&client->mount_mutex);
- return err;
-}
-
-/*
- * sget() "set" callback: bind the new superblock @s to the client
- * passed as @data and install our operations.  If set_anon_super()
- * fails the binding is undone and its error returned.
- */
-static int ceph_set_super(struct super_block *s, void *data)
-{
-	struct ceph_client *client = data;
-	int ret;
-
-	dout("set_super %p data %p\n", s, data);
-
-	s->s_flags = client->mount_args.sb_flags;
-	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
-
-	s->s_fs_info = client;
-	client->sb = s;
-
-	s->s_op = &ceph_super_ops;
-	s->s_export_op = &ceph_export_ops;
-
-	s->s_time_gran = 1000;  /* 1000 ns == 1 us */
-
-	ret = set_anon_super(s, NULL);  /* what is that second arg for? */
-	if (ret != 0) {
-		/* unbind again so neither side points at the other */
-		s->s_fs_info = NULL;
-		client->sb = NULL;
-	}
-	return ret;
-}
-
-/*
- * share superblock if same fs AND options
- *
- * sget() "test" callback.  Returns 1 if the existing superblock @sb can
- * be shared with the new client passed as @data, 0 otherwise.
- */
-static int ceph_compare_super(struct super_block *sb, void *data)
-{
-	struct ceph_client *new = data;
-	struct ceph_mount_args *args = &new->mount_args;
-	struct ceph_client *other = ceph_sb_to_client(sb);
-	int i;
-
-	dout("ceph_compare_super %p\n", sb);
-	if (args->flags & CEPH_OPT_FSID) {
-		/* explicit fsid given: it must match the existing one */
-		if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
-			dout("fsid doesn't match\n");
-			return 0;
-		}
-	} else {
-		int shared = 0;
-
-		/* do we share (a) monitor? */
-		for (i = 0; i < new->monc.monmap->num_mon; i++) {
-			if (ceph_monmap_contains(other->monc.monmap,
-					&new->monc.monmap->mon_inst[i].addr)) {
-				shared = 1;
-				break;
-			}
-		}
-		if (!shared) {
-			dout("mon ip not part of monmap\n");
-			return 0;
-		}
-		dout("mon ip matches existing sb %p\n", sb);
-	}
-
-	if (args->sb_flags != other->mount_args.sb_flags) {
-		dout("flags differ\n");
-		return 0;
-	}
-	return 1;
-}
-
-/*
- * construct our own bdi so we can control readahead, etc.
- *
- * Initializes client->backing_dev_info and, on kernels >= 2.6.26,
- * registers it under the superblock's device number.  Returns the
- * error from bdi_init() or bdi_register_dev(); on older kernels the
- * (non-negative) bdi_init() result is returned.
- */
-static int ceph_init_bdi(struct super_block *sb, struct ceph_client *client)
-{
- int err;
-
- err = bdi_init(&client->backing_dev_info);
- if (err < 0)
- return err;
-
- /* set ra_pages based on rsize mount option? */
- /* rsize below one page (including unset, 0) keeps the bdi_init() value */
- if (client->mount_args.rsize >= PAGE_CACHE_SIZE)
- client->backing_dev_info.ra_pages =
- (client->mount_args.rsize + PAGE_CACHE_SIZE - 1)
- >> PAGE_SHIFT;
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 26)
- err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
-#endif
-
- return err;
-}
-
-/*
- * get_sb: mount entry point.
- *
- * A client is created and the mount arguments parsed into it before we
- * know whether it will be used.  sget() then either binds it to a new
- * superblock or finds an existing compatible one (unless "noshare" was
- * given), in which case the fresh client is destroyed and the existing
- * one reused.
- *
- * Returns 0 with @mnt filled in, or a negative error.
- */
-static int ceph_get_sb(struct file_system_type *fs_type,
-		       int flags, const char *dev_name, void *data,
-		       struct vfsmount *mnt)
-{
-	struct super_block *sb;
-	struct ceph_client *client;
-	int err;
-	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
-	const char *path;
-	int page_count;
-	size_t size;
-
-	dout("ceph_get_sb\n");
-
-	/* create client (which we may/may not use) */
-	client = ceph_create_client();
-	if (IS_ERR(client))
-		return PTR_ERR(client);
-
-	err = parse_mount_args(client, flags, data, dev_name, &path);
-	if (err < 0)
-		goto out;
-
-	if (client->mount_args.flags & CEPH_OPT_NOSHARE)
-		compare_super = NULL;
-	sb = sget(fs_type, compare_super, ceph_set_super, client);
-	if (IS_ERR(sb)) {
-		err = PTR_ERR(sb);
-		goto out;
-	}
-
-	if (ceph_client(sb) != client) {
-		ceph_destroy_client(client);
-		client = ceph_client(sb);
-		dout("get_sb got existing client %p\n", client);
-	} else {
-		dout("get_sb using new client %p\n", client);
-
-		/*
-		 * set up mempools
-		 *
-		 * Size each element in bytes for one page pointer per
-		 * page of wsize, and for at least one pointer so an
-		 * unset wsize (0) does not yield zero-byte elements.
-		 * NOTE(review): assumes the pool backs struct page *
-		 * arrays, as its name suggests -- the users are not in
-		 * this file.
-		 */
-		err = -ENOMEM;
-		page_count = client->mount_args.wsize >> PAGE_CACHE_SHIFT;
-		size = sizeof(struct page *) * (page_count > 0 ? page_count : 1);
-		client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
-		if (!client->wb_pagevec_pool)
-			goto out_splat;
-
-		err = ceph_init_bdi(sb, client);
-		if (err < 0)
-			goto out_splat;
-	}
-
-	err = ceph_mount(client, mnt, path);
-	if (err < 0)
-		goto out_splat;
-	dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
-	     mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
-	return 0;
-
-out_splat:
-	/*
-	 * The client is owned by the superblock from here on, so it is
-	 * released through deactivate_super() rather than destroyed
-	 * directly.
-	 * NOTE(review): this also closes the MDS sessions of a client
-	 * shared with an existing mount -- confirm that is intended.
-	 */
-	ceph_mdsc_close_sessions(&client->mdsc);
-	up_write(&sb->s_umount);
-	deactivate_super(sb);
-	goto out_final;
-
-out:
-	ceph_destroy_client(client);
-out_final:
-	dout("ceph_get_sb fail %d\n", err);
-	return err;
-}
-
-/*
- * kill_sb: tear down a superblock and the client bound to it.  The
- * client pointer is fetched up front and the client is destroyed only
- * after kill_anon_super() has run.
- */
-static void ceph_kill_sb(struct super_block *s)
-{
- struct ceph_client *client = ceph_sb_to_client(s);
- dout("kill_sb %p\n", s);
- ceph_mdsc_pre_umount(&client->mdsc);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 26)
- /* mirrors the bdi_register_dev() call in ceph_init_bdi() */
- bdi_unregister(&client->backing_dev_info);
-#endif
- kill_anon_super(s); /* will call put_super after sb is r/o */
- bdi_destroy(&client->backing_dev_info);
- ceph_destroy_client(client);
-}
-
-/*
- * Filesystem type "ceph"; registered in init_ceph() and unregistered
- * in exit_ceph().
- */
-static struct file_system_type ceph_fs_type = {
- .owner = THIS_MODULE,
- .name = "ceph",
- .get_sb = ceph_get_sb,
- .kill_sb = ceph_kill_sb,
- .fs_flags = FS_RENAME_DOES_D_MOVE,
-};
-
-#define _STRINGIFY(x) #x
-#define STRINGIFY(x) _STRINGIFY(x)
-
-/*
- * Module init: bring up debugfs, the messenger and the slab caches,
- * initialize caps state, then register the filesystem.  On failure the
- * steps already completed are undone in reverse order and the error is
- * returned.
- */
-static int __init init_ceph(void)
-{
-	int ret;
-
-	ret = ceph_debugfs_init();
-	if (ret < 0)
-		return ret;
-
-	ret = ceph_msgr_init();
-	if (ret < 0)
-		goto undo_debugfs;
-
-	ret = init_caches();
-	if (ret)
-		goto undo_msgr;
-
-	ceph_caps_init();
-
-	ret = register_filesystem(&ceph_fs_type);
-	if (ret)
-		goto undo_caches;
-
-	pr_info("loaded (%s)\n", STRINGIFY(CEPH_GIT_VER));
-	return 0;
-
-undo_caches:
-	destroy_caches();
-undo_msgr:
-	ceph_msgr_exit();
-undo_debugfs:
-	ceph_debugfs_cleanup();
-	return ret;
-}
-
-/*
- * Module exit: undo init_ceph() in reverse order -- unregister the
- * filesystem first, then release caps state, the slab caches, the
- * messenger and debugfs.
- */
-static void __exit exit_ceph(void)
-{
- dout("exit_ceph\n");
- unregister_filesystem(&ceph_fs_type);
- ceph_caps_finalize();
- destroy_caches();
- ceph_msgr_exit();
- ceph_debugfs_cleanup();
-}
-
-module_init(init_ceph);
-module_exit(exit_ceph);
-
-MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
-MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
-MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
-MODULE_DESCRIPTION("Ceph filesystem for Linux");
-MODULE_LICENSE("GPL");
+++ /dev/null
-#ifndef _FS_CEPH_SUPER_H
-#define _FS_CEPH_SUPER_H
-
-#include "ceph_debug.h"
-
-#include <asm/unaligned.h>
-#include <linux/backing-dev.h>
-#include <linux/completion.h>
-#include <linux/exportfs.h>
-#include <linux/fs.h>
-#include <linux/mempool.h>
-#include <linux/pagemap.h>
-#include <linux/wait.h>
-
-#include "types.h"
-#include "messenger.h"
-#include "msgpool.h"
-#include "mon_client.h"
-#include "mds_client.h"
-#include "osd_client.h"
-#include "ceph_fs.h"
-
-/* f_type in struct statfs */
-#define CEPH_SUPER_MAGIC 0x00c36400
-
-/* large granularity for statfs utilization stats to facilitate
- * large volume sizes on 32-bit machines. */
-#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
-#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
-
-/*
- * mount options
- */
-#define CEPH_OPT_FSID (1<<0)
-#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
-#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
-#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
-#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
-#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
-#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
-
-#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
-
-/*
- * Set / test a CEPH_OPT_* flag in a client's mount_args.  Each macro
- * expands to a single parenthesized expression with no trailing
- * semicolon, so both are safe in unbraced if/else bodies and inside
- * larger expressions.  ceph_test_opt() evaluates to 0 or 1.
- */
-#define ceph_set_opt(client, opt) \
-	((client)->mount_args.flags |= CEPH_OPT_##opt)
-#define ceph_test_opt(client, opt) \
-	(!!((client)->mount_args.flags & CEPH_OPT_##opt))
-
-
-#define CEPH_MAX_MON_MOUNT_ADDR 5
-
-/*
- * Per-mount options.  Zeroed, given defaults and then filled in from
- * the mount option string by parse_mount_args().
- */
-struct ceph_mount_args {
- int sb_flags; /* superblock flags given to mount; copied to s_flags */
- int flags; /* CEPH_OPT_* bits */
- int mount_timeout; /* seconds */
- int caps_wanted_delay_min, caps_wanted_delay_max;
- struct ceph_fsid fsid;
- struct ceph_entity_addr my_addr; /* from "ip=", see CEPH_OPT_MYIP */
- int wsize;
- int rsize; /* max readahead */
- int max_readdir; /* max readdir size */
- int osd_timeout; /* seconds */
- char *snapdir_name; /* default ".snap" */
- char *secret; /* kmalloc'd copy of "secret=", or NULL */
- int cap_release_safety;
-};
-
-/*
- * defaults
- */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
-#define CEPH_MOUNT_RSIZE_DEFAULT (128*1024) /* readahead */
-
-#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
-#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
-
-#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-
-/*
- * Delay telling the MDS we no longer want caps, in case we reopen
- * the file. Delay a minimum amount of time, even if we send a cap
- * message for some other reason. Otherwise, take the opportunity to
- * update the mds to avoid sending another message later.
- */
-#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
-#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
-
-
-/*
- * mount state (values of ceph_client.mount_state)
- *
- * CEPH_MOUNT_MOUNTING is 0, so a freshly zeroed client starts there.
- */
-enum {
- CEPH_MOUNT_MOUNTING,
- CEPH_MOUNT_MOUNTED,
- CEPH_MOUNT_UNMOUNTING,
- CEPH_MOUNT_UNMOUNTED,
- CEPH_MOUNT_SHUTDOWN,
-};
-
-/*
- * subtract jiffies
- *
- * Returns a - b.  It is a BUG for @b to be after @a.
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
-{
-	long delta;
-
-	BUG_ON(time_after(b, a));
-	delta = (long)a - (long)b;
-	return delta;
-}
-
-/*
- * per-filesystem client state
- *
- * possibly shared by multiple mount points, if they are
- * mounting the same ceph filesystem/cluster.
- *
- * Reached from a superblock through s_fs_info.
- */
-struct ceph_client {
- __s64 whoami; /* my client number */
- struct dentry *debugfs_monmap;
- struct dentry *debugfs_mdsmap, *debugfs_osdmap;
- struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
-
- struct mutex mount_mutex; /* serialize mount attempts */
- struct ceph_mount_args mount_args;
- struct ceph_fsid fsid;
-
- struct super_block *sb; /* back pointer to the superblock using us */
-
- unsigned long mount_state; /* CEPH_MOUNT_* */
- wait_queue_head_t mount_wq; /* waited on during mount for monmap/error */
-
- int mount_err; /* non-zero aborts a mount in progress */
- void *signed_ticket; /* our keys to the kingdom */
- int signed_ticket_len;
-
- struct ceph_messenger *msgr; /* messenger instance */
- struct ceph_mon_client monc;
- struct ceph_mds_client mdsc;
- struct ceph_osd_client osdc;
-
- /* writeback */
- mempool_t *wb_pagevec_pool;
- struct workqueue_struct *wb_wq;
- struct workqueue_struct *pg_inv_wq;
- struct workqueue_struct *trunc_wq;
-
- struct backing_dev_info backing_dev_info;
-};
-
-/* The client attached to superblock @sb (stored in s_fs_info). */
-static inline struct ceph_client *ceph_client(struct super_block *sb)
-{
-	struct ceph_client *client = sb->s_fs_info;
-
-	return client;
-}
-
-
-/*
- * File i/o capability. This tracks shared state with the metadata
- * server that allows us to cache or writeback attributes or to read
- * and write data. For any given inode, we should have one or more
- * capabilities, one issued by each metadata server, and our
- * cumulative access is the OR of all issued capabilities.
- *
- * Each cap is referenced by the inode's i_caps rbtree and by per-mds
- * session capability lists.
- */
-struct ceph_cap {
- struct ceph_inode_info *ci; /* inode this cap belongs to */
- struct rb_node ci_node; /* per-ci cap tree */
- struct ceph_mds_session *session;
- struct list_head session_caps; /* per-session caplist */
- int mds; /* presumably the rank of the issuing mds -- TODO confirm */
- u64 cap_id; /* unique cap id (mds provided) */
- int issued; /* latest, from the mds */
- int implemented; /* implemented superset of issued (for revocation) */
- int mds_wanted;
- u32 seq, issue_seq, mseq, gen;
- unsigned long last_used;
- struct list_head caps_item;
-};
-
-#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
-#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
-#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
-
-/*
- * Snapped cap state that is pending flush to mds. When a snapshot occurs,
- * we first complete any in-process sync writes and writeback any dirty
- * data before flushing the snapped state (tracked here) back to the MDS.
- *
- * Reference counted via nref; released with ceph_put_cap_snap().
- */
-struct ceph_cap_snap {
- atomic_t nref; /* freed when this drops to zero */
- struct ceph_inode_info *ci;
- struct list_head ci_item, flushing_item;
-
- u64 follows, flush_tid;
- int issued, dirty;
- struct ceph_snap_context *context;
-
- /* presumably the inode attributes as of the snapshot -- TODO confirm */
- mode_t mode;
- uid_t uid;
- gid_t gid;
-
- void *xattr_blob;
- int xattr_len;
- u64 xattr_version;
-
- u64 size;
- struct timespec mtime, atime, ctime;
- u64 time_warp_seq;
- int writing; /* a sync write is still in progress */
- int dirty_pages; /* dirty pages awaiting writeback */
-};
-
-/* Drop one reference on @capsnap and free it when the last one goes. */
-static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
-{
-	if (!atomic_dec_and_test(&capsnap->nref))
-		return;
-	kfree(capsnap);
-}
-
-/*
- * The frag tree describes how a directory is fragmented, potentially across
- * multiple metadata servers. It is also used to indicate points where
- * metadata authority is delegated, and whether/where metadata is replicated.
- *
- * A _leaf_ frag will be present in the i_fragtree IFF there is
- * delegation info. That is, if mds >= 0 || ndist > 0.
- */
-/* size of the dist[] array in struct ceph_inode_frag */
-#define CEPH_MAX_DIRFRAG_REP 4
-
-struct ceph_inode_frag {
- struct rb_node node; /* presumably linked into i_fragtree -- TODO confirm */
-
- /* fragtree state */
- u32 frag;
- int split_by; /* i.e. 2^(split_by) children */
-
- /* delegation and replication info */
- int mds; /* -1 if same authority as parent */
- int ndist; /* >0 if replicated */
- int dist[CEPH_MAX_DIRFRAG_REP];
-};
-
-/*
- * We cache inode xattrs as an encoded blob until they are first used,
- * at which point we parse them into an rbtree.
- *
- * One node of that rbtree: a single name/value pair.
- */
-struct ceph_inode_xattr {
- struct rb_node node;
-
- const char *name;
- int name_len;
- const char *val;
- int val_len;
- int dirty;
-
- /* presumably set when name/val are separately allocated -- TODO confirm */
- int should_free_name;
- int should_free_val;
-};
-
-/*
- * Per-inode xattr state: the encoded blob plus the rbtree index of
- * struct ceph_inode_xattr built from it on demand.
- */
-struct ceph_inode_xattrs_info {
- /*
- * (still encoded) xattr blob. we avoid the overhead of parsing
- * this until someone actually calls getxattr, etc.
- *
- * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
- * NULL means we don't know.
- */
- struct ceph_buffer *blob, *prealloc_blob;
-
- struct rb_root index; /* parsed xattrs */
- bool dirty;
- int count;
- int names_size;
- int vals_size;
- /* NOTE(review): index_version presumably records which blob version
- * the index was parsed from -- confirm against xattr.c */
- u64 version, index_version;
-};
-
-/*
- * Ceph inode.
- *
- * The CEPH_I_* values below are bits in i_ceph_flags; use
- * ceph_i_set(), ceph_i_clear() and ceph_i_test() to manipulate them.
- *
- * The VFS inode is embedded as the last member; ceph_inode() maps a
- * struct inode pointer back to its containing ceph_inode_info.
- */
-#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
-#define CEPH_I_NODELAY 4 /* do not delay cap release */
-#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
-
-struct ceph_inode_info {
- struct ceph_vino i_vino; /* ceph ino + snap */
-
- u64 i_version;
- u32 i_time_warp_seq;
-
- unsigned i_ceph_flags; /* CEPH_I_* bits, modified under inode->i_lock */
- unsigned long i_release_count;
-
- struct ceph_file_layout i_layout;
- char *i_symlink;
-
- /* for dirs */
- struct timespec i_rctime;
- u64 i_rbytes, i_rfiles, i_rsubdirs;
- u64 i_files, i_subdirs;
- u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
-
- struct rb_root i_fragtree;
- struct mutex i_fragtree_mutex;
-
- struct ceph_inode_xattrs_info i_xattrs;
-
- /* capabilities. protected _both_ by i_lock and cap->session's
- * s_mutex. */
- struct rb_root i_caps; /* cap list */
- struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
- unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
- struct list_head i_dirty_item, i_flushing_item;
- u64 i_cap_flush_seq;
- /* we need to track cap writeback on a per-cap-bit basis, to allow
- * overlapping, pipelined cap flushes to the mds. we can probably
- * reduce the tid to 8 bits if we're concerned about inode size. */
- u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
- wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
- unsigned long i_hold_caps_min; /* jiffies */
- unsigned long i_hold_caps_max; /* jiffies */
- struct list_head i_cap_delay_list; /* for delayed cap release to mds */
- int i_cap_exporting_mds; /* to handle cap migration between */
- unsigned i_cap_exporting_mseq; /* mds's. */
- unsigned i_cap_exporting_issued;
- struct ceph_cap_reservation i_cap_migration_resv;
- struct list_head i_cap_snaps; /* snapped state pending flush to mds */
- struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
- unsigned i_snap_caps; /* cap bits for snapped files */
-
- int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
-
- u32 i_truncate_seq; /* last truncate to smaller size */
- u64 i_truncate_size; /* and the size we last truncated down to */
- int i_truncate_pending; /* still need to call vmtruncate */
-
- u64 i_max_size; /* max file size authorized by mds */
- u64 i_reported_size; /* (max_)size reported to or requested of mds */
- u64 i_wanted_max_size; /* offset we'd like to write too */
- u64 i_requested_max_size; /* max_size we've requested */
-
- /* held references to caps */
- int i_pin_ref;
- int i_rd_ref, i_rdcache_ref, i_wr_ref;
- int i_wrbuffer_ref, i_wrbuffer_ref_head;
- u32 i_shared_gen; /* increment each time we get FILE_SHARED */
- u32 i_rdcache_gen; /* we increment this each time we get
- FILE_CACHE. If it's non-zero, we
- _may_ have cached pages. */
- u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
-
- struct list_head i_unsafe_writes; /* uncommitted sync writes */
- struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
- spinlock_t i_unsafe_lock;
-
- struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
- int i_snap_realm_counter; /* snap realm (if caps) */
- struct list_head i_snap_realm_item;
- struct list_head i_snap_flush_item;
-
- struct work_struct i_wb_work; /* writeback work */
- struct work_struct i_pg_inv_work; /* page invalidation work */
-
- struct work_struct i_vmtruncate_work;
-
- struct inode vfs_inode; /* at end */
-};
-
-static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
-{
- return list_entry(inode, struct ceph_inode_info, vfs_inode);
-}
-
-static inline void ceph_i_clear(struct inode *inode, unsigned mask)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- spin_lock(&inode->i_lock);
- ci->i_ceph_flags &= ~mask;
- spin_unlock(&inode->i_lock);
-}
-
-static inline void ceph_i_set(struct inode *inode, unsigned mask)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- spin_lock(&inode->i_lock);
- ci->i_ceph_flags |= mask;
- spin_unlock(&inode->i_lock);
-}
-
-static inline bool ceph_i_test(struct inode *inode, unsigned mask)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- bool r;
-
- smp_mb();
- r = (ci->i_ceph_flags & mask) == mask;
- return r;
-}
-
-
-/* find a specific frag @f */
-extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
- u32 f);
-
-/*
- * choose fragment for value @v. copy frag content to pfrag, if leaf
- * exists
- */
-extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
- struct ceph_inode_frag *pfrag,
- int *found);
-
-/*
- * Ceph dentry state
- */
-struct ceph_dentry_info {
- struct ceph_mds_session *lease_session;
- u32 lease_gen, lease_shared_gen;
- u32 lease_seq;
- unsigned long lease_renew_after, lease_renew_from;
- struct list_head lru;
- struct dentry *dentry;
- u64 time;
- u64 offset;
-};
-
-static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
-{
- return (struct ceph_dentry_info *)dentry->d_fsdata;
-}
-
-static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
-{
- return ((loff_t)frag << 32) | (loff_t)off;
-}
-
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * don't include snap in ino hash, at least for now.
- */
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
-{
- ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
-#if BITS_PER_LONG == 32
- ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
- if (!ino)
- ino = 1;
-#endif
- return ino;
-}
-
-static inline int ceph_set_ino_cb(struct inode *inode, void *data)
-{
- ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
- inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
- return 0;
-}
-
-static inline struct ceph_vino ceph_vino(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino;
-}
-
-/* for printf-style formatting */
-#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
-
-static inline u64 ceph_ino(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino.ino;
-}
-static inline u64 ceph_snap(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino.snap;
-}
-
-static inline int ceph_ino_compare(struct inode *inode, void *data)
-{
- struct ceph_vino *pvino = (struct ceph_vino *)data;
- struct ceph_inode_info *ci = ceph_inode(inode);
- return ci->i_vino.ino == pvino->ino &&
- ci->i_vino.snap == pvino->snap;
-}
-
-static inline struct inode *ceph_find_inode(struct super_block *sb,
- struct ceph_vino vino)
-{
- ino_t t = ceph_vino_to_ino(vino);
- return ilookup5(sb, t, ceph_ino_compare, &vino);
-}
-
-
-/*
- * caps helpers
- */
-static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
-{
- return !RB_EMPTY_ROOT(&ci->i_caps);
-}
-
-extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
-extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
-extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
- struct ceph_cap *cap);
-
-static inline int ceph_caps_issued(struct ceph_inode_info *ci)
-{
- int issued;
- spin_lock(&ci->vfs_inode.i_lock);
- issued = __ceph_caps_issued(ci, NULL);
- spin_unlock(&ci->vfs_inode.i_lock);
- return issued;
-}
-
-static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
- int touch)
-{
- int r;
- spin_lock(&ci->vfs_inode.i_lock);
- r = __ceph_caps_issued_mask(ci, mask, touch);
- spin_unlock(&ci->vfs_inode.i_lock);
- return r;
-}
-
-static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
-{
- return ci->i_dirty_caps | ci->i_flushing_caps;
-}
-extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
-
-extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
-extern int __ceph_caps_used(struct ceph_inode_info *ci);
-
-extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
-
-/*
- * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
- */
-static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
-{
- int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
- if (w & CEPH_CAP_FILE_BUFFER)
- w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
- return w;
-}
-
-/* what the mds thinks we want */
-extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
-
-extern void ceph_caps_init(void);
-extern void ceph_caps_finalize(void);
-extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
-extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
-extern void ceph_reservation_status(struct ceph_client *client,
- int *total, int *avail, int *used,
- int *reserved);
-
-static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
-{
- return (struct ceph_client *)inode->i_sb->s_fs_info;
-}
-
-static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
-{
- return (struct ceph_client *)sb->s_fs_info;
-}
-
-static inline int ceph_queue_writeback(struct inode *inode)
-{
- return queue_work(ceph_inode_to_client(inode)->wb_wq,
- &ceph_inode(inode)->i_wb_work);
-}
-
-static inline int ceph_queue_page_invalidation(struct inode *inode)
-{
- return queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
- &ceph_inode(inode)->i_pg_inv_work);
-}
-
-
-/*
- * we keep buffered readdir results attached to file->private_data
- */
-struct ceph_file_info {
- int fmode; /* initialized on open */
-
- /* readdir: position within the dir */
- u32 frag;
- struct ceph_mds_request *last_readdir;
- int at_end;
-
- /* readdir: position within a frag */
- unsigned offset; /* offset of last chunk, adjusted for . and .. */
- u64 next_offset; /* offset of next chunk (last_name's + 1) */
- char *last_name; /* last entry in previous chunk */
- struct dentry *dentry; /* next dentry (for dcache readdir) */
- unsigned long dir_release_count;
-
- /* used for -o dirstat read() on directory thing */
- char *dir_info;
- int dir_info_len;
-};
-
-
-
-/*
- * snapshots
- */
-
-/*
- * A "snap context" is the set of existing snapshots when we
- * write data. It is used by the OSD to guide its COW behavior.
- *
- * The ceph_snap_context is refcounted, and attached to each dirty
- * page, indicating which context the dirty data belonged when it was
- * dirtied.
- */
-struct ceph_snap_context {
- atomic_t nref;
- u64 seq;
- int num_snaps;
- u64 snaps[];
-};
-
-static inline struct ceph_snap_context *
-ceph_get_snap_context(struct ceph_snap_context *sc)
-{
- /*
- printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
- atomic_read(&sc->nref)+1);
- */
- if (sc)
- atomic_inc(&sc->nref);
- return sc;
-}
-
-static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
-{
- if (!sc)
- return;
- /*
- printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
- atomic_read(&sc->nref)-1);
- */
- if (atomic_dec_and_test(&sc->nref)) {
- /*printk(" deleting snap_context %p\n", sc);*/
- kfree(sc);
- }
-}
-
-/*
- * A "snap realm" describes a subset of the file hierarchy sharing
- * the same set of snapshots that apply to it. The realms themselves
- * are organized into a hierarchy, such that children inherit (some of)
- * the snapshots of their parents.
- *
- * All inodes within the realm that have capabilities are linked into a
- * per-realm list.
- */
-struct ceph_snap_realm {
- u64 ino;
- atomic_t nref;
- u64 created, seq;
- u64 parent_ino;
- u64 parent_since; /* snapid when our current parent became so */
-
- u64 *prior_parent_snaps; /* snaps inherited from any parents we */
- int num_prior_parent_snaps; /* had prior to parent_since */
- u64 *snaps; /* snaps specific to this realm */
- int num_snaps;
-
- struct ceph_snap_realm *parent;
- struct list_head children; /* list of child realms */
- struct list_head child_item;
-
- struct list_head empty_item; /* if i have ref==0 */
-
- /* the current set of snaps for this realm */
- struct ceph_snap_context *cached_context;
-
- struct list_head inodes_with_caps;
- spinlock_t inodes_with_caps_lock;
-};
-
-
-
-/*
- * calculate the number of pages a given length and offset map onto,
- * if we align the data.
- */
-static inline int calc_pages_for(u64 off, u64 len)
-{
- return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
- (off >> PAGE_CACHE_SHIFT);
-}
-
-
-
-/* snap.c */
-struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
- u64 ino);
-extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
- struct ceph_snap_realm *realm);
-extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
- struct ceph_snap_realm *realm);
-extern int ceph_update_snap_trace(struct ceph_mds_client *m,
- void *p, void *e, bool deletion);
-extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg);
-extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
- struct ceph_snap_context *snapc);
-extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
- struct ceph_cap_snap *capsnap);
-extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
-
-/*
- * a cap_snap is "pending" if it is still awaiting an in-progress
- * sync write (that may/may not still update size, mtime, etc.).
- */
-static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
-{
- return !list_empty(&ci->i_cap_snaps) &&
- list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
- ci_item)->writing;
-}
-
-
-/* super.c */
-extern struct kmem_cache *ceph_inode_cachep;
-extern struct kmem_cache *ceph_cap_cachep;
-extern struct kmem_cache *ceph_dentry_cachep;
-extern struct kmem_cache *ceph_file_cachep;
-
-extern const char *ceph_msg_type_name(int type);
-
-#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
- "%02x%02x%02x%02x%02x%02x"
-#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
- (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
- (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
- (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
-
-/* inode.c */
-extern const struct inode_operations ceph_file_iops;
-
-extern struct inode *ceph_alloc_inode(struct super_block *sb);
-extern void ceph_destroy_inode(struct inode *inode);
-
-extern struct inode *ceph_get_inode(struct super_block *sb,
- struct ceph_vino vino);
-extern struct inode *ceph_get_snapdir(struct inode *parent);
-extern int ceph_fill_file_size(struct inode *inode, int issued,
- u32 truncate_seq, u64 truncate_size, u64 size);
-extern void ceph_fill_file_time(struct inode *inode, int issued,
- u64 time_warp_seq, struct timespec *ctime,
- struct timespec *mtime, struct timespec *atime);
-extern int ceph_fill_trace(struct super_block *sb,
- struct ceph_mds_request *req,
- struct ceph_mds_session *session);
-extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
- struct ceph_mds_session *session);
-
-extern int ceph_inode_holds_cap(struct inode *inode, int mask);
-
-extern int ceph_inode_set_size(struct inode *inode, loff_t size);
-extern void ceph_inode_writeback(struct work_struct *work);
-extern void ceph_vmtruncate_work(struct work_struct *work);
-extern void __ceph_do_pending_vmtruncate(struct inode *inode);
-extern void __ceph_queue_vmtruncate(struct inode *inode);
-
-extern int ceph_do_getattr(struct inode *inode, int mask);
-extern int ceph_permission(struct inode *inode, int mask);
-extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
-extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat);
-
-/* xattr.c */
-extern int ceph_setxattr(struct dentry *, const char *, const void *,
- size_t, int);
-extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
-extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
-extern int ceph_removexattr(struct dentry *, const char *);
-extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
-extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
-
-/* caps.c */
-extern const char *ceph_cap_string(int c);
-extern void ceph_handle_caps(struct ceph_mds_session *session,
- struct ceph_msg *msg);
-extern int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned cap, unsigned seq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation);
-extern void __ceph_remove_cap(struct ceph_cap *cap,
- struct ceph_cap_reservation *ctx);
-static inline void ceph_remove_cap(struct ceph_cap *cap)
-{
- struct inode *inode = &cap->ci->vfs_inode;
- spin_lock(&inode->i_lock);
- __ceph_remove_cap(cap, NULL);
- spin_unlock(&inode->i_lock);
-}
-
-extern void ceph_queue_caps_release(struct inode *inode);
-extern int ceph_write_inode(struct inode *inode, int unused);
-extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
-extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
-extern int ceph_get_cap_mds(struct inode *inode);
-extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
-extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
-extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
- struct ceph_snap_context *snapc);
-extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession);
-extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session);
-extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc,
- int flushdirty);
-
-extern int ceph_encode_inode_release(void **p, struct inode *inode,
- int mds, int drop, int unless, int force);
-extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
- int mds, int drop, int unless);
-
-extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
- int *got, loff_t endoff);
-
-/* for counting open files by mode */
-static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
-{
- ci->i_nr_by_mode[mode]++;
-}
-extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
-
-/* addr.c */
-extern const struct address_space_operations ceph_aops;
-extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
-
-/* file.c */
-extern const struct file_operations ceph_file_fops;
-extern const struct address_space_operations ceph_aops;
-extern int ceph_open(struct inode *inode, struct file *file);
-extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd, int mode,
- int locked_dir);
-extern int ceph_release(struct inode *inode, struct file *filp);
-extern void ceph_release_page_vector(struct page **pages, int num_pages);
-
-/* dir.c */
-extern const struct file_operations ceph_dir_fops;
-extern const struct inode_operations ceph_dir_iops;
-extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
- ceph_snapdir_dentry_ops;
-
-extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
-extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
- struct dentry *dentry, int err);
-
-extern void ceph_dentry_lru_add(struct dentry *dn);
-extern void ceph_dentry_lru_touch(struct dentry *dn);
-extern void ceph_dentry_lru_del(struct dentry *dn);
-
-/*
- * our d_ops vary depending on whether the inode is live,
- * snapshotted (read-only), or a virtual ".snap" directory.
- */
-int ceph_init_dentry(struct dentry *dentry);
-
-
-/* ioctl.c */
-extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-
-/* export.c */
-extern const struct export_operations ceph_export_ops;
-
-/* debugfs.c */
-extern int ceph_debugfs_init(void);
-extern void ceph_debugfs_cleanup(void);
-extern int ceph_debugfs_client_init(struct ceph_client *client);
-extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
-
-static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
-{
- if (dentry && dentry->d_parent)
- return dentry->d_parent->d_inode;
-
- return NULL;
-}
-
-#endif /* _FS_CEPH_SUPER_H */
+++ /dev/null
-#ifndef _FS_CEPH_TYPES_H
-#define _FS_CEPH_TYPES_H
-
-/* needed before including ceph_fs.h */
-#include <linux/in.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/string.h>
-
-#include "ceph_fs.h"
-#include "ceph_frag.h"
-
-/*
- * Identify inodes by both their ino AND snapshot id (a u64).
- */
-struct ceph_vino {
- u64 ino;
- u64 snap;
-};
-
-
-/* context for the caps reservation mechanism */
-struct ceph_cap_reservation {
- int count;
-};
-
-
-#endif
+++ /dev/null
-#include "ceph_debug.h"
-#include "super.h"
-#include "decode.h"
-
-#include <linux/xattr.h>
-
-static bool ceph_is_valid_xattr(const char *name)
-{
- return !strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN) ||
- !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
- !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
-}
-
-/*
- * These define virtual xattrs exposing the recursive directory
- * statistics and layout metadata.
- */
-struct ceph_vxattr_cb {
- bool readonly;
- char *name;
- size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
- size_t size);
-};
-
-/* directories */
-
-static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
-}
-
-static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_files);
-}
-
-static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_subdirs);
-}
-
-static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
-}
-
-static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_rfiles);
-}
-
-static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_rsubdirs);
-}
-
-static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%lld", ci->i_rbytes);
-}
-
-static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
- (long)ci->i_rctime.tv_nsec);
-}
-
-static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
- { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
- { true, "user.ceph.dir.files", ceph_vxattrcb_files},
- { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
- { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
- { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
- { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
- { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
- { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
- { true, NULL, NULL }
-};
-
-/* files */
-
-static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
- size_t size)
-{
- return snprintf(val, size,
- "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
-}
-
-static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
- { true, "user.ceph.layout", ceph_vxattrcb_layout},
- { NULL, NULL }
-};
-
-static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
-{
- if (S_ISDIR(inode->i_mode))
- return ceph_dir_vxattrs;
- else if (S_ISREG(inode->i_mode))
- return ceph_file_vxattrs;
- return NULL;
-}
-
-static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
- const char *name)
-{
- do {
- if (strcmp(vxattr->name, name) == 0)
- return vxattr;
- vxattr++;
- } while (vxattr->name);
- return NULL;
-}
-
-static int __set_xattr(struct ceph_inode_info *ci,
- const char *name, int name_len,
- const char *val, int val_len,
- int dirty,
- int should_free_name, int should_free_val,
- struct ceph_inode_xattr **newxattr)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct ceph_inode_xattr *xattr = NULL;
- int c;
- int new = 0;
-
- p = &ci->i_xattrs.index.rb_node;
- while (*p) {
- parent = *p;
- xattr = rb_entry(parent, struct ceph_inode_xattr, node);
- c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
- if (c < 0)
- p = &(*p)->rb_left;
- else if (c > 0)
- p = &(*p)->rb_right;
- else {
- if (name_len == xattr->name_len)
- break;
- else if (name_len < xattr->name_len)
- p = &(*p)->rb_left;
- else
- p = &(*p)->rb_right;
- }
- xattr = NULL;
- }
-
- if (!xattr) {
- new = 1;
- xattr = *newxattr;
- xattr->name = name;
- xattr->name_len = name_len;
- xattr->should_free_name = should_free_name;
-
- ci->i_xattrs.count++;
- dout("__set_xattr count=%d\n", ci->i_xattrs.count);
- } else {
- kfree(*newxattr);
- *newxattr = NULL;
- if (xattr->should_free_val)
- kfree((void *)xattr->val);
-
- if (should_free_name) {
- kfree((void *)name);
- name = xattr->name;
- }
- ci->i_xattrs.names_size -= xattr->name_len;
- ci->i_xattrs.vals_size -= xattr->val_len;
- }
- if (!xattr) {
- pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
- &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
- xattr->val);
- return -ENOMEM;
- }
- ci->i_xattrs.names_size += name_len;
- ci->i_xattrs.vals_size += val_len;
- if (val)
- xattr->val = val;
- else
- xattr->val = "";
-
- xattr->val_len = val_len;
- xattr->dirty = dirty;
- xattr->should_free_val = (val && should_free_val);
-
- if (new) {
- rb_link_node(&xattr->node, parent, p);
- rb_insert_color(&xattr->node, &ci->i_xattrs.index);
- dout("__set_xattr_val p=%p\n", p);
- }
-
- dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
- ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
-
- return 0;
-}
-
-static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
- const char *name)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct ceph_inode_xattr *xattr = NULL;
- int c;
-
- p = &ci->i_xattrs.index.rb_node;
- while (*p) {
- parent = *p;
- xattr = rb_entry(parent, struct ceph_inode_xattr, node);
- c = strncmp(name, xattr->name, xattr->name_len);
- if (c < 0)
- p = &(*p)->rb_left;
- else if (c > 0)
- p = &(*p)->rb_right;
- else {
- dout("__get_xattr %s: found %.*s\n", name,
- xattr->val_len, xattr->val);
- return xattr;
- }
- }
-
- dout("__get_xattr %s: not found\n", name);
-
- return NULL;
-}
-
-static void __free_xattr(struct ceph_inode_xattr *xattr)
-{
- BUG_ON(!xattr);
-
- if (xattr->should_free_name)
- kfree((void *)xattr->name);
- if (xattr->should_free_val)
- kfree((void *)xattr->val);
-
- kfree(xattr);
-}
-
-static int __remove_xattr(struct ceph_inode_info *ci,
- struct ceph_inode_xattr *xattr)
-{
- if (!xattr)
- return -EOPNOTSUPP;
-
- rb_erase(&xattr->node, &ci->i_xattrs.index);
-
- if (xattr->should_free_name)
- kfree((void *)xattr->name);
- if (xattr->should_free_val)
- kfree((void *)xattr->val);
-
- ci->i_xattrs.names_size -= xattr->name_len;
- ci->i_xattrs.vals_size -= xattr->val_len;
- ci->i_xattrs.count--;
- kfree(xattr);
-
- return 0;
-}
-
-static int __remove_xattr_by_name(struct ceph_inode_info *ci,
- const char *name)
-{
- struct rb_node **p;
- struct ceph_inode_xattr *xattr;
- int err;
-
- p = &ci->i_xattrs.index.rb_node;
- xattr = __get_xattr(ci, name);
- err = __remove_xattr(ci, xattr);
- return err;
-}
-
-static char *__copy_xattr_names(struct ceph_inode_info *ci,
- char *dest)
-{
- struct rb_node *p;
- struct ceph_inode_xattr *xattr = NULL;
-
- p = rb_first(&ci->i_xattrs.index);
- dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
-
- while (p) {
- xattr = rb_entry(p, struct ceph_inode_xattr, node);
- memcpy(dest, xattr->name, xattr->name_len);
- dest[xattr->name_len] = '\0';
-
- dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
- xattr->name_len, ci->i_xattrs.names_size);
-
- dest += xattr->name_len + 1;
- p = rb_next(p);
- }
-
- return dest;
-}
-
-void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
-{
- struct rb_node *p, *tmp;
- struct ceph_inode_xattr *xattr = NULL;
-
- p = rb_first(&ci->i_xattrs.index);
-
- dout("__ceph_destroy_xattrs p=%p\n", p);
-
- while (p) {
- xattr = rb_entry(p, struct ceph_inode_xattr, node);
- tmp = p;
- p = rb_next(tmp);
- dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
- xattr->name_len, xattr->name);
- rb_erase(tmp, &ci->i_xattrs.index);
-
- __free_xattr(xattr);
- }
-
- ci->i_xattrs.names_size = 0;
- ci->i_xattrs.vals_size = 0;
- ci->i_xattrs.index_version = 0;
- ci->i_xattrs.count = 0;
- ci->i_xattrs.index = RB_ROOT;
-}
-
-static int __build_xattrs(struct inode *inode)
-{
- u32 namelen;
- u32 numattr = 0;
- void *p, *end;
- u32 len;
- const char *name, *val;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int xattr_version;
- struct ceph_inode_xattr **xattrs = NULL;
- int err;
- int i;
-
- dout("__build_xattrs() len=%d\n",
- ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
-
- if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
- return 0; /* already built */
-
- __ceph_destroy_xattrs(ci);
-
-start:
- /* updated internal xattr rb tree */
- if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
- p = ci->i_xattrs.blob->vec.iov_base;
- end = p + ci->i_xattrs.blob->vec.iov_len;
- ceph_decode_32_safe(&p, end, numattr, bad);
- xattr_version = ci->i_xattrs.version;
- spin_unlock(&inode->i_lock);
-
- xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
- GFP_NOFS);
- err = -ENOMEM;
- if (!xattrs)
- goto bad_lock;
- memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
- for (i = 0; i < numattr; i++) {
- xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
- GFP_NOFS);
- if (!xattrs[i])
- goto bad_lock;
- }
-
- spin_lock(&inode->i_lock);
- if (ci->i_xattrs.version != xattr_version) {
- /* lost a race, retry */
- for (i = 0; i < numattr; i++)
- kfree(xattrs[i]);
- kfree(xattrs);
- goto start;
- }
- err = -EIO;
- while (numattr--) {
- ceph_decode_32_safe(&p, end, len, bad);
- namelen = len;
- name = p;
- p += len;
- ceph_decode_32_safe(&p, end, len, bad);
- val = p;
- p += len;
-
- err = __set_xattr(ci, name, namelen, val, len,
- 0, 0, 0, &xattrs[numattr]);
-
- if (err < 0)
- goto bad;
- }
- kfree(xattrs);
- }
- ci->i_xattrs.index_version = ci->i_xattrs.version;
- ci->i_xattrs.dirty = false;
-
- return err;
-bad_lock:
- spin_lock(&inode->i_lock);
-bad:
- if (xattrs) {
- for (i = 0; i < numattr; i++)
- kfree(xattrs[i]);
- kfree(xattrs);
- }
- ci->i_xattrs.names_size = 0;
- return err;
-}
-
-static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
- int val_size)
-{
- /*
- * 4 bytes for the length, and additional 4 bytes per each xattr name,
- * 4 bytes per each value
- */
- int size = 4 + ci->i_xattrs.count*(4 + 4) +
- ci->i_xattrs.names_size +
- ci->i_xattrs.vals_size;
- dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
- ci->i_xattrs.count, ci->i_xattrs.names_size,
- ci->i_xattrs.vals_size);
-
- if (name_size)
- size += 4 + 4 + name_size + val_size;
-
- return size;
-}
-
-/*
- * If there are dirty xattrs, reencode xattrs into the prealloc_blob
- * and swap into place.
- */
-void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
-{
- struct rb_node *p;
- struct ceph_inode_xattr *xattr = NULL;
- void *dest;
-
- dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
- if (ci->i_xattrs.dirty) {
- int need = __get_required_blob_size(ci, 0, 0);
-
- BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
-
- p = rb_first(&ci->i_xattrs.index);
- dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
-
- ceph_encode_32(&dest, ci->i_xattrs.count);
- while (p) {
- xattr = rb_entry(p, struct ceph_inode_xattr, node);
-
- ceph_encode_32(&dest, xattr->name_len);
- memcpy(dest, xattr->name, xattr->name_len);
- dest += xattr->name_len;
- ceph_encode_32(&dest, xattr->val_len);
- memcpy(dest, xattr->val, xattr->val_len);
- dest += xattr->val_len;
-
- p = rb_next(p);
- }
-
- /* adjust buffer len; it may be larger than we need */
- ci->i_xattrs.prealloc_blob->vec.iov_len =
- dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
-
- ceph_buffer_put(ci->i_xattrs.blob);
- ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
- ci->i_xattrs.prealloc_blob = NULL;
- ci->i_xattrs.dirty = false;
- }
-}
-
-ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
- size_t size)
-{
- struct inode *inode = dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
- int err;
- struct ceph_inode_xattr *xattr;
- struct ceph_vxattr_cb *vxattr = NULL;
-
- if (!ceph_is_valid_xattr(name))
- return -ENODATA;
-
- /* let's see if a virtual xattr was requested */
- if (vxattrs)
- vxattr = ceph_match_vxattr(vxattrs, name);
-
- spin_lock(&inode->i_lock);
- dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
-
- if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
- (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
- goto get_xattr;
- } else {
- spin_unlock(&inode->i_lock);
- /* get xattrs from mds (if we don't already have them) */
- err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
- if (err)
- return err;
- }
-
- spin_lock(&inode->i_lock);
-
- if (vxattr && vxattr->readonly) {
- err = vxattr->getxattr_cb(ci, value, size);
- goto out;
- }
-
- err = __build_xattrs(inode);
- if (err < 0)
- goto out;
-
-get_xattr:
- err = -ENODATA; /* == ENOATTR */
- xattr = __get_xattr(ci, name);
- if (!xattr) {
- if (vxattr)
- err = vxattr->getxattr_cb(ci, value, size);
- goto out;
- }
-
- err = -ERANGE;
- if (size && size < xattr->val_len)
- goto out;
-
- err = xattr->val_len;
- if (size == 0)
- goto out;
-
- memcpy(value, xattr->val, xattr->val_len);
-
-out:
- spin_unlock(&inode->i_lock);
- return err;
-}
-
-/*
- * Report the names of the xattrs on a dentry's inode.
- *
- * The result covers the xattrs stored in the inode's xattr index plus
- * the names of any virtual xattrs returned by ceph_inode_vxattrs().
- * Every name is followed by a NUL byte.
- *
- * @size == 0: nothing is copied; the number of bytes required is returned.
- * @size != 0: -ERANGE if the names do not fit in @size bytes, otherwise
- *             the names are written to @names and the byte count returned.
- *
- * A failure from ceph_do_getattr() or __build_xattrs() is returned as is.
- */
-ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
-{
-	struct inode *inode = dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
-	struct ceph_vxattr_cb *vx;
-	u32 virt_bytes = 0;
-	u32 total_bytes;
-	int ret;
-
-	spin_lock(&inode->i_lock);
-	dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
-	     ci->i_xattrs.version, ci->i_xattrs.index_version);
-
-	/*
-	 * Unless we hold the shared xattr cap and the index is newer than
-	 * the xattr version, fetch the xattrs again (without the lock held)
-	 * and rebuild the index before listing.
-	 */
-	if (!__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) ||
-	    ci->i_xattrs.index_version <= ci->i_xattrs.version) {
-		spin_unlock(&inode->i_lock);
-		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
-		if (ret)
-			return ret;
-
-		spin_lock(&inode->i_lock);
-		ret = __build_xattrs(inode);
-		if (ret < 0)
-			goto unlock;
-	}
-
-	/* space for the virtual xattr names, each with its terminator */
-	if (vxattrs) {
-		for (vx = vxattrs; vx->name; vx++)
-			virt_bytes += strlen(vx->name) + 1;
-	}
-
-	/* one extra byte per stored name for its NUL terminator */
-	total_bytes = virt_bytes + ci->i_xattrs.names_size +
-		ci->i_xattrs.count;
-
-	if (size == 0) {
-		/* caller is only asking how much room is needed */
-		ret = total_bytes;
-		goto unlock;
-	}
-	if (total_bytes > size) {
-		ret = -ERANGE;
-		goto unlock;
-	}
-	ret = total_bytes;
-
-	names = __copy_xattr_names(ci, names);
-
-	/* append the virtual xattr names after the stored ones */
-	if (vxattrs) {
-		for (vx = vxattrs; vx->name; vx++)
-			names += sprintf(names, "%s", vx->name) + 1;
-	}
-
-unlock:
-	spin_unlock(&inode->i_lock);
-	return ret;
-}
-
-/*
- * Set an xattr by sending a CEPH_MDS_OP_SETXATTR request to the MDS and
- * waiting for the result.
- *
- * The value is copied into freshly allocated pages that are attached to
- * the request as its data payload; @name is duplicated into r_path2 and
- * @flags is passed along in the request arguments.
- *
- * Returns the result of ceph_mdsc_do_request(), -ENOMEM if the page
- * array or a page cannot be allocated, or the error encoded in the
- * pointer returned by ceph_mdsc_create_request().  The temporary pages
- * are released on every exit path.
- */
-static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
-			      const char *value, size_t size, int flags)
-{
-	struct ceph_client *client = ceph_client(dentry->d_sb);
-	struct inode *inode = dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct inode *parent_inode = dentry->d_parent->d_inode;
-	struct ceph_mds_request *req;
-	struct ceph_mds_client *mdsc = &client->mdsc;
-	int err;
-	int i, nr_pages;
-	struct page **pages = NULL;
-	void *kaddr;
-
-	/* copy value into some pages */
-	nr_pages = calc_pages_for(0, size);
-	if (nr_pages) {
-		pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
-		if (!pages)
-			return -ENOMEM;
-		err = -ENOMEM;
-		for (i = 0; i < nr_pages; i++) {
-			pages[i] = alloc_page(GFP_NOFS);
-			if (!pages[i]) {
-				/* only free the pages we actually got */
-				nr_pages = i;
-				goto out;
-			}
-			kaddr = kmap(pages[i]);
-			memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
-			       min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
-			/* balance the kmap() so the mapping is not leaked */
-			kunmap(pages[i]);
-		}
-	}
-
-	dout("setxattr value=%.*s\n", (int)size, value);
-
-	/* do request */
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
-				       USE_AUTH_MDS);
-	if (IS_ERR(req)) {
-		/* go through the cleanup path so the value pages are freed */
-		err = PTR_ERR(req);
-		goto out;
-	}
-	req->r_inode = igrab(inode);
-	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
-	req->r_num_caps = 1;
-	req->r_args.setxattr.flags = cpu_to_le32(flags);
-	/* NOTE(review): kstrdup() failure is not checked here - confirm
-	 * how the MDS client treats a NULL r_path2. */
-	req->r_path2 = kstrdup(name, GFP_NOFS);
-
-	req->r_pages = pages;
-	req->r_num_pages = nr_pages;
-	req->r_data_len = size;
-
-	dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
-	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-	ceph_mdsc_put_request(req);
-	dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
-
-out:
-	if (pages) {
-		for (i = 0; i < nr_pages; i++)
-			__free_page(pages[i]);
-		kfree(pages);
-	}
-	return err;
-}
-
-/*
- * Set an xattr on a dentry's inode.
- *
- * If the CEPH_CAP_XATTR_EXCL cap is issued, the xattr is updated locally
- * via __set_xattr() and the cap is marked dirty.  Otherwise the update is
- * sent to the MDS through ceph_sync_setxattr().
- *
- * Returns -EROFS for a snapshot inode (ceph_snap() != CEPH_NOSNAP),
- * -EOPNOTSUPP if the name is rejected by ceph_is_valid_xattr() or matches
- * a read-only virtual xattr, -ENOMEM if a preallocation fails, and
- * otherwise the result of __set_xattr() or ceph_sync_setxattr().
- */
-int ceph_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
-{
- struct inode *inode = dentry->d_inode;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
- int err;
- int name_len = strlen(name);
- int val_len = size;
- char *newname = NULL;
- char *newval = NULL;
- struct ceph_inode_xattr *xattr = NULL;
- int issued;
- int required_blob_size;
-
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
- if (!ceph_is_valid_xattr(name))
- return -EOPNOTSUPP;
-
- /* read-only virtual xattrs cannot be set */
- if (vxattrs) {
- struct ceph_vxattr_cb *vxattr =
- ceph_match_vxattr(vxattrs, name);
- if (vxattr && vxattr->readonly)
- return -EOPNOTSUPP;
- }
-
- /* preallocate memory for xattr name, value, index node */
- /* err stays -ENOMEM for every "goto out" before __set_xattr() */
- err = -ENOMEM;
- newname = kmalloc(name_len + 1, GFP_NOFS);
- if (!newname)
- goto out;
- memcpy(newname, name, name_len + 1);
-
- /* the value copy gets an extra trailing NUL byte */
- if (val_len) {
- newval = kmalloc(val_len + 1, GFP_NOFS);
- if (!newval)
- goto out;
- memcpy(newval, value, val_len);
- newval[val_len] = '\0';
- }
-
- xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
- if (!xattr)
- goto out;
-
- spin_lock(&inode->i_lock);
-retry:
- /* caps are re-read on every pass, since i_lock may have been dropped */
- issued = __ceph_caps_issued(ci, NULL);
- if (!(issued & CEPH_CAP_XATTR_EXCL))
- goto do_sync;
- /* NOTE(review): the return value of __build_xattrs() is ignored here */
- __build_xattrs(inode);
-
- required_blob_size = __get_required_blob_size(ci, name_len, val_len);
-
- /*
- * Make sure a preallocated blob of at least the required size exists.
- * The allocation happens with i_lock dropped, so afterwards we go
- * back to "retry" and recompute everything under the lock.
- */
- if (!ci->i_xattrs.prealloc_blob ||
- required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
- struct ceph_buffer *blob = NULL;
-
- spin_unlock(&inode->i_lock);
- dout(" preaallocating new blob size=%d\n", required_blob_size);
- blob = ceph_buffer_new_alloc(required_blob_size, GFP_NOFS);
- if (!blob)
- goto out;
- spin_lock(&inode->i_lock);
- ceph_buffer_put(ci->i_xattrs.prealloc_blob);
- ci->i_xattrs.prealloc_blob = blob;
- goto retry;
- }
-
- dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
- /*
- * newname, newval and xattr are not freed on this path; presumably
- * __set_xattr() takes ownership of them - confirm against its
- * definition.
- */
- err = __set_xattr(ci, newname, name_len, newval,
- val_len, 1, 1, 1, &xattr);
- __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
- ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
- spin_unlock(&inode->i_lock);
-
- return err;
-
-do_sync:
- /* no exclusive cap: let the MDS do it; our preallocations are unused */
- spin_unlock(&inode->i_lock);
- err = ceph_sync_setxattr(dentry, name, value, size, flags);
-out:
- kfree(newname);
- kfree(newval);
- kfree(xattr);
- return err;
-}
-
-/*
- * Ask the MDS to remove an xattr: build a CEPH_MDS_OP_RMXATTR request
- * for the dentry's inode, with the xattr name carried in r_path2, and
- * wait for it to complete.
- *
- * Returns the result of ceph_mdsc_do_request(), or the error encoded in
- * the pointer returned by ceph_mdsc_create_request().
- */
-static int ceph_send_removexattr(struct dentry *dentry, const char *name)
-{
-	struct inode *target = dentry->d_inode;
-	struct inode *dir = dentry->d_parent->d_inode;
-	struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
-	struct ceph_mds_request *req;
-	int ret;
-
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
-				       USE_AUTH_MDS);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
-	/* describe the target inode and the cap to drop with the request */
-	req->r_inode = igrab(target);
-	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
-	req->r_num_caps = 1;
-	req->r_path2 = kstrdup(name, GFP_NOFS);
-
-	ret = ceph_mdsc_do_request(mdsc, dir, req);
-	ceph_mdsc_put_request(req);
-
-	return ret;
-}
-
-/*
- * Remove an xattr from a dentry's inode.
- *
- * With the CEPH_CAP_XATTR_EXCL cap issued, the xattr is removed from the
- * local index and the cap is marked dirty; without it, the removal is
- * sent to the MDS via ceph_send_removexattr().
- *
- * Returns -EROFS for a snapshot inode, -EOPNOTSUPP if the name is not a
- * valid xattr name or matches a read-only virtual xattr, and otherwise
- * the result of __remove_xattr_by_name() or ceph_send_removexattr().
- */
-int ceph_removexattr(struct dentry *dentry, const char *name)
-{
-	struct inode *inode = dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
-	struct ceph_vxattr_cb *match = NULL;
-	int caps;
-	int ret;
-
-	if (ceph_snap(inode) != CEPH_NOSNAP)
-		return -EROFS;
-
-	if (!ceph_is_valid_xattr(name))
-		return -EOPNOTSUPP;
-
-	/* read-only virtual xattrs cannot be removed */
-	if (vxattrs)
-		match = ceph_match_vxattr(vxattrs, name);
-	if (match && match->readonly)
-		return -EOPNOTSUPP;
-
-	spin_lock(&inode->i_lock);
-	__build_xattrs(inode);
-	caps = __ceph_caps_issued(ci, NULL);
-	dout("removexattr %p issued %s\n", inode, ceph_cap_string(caps));
-
-	if (!(caps & CEPH_CAP_XATTR_EXCL)) {
-		/* no exclusive cap: hand the removal to the MDS */
-		spin_unlock(&inode->i_lock);
-		return ceph_send_removexattr(dentry, name);
-	}
-
-	/* exclusive cap held: update the local index and mark it dirty */
-	ret = __remove_xattr_by_name(ci, name);
-	__ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
-	ci->i_xattrs.dirty = true;
-	inode->i_ctime = CURRENT_TIME;
-	spin_unlock(&inode->i_lock);
-
-	return ret;
-}
-