From: Sage Weil Date: Fri, 14 Nov 2008 19:14:30 +0000 (-0800) Subject: kclient: pick new mon if statfs is unresponsive; clean up other retry code X-Git-Tag: v0.6~435 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=cd3292093ae3d58d3ed4d76651353331bb866403;p=ceph.git kclient: pick new mon if statfs is unresponsive; clean up other retry code --- diff --git a/src/kernel/mon_client.c b/src/kernel/mon_client.c index ea4a5b2c5613..11619bb9f70e 100644 --- a/src/kernel/mon_client.c +++ b/src/kernel/mon_client.c @@ -68,11 +68,11 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) * Choose a monitor. If @notmon >= 0, choose a different monitor than * last time. */ -static int pick_mon(struct ceph_mon_client *monc, int notmon) +static int pick_mon(struct ceph_mon_client *monc, int newmon) { char r; - if (notmon < 0 && monc->last_mon >= 0) + if (!newmon && monc->last_mon >= 0) return monc->last_mon; get_random_bytes(&r, 1); monc->last_mon = r % monc->monmap->num_mon; @@ -82,7 +82,7 @@ static int pick_mon(struct ceph_mon_client *monc, int notmon) /* * Delay work with exponential backoff. 
*/ -static void delayed_work(struct delayed_work *dwork, unsigned long *delay) +static void reschedule_timeout(struct delayed_work *dwork, unsigned long *delay) { schedule_delayed_work(dwork, *delay); if (*delay < MAX_DELAY_INTERVAL) @@ -95,14 +95,11 @@ static void delayed_work(struct delayed_work *dwork, unsigned long *delay) /* * mds map */ -static void do_request_mdsmap(struct work_struct *work) +static void request_mdsmap(struct ceph_mon_client *monc, int newmon) { struct ceph_msg *msg; struct ceph_mds_getmap *h; - struct ceph_mon_client *monc = - container_of(work, struct ceph_mon_client, - mds_delayed_work.work); - int mon = pick_mon(monc, -1); + int mon = pick_mon(monc, newmon); dout(5, "request_mdsmap from mon%d want %u\n", mon, monc->want_mdsmap); msg = ceph_msg_new(CEPH_MSG_MDS_GETMAP, sizeof(*h), 0, 0, NULL); @@ -113,10 +110,18 @@ static void do_request_mdsmap(struct work_struct *work) h->want = cpu_to_le32(monc->want_mdsmap); msg->hdr.dst = monc->monmap->mon_inst[mon]; ceph_msg_send(monc->client->msgr, msg, 0); +} + +static void retry_request_mdsmap(struct work_struct *work) +{ + struct ceph_mon_client *monc = + container_of(work, struct ceph_mon_client, + mds_delayed_work.work); /* keep sending request until we receive mds map */ + request_mdsmap(monc, 1); if (monc->want_mdsmap) - delayed_work(&monc->mds_delayed_work, &monc->mds_delay); + reschedule_timeout(&monc->mds_delayed_work, &monc->mds_delay); } /* @@ -129,7 +134,8 @@ void ceph_monc_request_mdsmap(struct ceph_mon_client *monc, u32 want) if (want > monc->want_mdsmap) { monc->mds_delay = BASE_DELAY_INTERVAL; monc->want_mdsmap = want; - do_request_mdsmap(&monc->mds_delayed_work.work); + request_mdsmap(monc, 0); + reschedule_timeout(&monc->mds_delayed_work, &monc->mds_delay); } mutex_unlock(&monc->req_mutex); } @@ -159,14 +165,12 @@ int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) /* * osd map */ -static void do_request_osdmap(struct work_struct *work) + +static void 
request_osdmap(struct ceph_mon_client *monc, int newmon) { struct ceph_msg *msg; struct ceph_osd_getmap *h; - struct ceph_mon_client *monc = - container_of(work, struct ceph_mon_client, - osd_delayed_work.work); - int mon = pick_mon(monc, -1); + int mon = pick_mon(monc, newmon); dout(5, "request_osdmap from mon%d want %u\n", mon, monc->want_osdmap); msg = ceph_msg_new(CEPH_MSG_OSD_GETMAP, sizeof(*h), 0, 0, NULL); @@ -177,10 +181,18 @@ static void do_request_osdmap(struct work_struct *work) h->start = cpu_to_le32(monc->want_osdmap); msg->hdr.dst = monc->monmap->mon_inst[mon]; ceph_msg_send(monc->client->msgr, msg, 0); +} + +static void retry_request_osdmap(struct work_struct *work) +{ + struct ceph_mon_client *monc = + container_of(work, struct ceph_mon_client, + osd_delayed_work.work); /* keep sending request until we receive osd map */ + request_osdmap(monc, 1); if (monc->want_osdmap) - delayed_work(&monc->osd_delayed_work, &monc->osd_delay); + reschedule_timeout(&monc->osd_delayed_work, &monc->osd_delay); } void ceph_monc_request_osdmap(struct ceph_mon_client *monc, u32 want) @@ -189,7 +201,8 @@ void ceph_monc_request_osdmap(struct ceph_mon_client *monc, u32 want) mutex_lock(&monc->req_mutex); monc->osd_delay = BASE_DELAY_INTERVAL; monc->want_osdmap = want; - do_request_osdmap(&monc->osd_delayed_work.work); + request_osdmap(monc, 0); + reschedule_timeout(&monc->osd_delayed_work, &monc->osd_delay); mutex_unlock(&monc->req_mutex); } @@ -215,13 +228,10 @@ int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) /* * umount */ -static void do_request_umount(struct work_struct *work) +static void request_umount(struct ceph_mon_client *monc, int newmon) { struct ceph_msg *msg; - struct ceph_mon_client *monc = - container_of(work, struct ceph_mon_client, - umount_delayed_work.work); - int mon = pick_mon(monc, -1); + int mon = pick_mon(monc, newmon); dout(5, "do_request_umount from mon%d\n", mon); msg = ceph_msg_new(CEPH_MSG_CLIENT_UNMOUNT, 0, 0, 0, NULL); @@ 
-229,8 +239,16 @@ static void do_request_umount(struct work_struct *work) return; msg->hdr.dst = monc->monmap->mon_inst[mon]; ceph_msg_send(monc->client->msgr, msg, 0); +} - delayed_work(&monc->umount_delayed_work, &monc->umount_delay); +static void retry_request_umount(struct work_struct *work) +{ + struct ceph_mon_client *monc = + container_of(work, struct ceph_mon_client, + umount_delayed_work.work); + + request_umount(monc, 1); + reschedule_timeout(&monc->umount_delayed_work, &monc->umount_delay); } void ceph_monc_request_umount(struct ceph_mon_client *monc) @@ -243,7 +261,8 @@ void ceph_monc_request_umount(struct ceph_mon_client *monc) mutex_lock(&monc->req_mutex); monc->umount_delay = BASE_DELAY_INTERVAL; - do_request_umount(&monc->umount_delayed_work.work); + request_umount(monc, 0); + reschedule_timeout(&monc->umount_delayed_work, &monc->umount_delay); mutex_unlock(&monc->req_mutex); } @@ -277,17 +296,20 @@ void ceph_monc_handle_statfs_reply(struct ceph_mon_client *monc, tid = le64_to_cpu(reply->tid); dout(10, "handle_statfs_reply %p tid %llu\n", msg, tid); - spin_lock(&monc->statfs_lock); + mutex_lock(&monc->statfs_mutex); req = radix_tree_lookup(&monc->statfs_request_tree, tid); if (req) { radix_tree_delete(&monc->statfs_request_tree, tid); + monc->num_statfs_requests--; + if (monc->num_statfs_requests == 0) + cancel_delayed_work(&monc->statfs_delayed_work); req->buf->f_total = reply->st.f_total; req->buf->f_free = reply->st.f_free; req->buf->f_avail = reply->st.f_avail; req->buf->f_objects = reply->st.f_objects; req->result = 0; } - spin_unlock(&monc->statfs_lock); + mutex_unlock(&monc->statfs_mutex); if (req) complete(&req->completion); return; @@ -299,11 +321,11 @@ bad: /* * (re)send a statfs request */ -static int send_statfs(struct ceph_mon_client *monc, u64 tid) +static int send_statfs(struct ceph_mon_client *monc, u64 tid, int newmon) { struct ceph_msg *msg; struct ceph_mon_statfs *h; - int mon = pick_mon(monc, -1); + int mon = pick_mon(monc, 
newmon); dout(10, "send_statfs to mon%d tid %llu\n", mon, tid); msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); @@ -329,20 +351,23 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) init_completion(&req.completion); /* register request */ - err = radix_tree_preload(GFP_NOFS); - if (err < 0) { - derr(10, "ENOMEM in do_statfs\n"); - return err; - } - spin_lock(&monc->statfs_lock); + mutex_lock(&monc->statfs_mutex); req.tid = ++monc->last_tid; req.last_attempt = jiffies; - radix_tree_insert(&monc->statfs_request_tree, req.tid, &req); - spin_unlock(&monc->statfs_lock); - radix_tree_preload_end(); + req.delay = BASE_DELAY_INTERVAL; + if (radix_tree_insert(&monc->statfs_request_tree, req.tid, &req) < 0) { + mutex_unlock(&monc->statfs_mutex); + derr(10, "ENOMEM in do_statfs\n"); + return -ENOMEM; + } + if (monc->num_statfs_requests == 0) + schedule_delayed_work(&monc->statfs_delayed_work, + round_jiffies_relative(1*HZ)); + monc->num_statfs_requests++; + mutex_unlock(&monc->statfs_mutex); /* send request and wait */ - err = send_statfs(monc, req.tid); + err = send_statfs(monc, req.tid, 0); if (err) return err; err = wait_for_completion_interruptible(&req.completion); @@ -351,6 +376,41 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) return req.result; } +static void do_statfs_check(struct work_struct *work) +{ + struct ceph_mon_client *monc = + container_of(work, struct ceph_mon_client, + statfs_delayed_work.work); + u64 next_tid = 0; + int got; + int did = 0; + int newmon = 1; + struct ceph_mon_statfs_request *req; + + dout(10, "do_statfs_check\n"); + mutex_lock(&monc->statfs_mutex); + while (1) { + got = radix_tree_gang_lookup(&monc->statfs_request_tree, + (void **)&req, + next_tid, 1); + if (got == 0) + break; + did++; + next_tid = req->tid + 1; + if (time_after(jiffies, req->last_attempt + req->delay)) { + req->last_attempt = jiffies; + if (req->delay < HZ*60) + req->delay *= 2; + 
send_statfs(monc, req->tid, newmon); + newmon = 0; + } + } + mutex_unlock(&monc->statfs_mutex); + + if (did) + schedule_delayed_work(&monc->statfs_delayed_work, + round_jiffies_relative(1*HZ)); +} int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) { @@ -362,12 +422,14 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) GFP_KERNEL); if (monc->monmap == NULL) return -ENOMEM; - spin_lock_init(&monc->statfs_lock); + mutex_init(&monc->statfs_mutex); INIT_RADIX_TREE(&monc->statfs_request_tree, GFP_ATOMIC); + monc->num_statfs_requests = 0; monc->last_tid = 0; - INIT_DELAYED_WORK(&monc->mds_delayed_work, do_request_mdsmap); - INIT_DELAYED_WORK(&monc->osd_delayed_work, do_request_osdmap); - INIT_DELAYED_WORK(&monc->umount_delayed_work, do_request_umount); + INIT_DELAYED_WORK(&monc->mds_delayed_work, retry_request_mdsmap); + INIT_DELAYED_WORK(&monc->osd_delayed_work, retry_request_osdmap); + INIT_DELAYED_WORK(&monc->umount_delayed_work, retry_request_umount); + INIT_DELAYED_WORK(&monc->statfs_delayed_work, do_statfs_check); monc->mds_delay = monc->osd_delay = monc->umount_delay = 0; mutex_init(&monc->req_mutex); monc->want_mdsmap = 0; diff --git a/src/kernel/mon_client.h b/src/kernel/mon_client.h index 920cda86db4d..9ccc10613104 100644 --- a/src/kernel/mon_client.h +++ b/src/kernel/mon_client.h @@ -15,7 +15,7 @@ * Communication with the monitor cluster is lossy, so requests for * information may have to be resent if we time out waiting for a response. * As long as we do not time out, we continue to send all requests to the - * same monitor. If there is a problem, we randomly pick a new monitor form + * same monitor. If there is a problem, we randomly pick a new monitor from * the cluster to try. 
*/ @@ -43,7 +43,7 @@ struct ceph_mon_statfs_request { int result; struct ceph_statfs *buf; struct completion completion; - unsigned long last_attempt; /* jiffies */ + unsigned long last_attempt, delay; /* jiffies */ }; struct ceph_mon_client { @@ -52,14 +52,16 @@ struct ceph_mon_client { struct ceph_monmap *monmap; /* pending statfs requests */ - spinlock_t statfs_lock; + struct mutex statfs_mutex; struct radix_tree_root statfs_request_tree; + int num_statfs_requests; u64 last_tid; /* mds/osd map or umount requests */ struct delayed_work mds_delayed_work; struct delayed_work osd_delayed_work; struct delayed_work umount_delayed_work; + struct delayed_work statfs_delayed_work; unsigned long mds_delay; unsigned long osd_delay; unsigned long umount_delay;