1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <sys/types.h>
18 #include <boost/scoped_ptr.hpp>
23 #include "auth/KeyRing.h"
25 #include "os/ObjectStore.h"
26 #include "mon/MonClient.h"
27 #include "include/ceph_features.h"
28 #include "common/config.h"
30 #include "mon/MonMap.h"
32 #include "msg/Messenger.h"
34 #include "common/Throttle.h"
35 #include "common/Timer.h"
36 #include "common/TracepointProvider.h"
37 #include "common/ceph_argparse.h"
38 #include "common/numa.h"
40 #include "global/global_init.h"
41 #include "global/signal_handler.h"
43 #include "include/color.h"
44 #include "common/errno.h"
45 #include "common/pick_address.h"
47 #include "perfglue/heap_profiler.h"
49 #include "include/ceph_assert.h"
51 #include "common/Preforker.h"
53 #define dout_context g_ceph_context
54 #define dout_subsys ceph_subsys_osd
59 using std::ostringstream;
63 using ceph::bufferlist;
// File-local tracepoint provider descriptors. Each one is constructed from a
// tracepoint shared-library name plus a second string argument, and is later
// passed as the template argument to TracepointProvider::initialize<>() in
// main().
// NOTE(review): the second argument is presumably the name of the config
// option that enables the provider -- confirm against TracepointProvider.
// NOTE(review): some continuation lines of these declarations are not visible
// in this view of the file.
67 TracepointProvider::Traits osd_tracepoint_traits("libosd_tp.so",
69 TracepointProvider::Traits os_tracepoint_traits("libos_tp.so",
70 "osd_objectstore_tracing");
71 TracepointProvider::Traits bluestore_tracepoint_traits("libbluestore_tp.so",
// The function-instrumentation provider is only declared when the build
// defines WITH_OSD_INSTRUMENT_FUNCTIONS.
73 #ifdef WITH_OSD_INSTRUMENT_FUNCTIONS
74 TracepointProvider::Traits cyg_profile_traits("libcyg_profile_tp.so",
75 "osd_function_tracing");
78 } // anonymous namespace
// Global pointer to this process's OSD instance. It starts out null, is
// assigned in main() once the OSD object is allocated, and is dereferenced by
// handle_osd_signal().
80 OSD *osdptr = nullptr;
// Signal callback: forwards the received signal number to the OSD instance by
// calling OSD::handle_signal() through the global osdptr. main() registers
// this function as a one-shot async handler for SIGINT and SIGTERM.
//
// @param signum  number of the signal that was delivered
//
// NOTE(review): lines between the signature and the call are missing from this
// view; confirm that osdptr is checked for null before it is dereferenced,
// since it is only assigned part-way through main().
82 void handle_osd_signal(int signum)
85 osdptr->handle_signal(signum);
// Help text printer (the enclosing function signature is not visible in this
// view). Writes the ceph-osd command-line synopsis and its daemon-specific
// options to stdout, then calls generic_server_usage().
//
// NOTE(review): main() also parses --dump-pg-log, --get-cluster-fsid,
// --get-osd-fsid/--get-osd-uuid and --get-journal-fsid/--get-journal-uuid,
// none of which appear in the help lines visible here -- confirm whether that
// omission is intentional.
90 cout << "usage: ceph-osd -i <ID> [flags]\n"
91 << " --osd-data PATH data directory\n"
92 << " --osd-journal PATH\n"
93 << " journal file or block device\n"
94 << " --mkfs create a [new] data directory\n"
95 << " --mkkey generate a new secret key. This is normally used in combination with --mkfs\n"
96 << " --monmap specify the path to the monitor map. This is normally used in combination with --mkfs\n"
97 << " --osd-uuid specify the OSD's fsid. This is normally used in combination with --mkfs\n"
98 << " --keyring specify a path to the osd keyring. This is normally used in combination with --mkfs\n"
99 << " --convert-filestore\n"
100 << " run any pending upgrade operations\n"
101 << " --flush-journal flush all data out of journal\n"
102 << " --osdspec-affinity\n"
103 << " set affinity to an osdspec\n"
104 << " --dump-journal dump all data of journal\n"
105 << " --mkjournal initialize a new journal\n"
106 << " --check-wants-journal\n"
107 << " check whether a journal is desired\n"
108 << " --check-allows-journal\n"
109 << " check whether a journal is allowed\n"
110 << " --check-needs-journal\n"
111 << " check whether a journal is required\n"
112 << " --debug_osd <N> set debug level (e.g. 10)\n"
113 << " --get-device-fsid PATH\n"
114 << " get OSD fsid for the given block device\n"
// Print the remaining options via the generic server usage helper.
116 generic_server_usage();
119 int main(int argc, const char **argv)
121 vector<const char*> args;
122 argv_to_vec(argc, argv, args);
124 cerr << argv[0] << ": -h or --help for usage" << std::endl;
127 if (ceph_argparse_need_usage(args)) {
132 map<string,string> defaults = {
133 // We want to enable leveldb's log, while allowing users to override this
134 // option, therefore we will pass it as a default argument to global_init().
135 { "leveldb_log", "" }
137 auto cct = global_init(
139 args, CEPH_ENTITY_TYPE_OSD,
140 CODE_ENVIRONMENT_DAEMON, 0);
141 ceph_heap_profiler_init();
147 bool mkjournal = false;
148 bool check_wants_journal = false;
149 bool check_allows_journal = false;
150 bool check_needs_journal = false;
152 bool flushjournal = false;
153 bool dump_journal = false;
154 bool convertfilestore = false;
155 bool get_osd_fsid = false;
156 bool get_cluster_fsid = false;
157 bool get_journal_fsid = false;
158 bool get_device_fsid = false;
160 std::string dump_pg_log;
161 std::string osdspec_affinity;
164 for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
165 if (ceph_argparse_double_dash(args, i)) {
167 } else if (ceph_argparse_flag(args, i, "--mkfs", (char*)NULL)) {
169 } else if (ceph_argparse_witharg(args, i, &val, "--osdspec-affinity", (char*)NULL)) {
170 osdspec_affinity = val;
171 } else if (ceph_argparse_flag(args, i, "--mkjournal", (char*)NULL)) {
173 } else if (ceph_argparse_flag(args, i, "--check-allows-journal", (char*)NULL)) {
174 check_allows_journal = true;
175 } else if (ceph_argparse_flag(args, i, "--check-wants-journal", (char*)NULL)) {
176 check_wants_journal = true;
177 } else if (ceph_argparse_flag(args, i, "--check-needs-journal", (char*)NULL)) {
178 check_needs_journal = true;
179 } else if (ceph_argparse_flag(args, i, "--mkkey", (char*)NULL)) {
181 } else if (ceph_argparse_flag(args, i, "--flush-journal", (char*)NULL)) {
183 } else if (ceph_argparse_flag(args, i, "--convert-filestore", (char*)NULL)) {
184 convertfilestore = true;
185 } else if (ceph_argparse_witharg(args, i, &val, "--dump-pg-log", (char*)NULL)) {
187 } else if (ceph_argparse_flag(args, i, "--dump-journal", (char*)NULL)) {
189 } else if (ceph_argparse_flag(args, i, "--get-cluster-fsid", (char*)NULL)) {
190 get_cluster_fsid = true;
191 } else if (ceph_argparse_flag(args, i, "--get-osd-fsid", "--get-osd-uuid", (char*)NULL)) {
193 } else if (ceph_argparse_flag(args, i, "--get-journal-fsid", "--get-journal-uuid", (char*)NULL)) {
194 get_journal_fsid = true;
195 } else if (ceph_argparse_witharg(args, i, &device_path,
196 "--get-device-fsid", (char*)NULL)) {
197 get_device_fsid = true;
203 cerr << "unrecognized arg " << args[0] << std::endl;
207 if (global_init_prefork(g_ceph_context) >= 0) {
209 int r = forker.prefork(err);
211 cerr << err << std::endl;
214 if (forker.is_parent()) {
215 g_ceph_context->_log->start();
216 if (forker.parent_wait(err) != 0) {
222 global_init_postfork_start(g_ceph_context);
224 common_init_finish(g_ceph_context);
225 global_init_chdir(g_ceph_context);
227 if (get_journal_fsid) {
228 device_path = g_conf().get_val<std::string>("osd_journal");
229 get_device_fsid = true;
231 if (get_device_fsid) {
233 int r = ObjectStore::probe_block_device_fsid(g_ceph_context, device_path,
236 cerr << "failed to get device fsid for " << device_path
237 << ": " << cpp_strerror(r) << std::endl;
240 cout << uuid << std::endl;
244 if (!dump_pg_log.empty()) {
245 common_init_finish(g_ceph_context);
249 if (bl.read_file(dump_pg_log.c_str(), &error) >= 0) {
251 auto p = bl.cbegin();
253 uint64_t pos = p.get_off();
257 catch (const ceph::buffer::error &e) {
258 derr << "failed to decode LogEntry at offset " << pos << dendl;
261 derr << pos << ":\t" << e << dendl;
264 derr << "unable to open " << dump_pg_log << ": " << error << dendl;
271 const char *id = g_conf()->name.get_id().c_str();
272 int whoami = strtol(id, &end, 10);
273 std::string data_path = g_conf().get_val<std::string>("osd_data");
274 if (*end || end == id || whoami < 0) {
275 derr << "must specify '-i #' where # is the osd number" << dendl;
279 if (data_path.empty()) {
280 derr << "must specify '--osd-data=foo' data path" << dendl;
285 std::string store_type;
288 snprintf(fn, sizeof(fn), "%s/type", data_path.c_str());
289 int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
294 store_type = string(bl.c_str(), bl.length() - 1); // drop \n
295 dout(5) << "object store type is " << store_type << dendl;
299 store_type = g_conf().get_val<std::string>("osd_objectstore");
301 // the 'type' file is missing: infer the store type from what is present in the data directory
302 snprintf(fn, sizeof(fn), "%s/current", data_path.c_str());
304 if (::stat(fn, &st) == 0 &&
305 S_ISDIR(st.st_mode)) {
306 derr << "missing 'type' file, inferring filestore from current/ dir"
308 store_type = "filestore";
310 snprintf(fn, sizeof(fn), "%s/block", data_path.c_str());
311 if (::stat(fn, &st) == 0 &&
312 S_ISLNK(st.st_mode)) {
313 derr << "missing 'type' file, inferring bluestore from block symlink"
315 store_type = "bluestore";
317 derr << "missing 'type' file and unable to infer osd type" << dendl;
324 std::string journal_path = g_conf().get_val<std::string>("osd_journal");
325 uint32_t flags = g_conf().get_val<uint64_t>("osd_os_flags");
326 ObjectStore *store = ObjectStore::create(g_ceph_context,
332 derr << "unable to create object store" << dendl;
333 forker.exit(-ENODEV);
338 common_init_finish(g_ceph_context);
341 EntityName ename{g_conf()->name};
344 std::string keyring_path = g_conf().get_val<std::string>("keyring");
345 int ret = keyring.load(g_ceph_context, keyring_path);
347 keyring.get_auth(ename, eauth)) {
348 derr << "already have key in keyring " << keyring_path << dendl;
350 eauth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
351 keyring.add(ename, eauth);
353 keyring.encode_plaintext(bl);
354 int r = bl.write_file(keyring_path.c_str(), 0600);
356 derr << TEXT_RED << " ** ERROR: writing new keyring to "
357 << keyring_path << ": " << cpp_strerror(r) << TEXT_NORMAL
360 derr << "created new key in keyring " << keyring_path << dendl;
365 common_init_finish(g_ceph_context);
367 if (g_conf().get_val<uuid_d>("fsid").is_zero()) {
368 derr << "must specify cluster fsid" << dendl;
369 forker.exit(-EINVAL);
372 int err = OSD::mkfs(g_ceph_context, store, g_conf().get_val<uuid_d>("fsid"),
373 whoami, osdspec_affinity);
375 derr << TEXT_RED << " ** ERROR: error creating empty object store in "
376 << data_path << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
379 dout(0) << "created object store " << data_path
380 << " for osd." << whoami
381 << " fsid " << g_conf().get_val<uuid_d>("fsid")
388 common_init_finish(g_ceph_context);
389 int err = store->mkjournal();
391 derr << TEXT_RED << " ** ERROR: error creating fresh journal "
392 << journal_path << " for object store " << data_path << ": "
393 << cpp_strerror(-err) << TEXT_NORMAL << dendl;
396 derr << "created new journal " << journal_path
397 << " for object store " << data_path << dendl;
400 if (check_wants_journal) {
401 if (store->wants_journal()) {
402 cout << "wants journal: yes" << std::endl;
405 cout << "wants journal: no" << std::endl;
409 if (check_allows_journal) {
410 if (store->allows_journal()) {
411 cout << "allows journal: yes" << std::endl;
414 cout << "allows journal: no" << std::endl;
418 if (check_needs_journal) {
419 if (store->needs_journal()) {
420 cout << "needs journal: yes" << std::endl;
423 cout << "needs journal: no" << std::endl;
428 common_init_finish(g_ceph_context);
429 int err = store->mount();
431 derr << TEXT_RED << " ** ERROR: error flushing journal " << journal_path
432 << " for object store " << data_path
433 << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
434 goto flushjournal_out;
437 derr << "flushed journal " << journal_path
438 << " for object store " << data_path
442 forker.exit(err < 0 ? 1 : 0);
445 common_init_finish(g_ceph_context);
446 int err = store->dump_journal(cout);
448 derr << TEXT_RED << " ** ERROR: error dumping journal " << journal_path
449 << " for object store " << data_path
450 << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
453 derr << "dumped journal " << journal_path
454 << " for object store " << data_path
459 if (convertfilestore) {
460 int err = store->mount();
462 derr << TEXT_RED << " ** ERROR: error mounting store " << data_path
463 << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
466 err = store->upgrade();
469 derr << TEXT_RED << " ** ERROR: error converting store " << data_path
470 << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
477 uuid_d cluster_fsid, osd_fsid;
478 ceph_release_t require_osd_release = ceph_release_t::unknown;
480 int r = OSD::peek_meta(store, &magic, &cluster_fsid, &osd_fsid, &w,
481 &require_osd_release);
483 derr << TEXT_RED << " ** ERROR: unable to open OSD superblock on "
484 << data_path << ": " << cpp_strerror(-r)
485 << TEXT_NORMAL << dendl;
487 derr << TEXT_RED << " ** please verify that underlying storage "
488 << "supports xattrs" << TEXT_NORMAL << dendl;
493 derr << "OSD id " << w << " != my id " << whoami << dendl;
496 if (strcmp(magic.c_str(), CEPH_OSD_ONDISK_MAGIC)) {
497 derr << "OSD magic " << magic << " != my " << CEPH_OSD_ONDISK_MAGIC
502 if (get_cluster_fsid) {
503 cout << cluster_fsid << std::endl;
507 cout << osd_fsid << std::endl;
513 if (!can_upgrade_from(require_osd_release, "require_osd_release", err)) {
514 derr << err.str() << dendl;
519 // consider objectstore numa node
520 int os_numa_node = -1;
521 r = store->get_numa_node(&os_numa_node, nullptr, nullptr);
522 if (r >= 0 && os_numa_node >= 0) {
523 dout(1) << " objectstore numa_node " << os_numa_node << dendl;
525 int iface_preferred_numa_node = -1;
526 if (g_conf().get_val<bool>("osd_numa_prefer_iface")) {
527 iface_preferred_numa_node = os_numa_node;
531 std::string msg_type = g_conf().get_val<std::string>("ms_type");
532 std::string public_msg_type =
533 g_conf().get_val<std::string>("ms_public_type");
534 std::string cluster_msg_type =
535 g_conf().get_val<std::string>("ms_cluster_type");
537 public_msg_type = public_msg_type.empty() ? msg_type : public_msg_type;
538 cluster_msg_type = cluster_msg_type.empty() ? msg_type : cluster_msg_type;
539 uint64_t nonce = Messenger::get_pid_nonce();
540 Messenger *ms_public = Messenger::create(g_ceph_context, public_msg_type,
541 entity_name_t::OSD(whoami), "client", nonce);
542 Messenger *ms_cluster = Messenger::create(g_ceph_context, cluster_msg_type,
543 entity_name_t::OSD(whoami), "cluster", nonce);
544 Messenger *ms_hb_back_client = Messenger::create(g_ceph_context, cluster_msg_type,
545 entity_name_t::OSD(whoami), "hb_back_client", nonce);
546 Messenger *ms_hb_front_client = Messenger::create(g_ceph_context, public_msg_type,
547 entity_name_t::OSD(whoami), "hb_front_client", nonce);
548 Messenger *ms_hb_back_server = Messenger::create(g_ceph_context, cluster_msg_type,
549 entity_name_t::OSD(whoami), "hb_back_server", nonce);
550 Messenger *ms_hb_front_server = Messenger::create(g_ceph_context, public_msg_type,
551 entity_name_t::OSD(whoami), "hb_front_server", nonce);
552 Messenger *ms_objecter = Messenger::create(g_ceph_context, public_msg_type,
553 entity_name_t::OSD(whoami), "ms_objecter", nonce);
554 if (!ms_public || !ms_cluster || !ms_hb_front_client || !ms_hb_back_client || !ms_hb_back_server || !ms_hb_front_server || !ms_objecter)
556 ms_cluster->set_cluster_protocol(CEPH_OSD_PROTOCOL);
557 ms_hb_front_client->set_cluster_protocol(CEPH_OSD_PROTOCOL);
558 ms_hb_back_client->set_cluster_protocol(CEPH_OSD_PROTOCOL);
559 ms_hb_back_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
560 ms_hb_front_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
562 dout(0) << "starting osd." << whoami
563 << " osd_data " << data_path
564 << " " << ((journal_path.empty()) ?
565 "(no journal)" : journal_path)
568 uint64_t message_size =
569 g_conf().get_val<Option::size_t>("osd_client_message_size_cap");
570 boost::scoped_ptr<Throttle> client_byte_throttler(
571 new Throttle(g_ceph_context, "osd_client_bytes", message_size));
572 uint64_t message_cap = g_conf().get_val<uint64_t>("osd_client_message_cap");
573 boost::scoped_ptr<Throttle> client_msg_throttler(
574 new Throttle(g_ceph_context, "osd_client_messages", message_cap));
576 // All feature bits 0 - 34 should be present from dumpling v0.67 forward
577 uint64_t osd_required =
579 CEPH_FEATURE_PGID64 |
582 ms_public->set_default_policy(Messenger::Policy::stateless_registered_server(0));
583 ms_public->set_policy_throttlers(entity_name_t::TYPE_CLIENT,
584 client_byte_throttler.get(),
585 client_msg_throttler.get());
586 ms_public->set_policy(entity_name_t::TYPE_MON,
587 Messenger::Policy::lossy_client(osd_required));
588 ms_public->set_policy(entity_name_t::TYPE_MGR,
589 Messenger::Policy::lossy_client(osd_required));
591 ms_cluster->set_default_policy(Messenger::Policy::stateless_server(0));
592 ms_cluster->set_policy(entity_name_t::TYPE_MON, Messenger::Policy::lossy_client(0));
593 ms_cluster->set_policy(entity_name_t::TYPE_OSD,
594 Messenger::Policy::lossless_peer(osd_required));
595 ms_cluster->set_policy(entity_name_t::TYPE_CLIENT,
596 Messenger::Policy::stateless_server(0));
598 ms_hb_front_client->set_policy(entity_name_t::TYPE_OSD,
599 Messenger::Policy::lossy_client(0));
600 ms_hb_back_client->set_policy(entity_name_t::TYPE_OSD,
601 Messenger::Policy::lossy_client(0));
602 ms_hb_back_server->set_policy(entity_name_t::TYPE_OSD,
603 Messenger::Policy::stateless_server(0));
604 ms_hb_front_server->set_policy(entity_name_t::TYPE_OSD,
605 Messenger::Policy::stateless_server(0));
607 ms_objecter->set_default_policy(Messenger::Policy::lossy_client(CEPH_FEATURE_OSDREPLYMUX));
609 entity_addrvec_t public_addrs, cluster_addrs;
610 r = pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC, &public_addrs,
611 iface_preferred_numa_node);
613 derr << "Failed to pick public address." << dendl;
616 r = pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_CLUSTER, &cluster_addrs,
617 iface_preferred_numa_node);
619 derr << "Failed to pick cluster address." << dendl;
623 if (ms_public->bindv(public_addrs) < 0)
626 if (ms_cluster->bindv(cluster_addrs) < 0)
629 bool is_delay = g_conf().get_val<bool>("osd_heartbeat_use_min_delay_socket");
631 ms_hb_front_client->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY);
632 ms_hb_back_client->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY);
633 ms_hb_back_server->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY);
634 ms_hb_front_server->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY);
637 entity_addrvec_t hb_front_addrs = public_addrs;
638 for (auto& a : hb_front_addrs.v) {
641 if (ms_hb_front_server->bindv(hb_front_addrs) < 0)
643 if (ms_hb_front_client->client_bind(hb_front_addrs.front()) < 0)
646 entity_addrvec_t hb_back_addrs = cluster_addrs;
647 for (auto& a : hb_back_addrs.v) {
650 if (ms_hb_back_server->bindv(hb_back_addrs) < 0)
652 if (ms_hb_back_client->client_bind(hb_back_addrs.front()) < 0)
655 // install signal handlers
656 init_async_signal_handler();
657 register_async_signal_handler(SIGHUP, sighup_handler);
659 TracepointProvider::initialize<osd_tracepoint_traits>(g_ceph_context);
660 TracepointProvider::initialize<os_tracepoint_traits>(g_ceph_context);
661 TracepointProvider::initialize<bluestore_tracepoint_traits>(g_ceph_context);
662 #ifdef WITH_OSD_INSTRUMENT_FUNCTIONS
663 TracepointProvider::initialize<cyg_profile_traits>(g_ceph_context);
666 srand(time(NULL) + getpid());
668 ceph::async::io_context_pool poolctx(
669 cct->_conf.get_val<std::uint64_t>("osd_asio_thread_count"));
671 MonClient mc(g_ceph_context, poolctx);
672 if (mc.build_initial_monmap() < 0)
674 global_init_chdir(g_ceph_context);
676 if (global_init_preload_erasure_code(g_ceph_context) < 0) {
680 osdptr = new OSD(g_ceph_context,
695 int err = osdptr->pre_init();
697 derr << TEXT_RED << " ** ERROR: osd pre_init failed: " << cpp_strerror(-err)
698 << TEXT_NORMAL << dendl;
703 ms_hb_front_client->start();
704 ms_hb_back_client->start();
705 ms_hb_front_server->start();
706 ms_hb_back_server->start();
708 ms_objecter->start();
711 err = osdptr->init();
713 derr << TEXT_RED << " ** ERROR: osd init failed: " << cpp_strerror(-err)
714 << TEXT_NORMAL << dendl;
720 if (g_conf()->daemonize) {
721 global_init_postfork_finish(g_ceph_context);
726 register_async_signal_handler_oneshot(SIGINT, handle_osd_signal);
727 register_async_signal_handler_oneshot(SIGTERM, handle_osd_signal);
729 osdptr->final_init();
731 if (g_conf().get_val<bool>("inject_early_sigterm"))
732 kill(getpid(), SIGTERM);
735 ms_hb_front_client->wait();
736 ms_hb_back_client->wait();
737 ms_hb_front_server->wait();
738 ms_hb_back_server->wait();
742 unregister_async_signal_handler(SIGHUP, sighup_handler);
743 unregister_async_signal_handler(SIGINT, handle_osd_signal);
744 unregister_async_signal_handler(SIGTERM, handle_osd_signal);
745 shutdown_async_signal_handler();
751 delete ms_hb_front_client;
752 delete ms_hb_back_client;
753 delete ms_hb_front_server;
754 delete ms_hb_back_server;
758 client_byte_throttler.reset();
759 client_msg_throttler.reset();
761 // cd on exit, so that gmon.out (if any) goes into a separate directory for each node.
763 snprintf(s, sizeof(s), "gmon/%d", getpid());
764 if ((mkdir(s, 0755) == 0) && (chdir(s) == 0)) {
765 dout(0) << "ceph-osd: gmon.out should be in " << s << dendl;