From: Haomai Wang Date: Wed, 6 May 2015 11:52:35 +0000 (+0800) Subject: ceph_perf_local: rename from ceph_perf X-Git-Tag: v9.0.2~104^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=61d70db320d82494a7e14d9fa3e4cb3155ed2984;p=ceph.git ceph_perf_local: rename from ceph_perf Signed-off-by: Haomai Wang --- diff --git a/src/test/Makefile-server.am b/src/test/Makefile-server.am index abc4ae23d8e3..2d5964ef3c09 100644 --- a/src/test/Makefile-server.am +++ b/src/test/Makefile-server.am @@ -30,9 +30,10 @@ ceph_perf_objectstore_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) ceph_perf_objectstore_CXXFLAGS = $(UNITTEST_CXXFLAGS) bin_DEBUGPROGRAMS += ceph_perf_objectstore -ceph_perf_SOURCES = test/perf.cc test/perf_helper.cc -ceph_perf_LDADD = $(LIBOS) $(CEPH_GLOBAL) -bin_DEBUGPROGRAMS += ceph_perf +ceph_perf_local_SOURCES = test/perf_local.cc test/perf_helper.cc +ceph_perf_local_LDADD = $(LIBOS) $(CEPH_GLOBAL) +noinst_HEADERS += test/perf_helper.h +bin_DEBUGPROGRAMS += ceph_perf_local if LINUX ceph_test_objectstore_SOURCES = test/objectstore/store_test.cc diff --git a/src/test/perf.cc b/src/test/perf.cc deleted file mode 100644 index b4e83001f5aa..000000000000 --- a/src/test/perf.cc +++ /dev/null @@ -1,1009 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* Copyright (c) 2015 Haomai Wang - * Copyright (c) 2011 Facebook - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -// This program contains a collection of low-level performance measurements -// for Ceph, which can be run either individually or altogether. These -// tests measure performance in a single stand-alone process, not in a cluster -// with multiple servers. Invoke the program like this: -// -// Perf test1 test2 ... -// -// test1 and test2 are the names of individual performance measurements to -// run. If no test names are provided then all of the performance tests -// are run. -// -// To add a new test: -// * Write a function that implements the test. Use existing test functions -// as a guideline, and be sure to generate output in the same form as -// other tests. -// * Create a new entry for the test in the #tests table. -#include -#include -#include - -#include "include/atomic.h" -#include "include/buffer.h" -#include "include/encoding.h" -#include "include/ceph_hash.h" -#include "include/Spinlock.h" -#include "common/ceph_argparse.h" -#include "common/Cycles.h" -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Thread.h" -#include "common/Timer.h" -#include "msg/async/Event.h" -#include "global/global_init.h" - -#include "perf_helper.h" - -using namespace ceph; - -/** - * Ask the operating system to pin the current thread to a given CPU. - * - * \param cpu - * Indicates the desired CPU and hyperthread; low order 2 bits - * specify CPU, next bit specifies hyperthread. - */ -void bind_thread_to_cpu(int cpu) -{ -#ifdef HAVE_SCHED - cpu_set_t set; - CPU_ZERO(&set); - CPU_SET(cpu, &set); - sched_setaffinity(0, sizeof(set), &set); -#endif -} - -/* - * This function just discards its argument. It's used to make it - * appear that data is used, so that the compiler won't optimize - * away the code we're trying to measure. - * - * \param value - * Pointer to arbitrary value; it's discarded. - */ -void discard(void* value) { - int x = *reinterpret_cast(value); - if (x == 0x43924776) { - printf("Value was 0x%x\n", x); - } -} - -//---------------------------------------------------------------------- -// Test functions start here -//---------------------------------------------------------------------- - -// Measure the cost of atomic_t::compare_and_swap -double atomic_int_cmp() -{ - int count = 1000000; - atomic_t value(11); - int test = 11; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - value.compare_and_swap(test, test+2); - test += 2; - } - uint64_t stop = Cycles::rdtsc(); - // printf("Final value: %d\n", value.load()); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of atomic_t::inc -double atomic_int_inc() -{ - int count = 1000000; - atomic_t value(11); - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - value.inc(); - } - uint64_t stop = Cycles::rdtsc(); - // printf("Final value: %d\n", value.load()); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of reading an atomic_t -double atomic_int_read() -{ - int count = 1000000; - atomic_t value(11); - int total = 0; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - total += value.read(); - } - uint64_t stop = Cycles::rdtsc(); - // printf("Total: %d\n", total); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of storing a new value in a atomic_t -double atomic_int_set() -{ - int count = 1000000; - atomic_t value(11); - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - value.set(88); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of acquiring and releasing a mutex in the -// fast case where the mutex is free. -double mutex_nonblock() -{ - int count = 1000000; - Mutex m("mutex_nonblock::m"); - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - m.Lock(); - m.Unlock(); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of allocating and deallocating a buffer, plus -// appending (logically) one ptr. -double buffer_basic() -{ - int count = 1000000; - uint64_t start = Cycles::rdtsc(); - bufferptr ptr("abcdefg", 7); - for (int i = 0; i < count; i++) { - bufferlist b; - b.append(ptr, 0, 5); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -struct DummyBlock { - int a, b, c, d; - void encode(bufferlist &bl) const { - ENCODE_START(1, 1, bl); - ::encode(a, bl); - ::encode(b, bl); - ::encode(c, bl); - ::encode(d, bl); - ENCODE_FINISH(bl); - } - void decode(bufferlist::iterator &bl) { - DECODE_START(1, bl); - ::decode(a, bl); - ::decode(b, bl); - ::decode(c, bl); - ::decode(d, bl); - DECODE_FINISH(bl); - } -}; -WRITE_CLASS_ENCODER(DummyBlock) - -// Measure the cost of encoding and decoding a buffer, plus -// allocating space for one chunk. -double buffer_encode_decode() -{ - int count = 1000000; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - bufferlist b; - DummyBlock dummy_block; - ::encode(dummy_block, b); - bufferlist::iterator iter = b.begin(); - ::decode(dummy_block, iter); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of allocating and deallocating a buffer, plus -// copying in a small block. -double buffer_basic_copy() -{ - int count = 1000000; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - bufferlist b; - b.append("abcdefg", 6); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of making a copy of parts of two ptrs. -double buffer_copy() -{ - int count = 1000000; - bufferlist b; - b.append("abcde", 5); - b.append("01234", 5); - char copy[10]; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - b.copy(2, 6, copy); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of allocating new space by extending the -// bufferlist -double buffer_encode() -{ - int count = 100000; - uint64_t total = 0; - for (int i = 0; i < count; i++) { - bufferlist b; - DummyBlock dummy_block; - ::encode(dummy_block, b); - uint64_t start = Cycles::rdtsc(); - ::encode(dummy_block, b); - ::encode(dummy_block, b); - ::encode(dummy_block, b); - ::encode(dummy_block, b); - ::encode(dummy_block, b); - ::encode(dummy_block, b); - ::encode(dummy_block, b); - ::encode(dummy_block, b); - ::encode(dummy_block, b); - ::encode(dummy_block, b); - total += Cycles::rdtsc() - start; - } - return Cycles::to_seconds(total)/(count*10); -} - -// Measure the cost of retrieving an object from the beginning of a buffer. -double buffer_get_contiguous() -{ - int count = 1000000; - int value = 11; - bufferlist b; - b.append((char*)&value, sizeof(value)); - int sum = 0; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - sum += *reinterpret_cast(b.get_contiguous(0, sizeof(value))); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of creating an iterator and iterating over 10 -// chunks in a buffer. -double buffer_iterator() -{ - bufferlist b; - const char s[] = "abcdefghijklmnopqrstuvwxyz"; - bufferptr ptr(s, sizeof(s)); - for (int i = 0; i < 5; i++) { - b.append(ptr, i, 5); - } - int count = 100000; - int sum = 0; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - bufferlist::iterator it = b.begin(); - while (!it.end()) { - sum += (static_cast(it.get_current_ptr().c_str()))[it.get_remaining()-1]; - ++it; - } - } - uint64_t stop = Cycles::rdtsc(); - discard(&sum); - return Cycles::to_seconds(stop - start)/count; -} - -// Implements the CondPingPong test. -class CondPingPong { - Mutex mutex; - Cond cond; - int prod; - int cons; - const int count; - - class Consumer : public Thread { - CondPingPong *p; - public: - Consumer(CondPingPong *p): p(p) {} - void* entry() { - p->consume(); - return 0; - } - } consumer; - - public: - CondPingPong(): mutex("CondPingPong::mutex"), prod(0), cons(0), count(10000), consumer(this) {} - - double run() { - consumer.create(); - uint64_t start = Cycles::rdtsc(); - produce(); - uint64_t stop = Cycles::rdtsc(); - consumer.join(); - return Cycles::to_seconds(stop - start)/count; - } - - void produce() { - Mutex::Locker l(mutex); - while (cons < count) { - while (cons < prod) - cond.Wait(mutex); - ++prod; - cond.Signal(); - } - } - - void consume() { - Mutex::Locker l(mutex); - while (cons < count) { - while (cons == prod) - cond.Wait(mutex); - ++cons; - cond.Signal(); - } - } -}; - -// Measure the cost of coordinating between threads using a condition variable. -double cond_ping_pong() -{ - return CondPingPong().run(); -} - -// Measure the cost of a 32-bit divide. Divides don't take a constant -// number of cycles. Values were chosen here semi-randomly to depict a -// fairly expensive scenario. Someone with fancy ALU knowledge could -// probably pick worse values. -double div32() -{ - int count = 1000000; - uint64_t start = Cycles::rdtsc(); - // NB: Expect an x86 processor exception is there's overflow. - uint32_t numeratorHi = 0xa5a5a5a5U; - uint32_t numeratorLo = 0x55aa55aaU; - uint32_t divisor = 0xaa55aa55U; - uint32_t quotient; - uint32_t remainder; - for (int i = 0; i < count; i++) { - __asm__ __volatile__("div %4" : - "=a"(quotient), "=d"(remainder) : - "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : - "cc"); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of a 64-bit divide. Divides don't take a constant -// number of cycles. Values were chosen here semi-randomly to depict a -// fairly expensive scenario. Someone with fancy ALU knowledge could -// probably pick worse values. -double div64() -{ - int count = 1000000; - // NB: Expect an x86 processor exception is there's overflow. - uint64_t start = Cycles::rdtsc(); - uint64_t numeratorHi = 0x5a5a5a5a5a5UL; - uint64_t numeratorLo = 0x55aa55aa55aa55aaUL; - uint64_t divisor = 0xaa55aa55aa55aa55UL; - uint64_t quotient; - uint64_t remainder; - for (int i = 0; i < count; i++) { - __asm__ __volatile__("divq %4" : - "=a"(quotient), "=d"(remainder) : - "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : - "cc"); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of calling a non-inlined function. -double function_call() -{ - int count = 1000000; - uint64_t x = 0; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - x = PerfHelper::plus_one(x); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the minimum cost of EventCenter::process_events, when there are no -// Pollers and no Timers. -double eventcenter_poll() -{ - int count = 1000000; - EventCenter center(g_ceph_context); - center.init(1000); - center.set_owner(pthread_self()); - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - center.process_events(0); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -class CenterWorker : public Thread { - CephContext *cct; - bool done; - - public: - EventCenter center; - CenterWorker(CephContext *c): cct(c), done(false), center(c) { - center.init(100); - } - void stop() { - done = true; - center.wakeup(); - } - void* entry() { - center.set_owner(pthread_self()); - bind_thread_to_cpu(2); - while (!done) - center.process_events(1000); - return 0; - } -}; - -class CountEvent: public EventCallback { - atomic_t *count; - - public: - CountEvent(atomic_t *atomic): count(atomic) {} - void do_request(int id) { - count->dec(); - } -}; - -double eventcenter_dispatch() -{ - int count = 100000; - - CenterWorker worker(g_ceph_context); - atomic_t flag(1); - worker.create(); - EventCallbackRef count_event(new CountEvent(&flag)); - - worker.center.dispatch_event_external(count_event); - // Start a new thread and wait for it to ready. - while (flag.read()) - usleep(100); - - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - flag.set(1); - worker.center.dispatch_event_external(count_event); - while (flag.read()) - ; - } - uint64_t stop = Cycles::rdtsc(); - worker.stop(); - worker.join(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of copying a given number of bytes with memcpy. -double memcpy_shared(size_t size) -{ - int count = 1000000; - char src[size], dst[size]; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - memcpy(dst, src, size); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -double memcpy100() -{ - return memcpy_shared(100); -} - -double memcpy1000() -{ - return memcpy_shared(1000); -} - -double memcpy10000() -{ - return memcpy_shared(10000); -} - -// Benchmark rjenkins hashing performance on cached data. -template -double ceph_str_hash_rjenkins() -{ - int count = 100000; - char buf[key_length]; - - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) - ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf)); - uint64_t stop = Cycles::rdtsc(); - - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of reading the fine-grain cycle counter. -double rdtsc_test() -{ - int count = 1000000; - uint64_t start = Cycles::rdtsc(); - uint64_t total = 0; - for (int i = 0; i < count; i++) { - total += Cycles::rdtsc(); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of the Cycles::to_seconds method. -double perf_cycles_to_seconds() -{ - int count = 1000000; - double total = 0; - uint64_t cycles = 994261; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - total += Cycles::to_seconds(cycles); - } - uint64_t stop = Cycles::rdtsc(); - // printf("Result: %.4f\n", total/count); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of the Cylcles::toNanoseconds method. -double perf_cycles_to_nanoseconds() -{ - int count = 1000000; - uint64_t total = 0; - uint64_t cycles = 994261; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - total += Cycles::to_nanoseconds(cycles); - } - uint64_t stop = Cycles::rdtsc(); - // printf("Result: %lu\n", total/count); - return Cycles::to_seconds(stop - start)/count; -} - - -/** - * Prefetch the cache lines containing [object, object + numBytes) into the - * processor's caches. - * The best docs for this are in the Intel instruction set reference under - * PREFETCH. - * \param object - * The start of the region of memory to prefetch. - * \param num_bytes - * The size of the region of memory to prefetch. - */ -static inline void prefetch(const void *object, uint64_t num_bytes) -{ - uint64_t offset = reinterpret_cast(object) & 0x3fUL; - const char* p = reinterpret_cast(object) - offset; - for (uint64_t i = 0; i < offset + num_bytes; i += 64) - _mm_prefetch(p + i, _MM_HINT_T0); -} - -// Measure the cost of the prefetch instruction. -double perf_prefetch() -{ - uint64_t total_ticks = 0; - int count = 10; - char buf[16 * 64]; - uint64_t start, stop; - - for (int i = 0; i < count; i++) { - PerfHelper::flush_cache(); - start = Cycles::rdtsc(); - prefetch(&buf[576], 64); - prefetch(&buf[0], 64); - prefetch(&buf[512], 64); - prefetch(&buf[960], 64); - prefetch(&buf[640], 64); - prefetch(&buf[896], 64); - prefetch(&buf[256], 64); - prefetch(&buf[704], 64); - prefetch(&buf[320], 64); - prefetch(&buf[384], 64); - prefetch(&buf[128], 64); - prefetch(&buf[448], 64); - prefetch(&buf[768], 64); - prefetch(&buf[832], 64); - prefetch(&buf[64], 64); - prefetch(&buf[192], 64); - stop = Cycles::rdtsc(); - total_ticks += stop - start; - } - return Cycles::to_seconds(total_ticks) / count / 16; -} - -/** - * This function is used to seralize machine instructions so that no - * instructions that appear after it in the current thread can run before any - * instructions that appear before it. - * - * It is useful for putting around rdpmc instructions (to pinpoint cache - * misses) as well as before rdtsc instructions, to prevent time pollution from - * instructions supposed to be executing before the timer starts. - */ -static inline void serialize() { - uint32_t eax, ebx, ecx, edx; - __asm volatile("cpuid" - : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) - : "a" (1U)); -} - -// Measure the cost of cpuid -double perf_serialize() { - int count = 1000000; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - serialize(); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of an lfence instruction. -double lfence() -{ - int count = 1000000; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - __asm__ __volatile__("lfence" ::: "memory"); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of an sfence instruction. -double sfence() -{ - int count = 1000000; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - __asm__ __volatile__("sfence" ::: "memory"); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of acquiring and releasing a SpinLock (assuming the -// lock is initially free). -double test_spinlock() -{ - int count = 1000000; - Spinlock lock; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - lock.lock(); - lock.unlock(); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Helper for spawn_thread. This is the main function that the thread executes -// (intentionally empty). -class ThreadHelper : public Thread { - void *entry() { return 0; } -}; - -// Measure the cost of start and joining with a thread. -double spawn_thread() -{ - int count = 10000; - ThreadHelper thread; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - thread.create(); - thread.join(); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -class FakeContext : public Context { - public: - virtual void finish(int r) {} -}; - -// Measure the cost of starting and stopping a Dispatch::Timer. -double perf_timer() -{ - int count = 1000000; - Mutex lock("perf_timer::lock"); - SafeTimer timer(g_ceph_context, lock); - FakeContext **c = new FakeContext*[count]; - for (int i = 0; i < count; i++) { - c[i] = new FakeContext(); - } - uint64_t start = Cycles::rdtsc(); - Mutex::Locker l(lock); - for (int i = 0; i < count; i++) { - timer.add_event_after(12345, c[i]); - timer.cancel_event(c[i]); - } - uint64_t stop = Cycles::rdtsc(); - delete c; - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of throwing and catching an int. This uses an integer as -// the value thrown, which is presumably as fast as possible. -double throw_int() -{ - int count = 10000; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - try { - throw 0; - } catch (int) { // NOLINT - // pass - } - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of throwing and catching an int from a function call. -double throw_int_call() -{ - int count = 10000; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - try { - PerfHelper::throw_int(); - } catch (int) { // NOLINT - // pass - } - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of throwing and catching an Exception. This uses an actual -// exception as the value thrown, which may be slower than throwInt. -double throw_exception() -{ - int count = 10000; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - try { - throw buffer::end_of_buffer(); - } catch (const buffer::end_of_buffer&) { - // pass - } - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of throwing and catching an Exception from a function call. -double throw_exception_call() -{ - int count = 10000; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - try { - PerfHelper::throw_end_of_buffer(); - } catch (const buffer::end_of_buffer&) { - // pass - } - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// Measure the cost of pushing a new element on a std::vector, copying -// from the end to an internal element, and popping the end element. -double vector_push_pop() -{ - int count = 100000; - std::vector vector; - vector.push_back(1); - vector.push_back(2); - vector.push_back(3); - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - vector.push_back(i); - vector.push_back(i+1); - vector.push_back(i+2); - vector[2] = vector.back(); - vector.pop_back(); - vector[0] = vector.back(); - vector.pop_back(); - vector[1] = vector.back(); - vector.pop_back(); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/(count*3); -} - -// Measure the cost of ceph_clock_now -double perf_ceph_clock_now() -{ - int count = 100000; - uint64_t start = Cycles::rdtsc(); - for (int i = 0; i < count; i++) { - ceph_clock_now(g_ceph_context); - } - uint64_t stop = Cycles::rdtsc(); - return Cycles::to_seconds(stop - start)/count; -} - -// The following struct and table define each performance test in terms of -// a string name and a function that implements the test. -struct TestInfo { - const char* name; // Name of the performance test; this is - // what gets typed on the command line to - // run the test. - double (*func)(); // Function that implements the test; - // returns the time (in seconds) for each - // iteration of that test. - const char *description; // Short description of this test (not more - // than about 40 characters, so the entire - // test output fits on a single line). -}; -TestInfo tests[] = { - {"atomic_int_cmp", atomic_int_cmp, - "atomic_t::compare_and_swap"}, - {"atomic_int_inc", atomic_int_inc, - "atomic_t::inc"}, - {"atomic_int_read", atomic_int_read, - "atomic_t::read"}, - {"atomic_int_set", atomic_int_set, - "atomic_t::set"}, - {"mutex_nonblock", mutex_nonblock, - "Mutex lock/unlock (no blocking)"}, - {"buffer_basic", buffer_basic, - "buffer create, add one ptr, delete"}, - {"buffer_encode_decode", buffer_encode_decode, - "buffer create, encode/decode object, delete"}, - {"buffer_basic_copy", buffer_basic_copy, - "buffer create, copy small block, delete"}, - {"buffer_copy", buffer_copy, - "copy out 2 small ptrs from buffer"}, - {"buffer_encode10", buffer_encode, - "buffer encoding 10 structures onto existing ptr"}, - {"buffer_get_contiguous", buffer_get_contiguous, - "Buffer::get_contiguous"}, - {"buffer_iterator", buffer_iterator, - "iterate over buffer with 5 ptrs"}, - {"cond_ping_pong", cond_ping_pong, - "condition variable round-trip"}, - {"div32", div32, - "32-bit integer division instruction"}, - {"div64", div64, - "64-bit integer division instruction"}, - {"function_call", function_call, - "Call a function that has not been inlined"}, - {"eventcenter_poll", eventcenter_poll, - "EventCenter::process_events (no timers or events)"}, - {"eventcenter_dispatch", eventcenter_dispatch, - "EventCenter::dispatch_event_external latency"}, - {"memcpy100", memcpy100, - "Copy 100 bytes with memcpy"}, - {"memcpy1000", memcpy1000, - "Copy 1000 bytes with memcpy"}, - {"memcpy10000", memcpy10000, - "Copy 10000 bytes with memcpy"}, - {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>, - "rjenkins hash on 16 byte of data"}, - {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>, - "rjenkins hash on 256 bytes of data"}, - {"rdtsc", rdtsc_test, - "Read the fine-grain cycle counter"}, - {"cycles_to_seconds", perf_cycles_to_seconds, - "Convert a rdtsc result to (double) seconds"}, - {"cycles_to_seconds", perf_cycles_to_nanoseconds, - "Convert a rdtsc result to (uint64_t) nanoseconds"}, - {"prefetch", perf_prefetch, - "Prefetch instruction"}, - {"serialize", perf_serialize, - "serialize instruction"}, - {"lfence", lfence, - "Lfence instruction"}, - {"sfence", sfence, - "Sfence instruction"}, - {"spin_lock", test_spinlock, - "Acquire/release SpinLock"}, - {"spawn_thread", spawn_thread, - "Start and stop a thread"}, - {"perf_timer", perf_timer, - "Insert and cancel a SafeTimer"}, - {"throw_int", throw_int, - "Throw an int"}, - {"throw_int_call", throw_int_call, - "Throw an int in a function call"}, - {"throw_exception", throw_exception, - "Throw an Exception"}, - {"throw_exception_call", throw_exception_call, - "Throw an Exception in a function call"}, - {"vector_push_pop", vector_push_pop, - "Push and pop a std::vector"}, - {"ceph_clock_now", perf_ceph_clock_now, - "ceph_clock_now function"}, -}; - -/** - * Runs a particular test and prints a one-line result message. - * - * \param info - * Describes the test to run. - */ -void run_test(TestInfo& info) -{ - double secs = info.func(); - int width = printf("%-24s ", info.name); - if (secs < 1.0e-06) { - width += printf("%8.2fns", 1e09*secs); - } else if (secs < 1.0e-03) { - width += printf("%8.2fus", 1e06*secs); - } else if (secs < 1.0) { - width += printf("%8.2fms", 1e03*secs); - } else { - width += printf("%8.2fs", secs); - } - printf("%*s %s\n", 32-width, "", info.description); -} - -int main(int argc, char *argv[]) -{ - vector args; - argv_to_vec(argc, (const char **)argv, args); - - global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0); - common_init_finish(g_ceph_context); - - bind_thread_to_cpu(3); - if (argc == 1) { - // No test names specified; run all tests. - for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) { - run_test(tests[i]); - } - } else { - // Run only the tests that were specified on the command line. - for (int i = 1; i < argc; i++) { - bool found_test = false; - for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) { - if (strcmp(argv[i], tests[j].name) == 0) { - found_test = true; - run_test(tests[j]); - break; - } - } - if (!found_test) { - int width = printf("%-24s ??", argv[i]); - printf("%*s No such test\n", 32-width, ""); - } - } - } -} diff --git a/src/test/perf_local.cc b/src/test/perf_local.cc new file mode 100644 index 000000000000..ce0217c5d12d --- /dev/null +++ b/src/test/perf_local.cc @@ -0,0 +1,1009 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* Copyright (c) 2015 Haomai Wang + * Copyright (c) 2011 Facebook + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +// This program contains a collection of low-level performance measurements +// for Ceph, which can be run either individually or altogether. These +// tests measure performance in a single stand-alone process, not in a cluster +// with multiple servers. Invoke the program like this: +// +// Perf test1 test2 ... +// +// test1 and test2 are the names of individual performance measurements to +// run. If no test names are provided then all of the performance tests +// are run. +// +// To add a new test: +// * Write a function that implements the test. Use existing test functions +// as a guideline, and be sure to generate output in the same form as +// other tests. +// * Create a new entry for the test in the #tests table. +#include +#include +#include + +#include "include/atomic.h" +#include "include/buffer.h" +#include "include/encoding.h" +#include "include/ceph_hash.h" +#include "include/Spinlock.h" +#include "common/ceph_argparse.h" +#include "common/Cycles.h" +#include "common/Cond.h" +#include "common/Mutex.h" +#include "common/Thread.h" +#include "common/Timer.h" +#include "msg/async/Event.h" +#include "global/global_init.h" + +#include "test/perf_helper.h" + +using namespace ceph; + +/** + * Ask the operating system to pin the current thread to a given CPU. + * + * \param cpu + * Indicates the desired CPU and hyperthread; low order 2 bits + * specify CPU, next bit specifies hyperthread. + */ +void bind_thread_to_cpu(int cpu) +{ +#ifdef HAVE_SCHED + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(cpu, &set); + sched_setaffinity(0, sizeof(set), &set); +#endif +} + +/* + * This function just discards its argument. It's used to make it + * appear that data is used, so that the compiler won't optimize + * away the code we're trying to measure. + * + * \param value + * Pointer to arbitrary value; it's discarded. + */ +void discard(void* value) { + int x = *reinterpret_cast(value); + if (x == 0x43924776) { + printf("Value was 0x%x\n", x); + } +} + +//---------------------------------------------------------------------- +// Test functions start here +//---------------------------------------------------------------------- + +// Measure the cost of atomic_t::compare_and_swap +double atomic_int_cmp() +{ + int count = 1000000; + atomic_t value(11); + int test = 11; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + value.compare_and_swap(test, test+2); + test += 2; + } + uint64_t stop = Cycles::rdtsc(); + // printf("Final value: %d\n", value.load()); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of atomic_t::inc +double atomic_int_inc() +{ + int count = 1000000; + atomic_t value(11); + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + value.inc(); + } + uint64_t stop = Cycles::rdtsc(); + // printf("Final value: %d\n", value.load()); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of reading an atomic_t +double atomic_int_read() +{ + int count = 1000000; + atomic_t value(11); + int total = 0; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + total += value.read(); + } + uint64_t stop = Cycles::rdtsc(); + // printf("Total: %d\n", total); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of storing a new value in a atomic_t +double atomic_int_set() +{ + int count = 1000000; + atomic_t value(11); + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + value.set(88); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of acquiring and releasing a mutex in the +// fast case where the mutex is free. +double mutex_nonblock() +{ + int count = 1000000; + Mutex m("mutex_nonblock::m"); + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + m.Lock(); + m.Unlock(); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of allocating and deallocating a buffer, plus +// appending (logically) one ptr. +double buffer_basic() +{ + int count = 1000000; + uint64_t start = Cycles::rdtsc(); + bufferptr ptr("abcdefg", 7); + for (int i = 0; i < count; i++) { + bufferlist b; + b.append(ptr, 0, 5); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +struct DummyBlock { + int a, b, c, d; + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + ::encode(a, bl); + ::encode(b, bl); + ::encode(c, bl); + ::encode(d, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::iterator &bl) { + DECODE_START(1, bl); + ::decode(a, bl); + ::decode(b, bl); + ::decode(c, bl); + ::decode(d, bl); + DECODE_FINISH(bl); + } +}; +WRITE_CLASS_ENCODER(DummyBlock) + +// Measure the cost of encoding and decoding a buffer, plus +// allocating space for one chunk. +double buffer_encode_decode() +{ + int count = 1000000; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + bufferlist b; + DummyBlock dummy_block; + ::encode(dummy_block, b); + bufferlist::iterator iter = b.begin(); + ::decode(dummy_block, iter); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of allocating and deallocating a buffer, plus +// copying in a small block. +double buffer_basic_copy() +{ + int count = 1000000; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + bufferlist b; + b.append("abcdefg", 6); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of making a copy of parts of two ptrs. +double buffer_copy() +{ + int count = 1000000; + bufferlist b; + b.append("abcde", 5); + b.append("01234", 5); + char copy[10]; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + b.copy(2, 6, copy); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of allocating new space by extending the +// bufferlist +double buffer_encode() +{ + int count = 100000; + uint64_t total = 0; + for (int i = 0; i < count; i++) { + bufferlist b; + DummyBlock dummy_block; + ::encode(dummy_block, b); + uint64_t start = Cycles::rdtsc(); + ::encode(dummy_block, b); + ::encode(dummy_block, b); + ::encode(dummy_block, b); + ::encode(dummy_block, b); + ::encode(dummy_block, b); + ::encode(dummy_block, b); + ::encode(dummy_block, b); + ::encode(dummy_block, b); + ::encode(dummy_block, b); + ::encode(dummy_block, b); + total += Cycles::rdtsc() - start; + } + return Cycles::to_seconds(total)/(count*10); +} + +// Measure the cost of retrieving an object from the beginning of a buffer. +double buffer_get_contiguous() +{ + int count = 1000000; + int value = 11; + bufferlist b; + b.append((char*)&value, sizeof(value)); + int sum = 0; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + sum += *reinterpret_cast(b.get_contiguous(0, sizeof(value))); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of creating an iterator and iterating over 10 +// chunks in a buffer. +double buffer_iterator() +{ + bufferlist b; + const char s[] = "abcdefghijklmnopqrstuvwxyz"; + bufferptr ptr(s, sizeof(s)); + for (int i = 0; i < 5; i++) { + b.append(ptr, i, 5); + } + int count = 100000; + int sum = 0; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + bufferlist::iterator it = b.begin(); + while (!it.end()) { + sum += (static_cast(it.get_current_ptr().c_str()))[it.get_remaining()-1]; + ++it; + } + } + uint64_t stop = Cycles::rdtsc(); + discard(&sum); + return Cycles::to_seconds(stop - start)/count; +} + +// Implements the CondPingPong test. +class CondPingPong { + Mutex mutex; + Cond cond; + int prod; + int cons; + const int count; + + class Consumer : public Thread { + CondPingPong *p; + public: + Consumer(CondPingPong *p): p(p) {} + void* entry() { + p->consume(); + return 0; + } + } consumer; + + public: + CondPingPong(): mutex("CondPingPong::mutex"), prod(0), cons(0), count(10000), consumer(this) {} + + double run() { + consumer.create(); + uint64_t start = Cycles::rdtsc(); + produce(); + uint64_t stop = Cycles::rdtsc(); + consumer.join(); + return Cycles::to_seconds(stop - start)/count; + } + + void produce() { + Mutex::Locker l(mutex); + while (cons < count) { + while (cons < prod) + cond.Wait(mutex); + ++prod; + cond.Signal(); + } + } + + void consume() { + Mutex::Locker l(mutex); + while (cons < count) { + while (cons == prod) + cond.Wait(mutex); + ++cons; + cond.Signal(); + } + } +}; + +// Measure the cost of coordinating between threads using a condition variable. +double cond_ping_pong() +{ + return CondPingPong().run(); +} + +// Measure the cost of a 32-bit divide. Divides don't take a constant +// number of cycles. Values were chosen here semi-randomly to depict a +// fairly expensive scenario. Someone with fancy ALU knowledge could +// probably pick worse values. +double div32() +{ + int count = 1000000; + uint64_t start = Cycles::rdtsc(); + // NB: Expect an x86 processor exception is there's overflow. + uint32_t numeratorHi = 0xa5a5a5a5U; + uint32_t numeratorLo = 0x55aa55aaU; + uint32_t divisor = 0xaa55aa55U; + uint32_t quotient; + uint32_t remainder; + for (int i = 0; i < count; i++) { + __asm__ __volatile__("div %4" : + "=a"(quotient), "=d"(remainder) : + "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : + "cc"); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of a 64-bit divide. Divides don't take a constant +// number of cycles. Values were chosen here semi-randomly to depict a +// fairly expensive scenario. Someone with fancy ALU knowledge could +// probably pick worse values. +double div64() +{ + int count = 1000000; + // NB: Expect an x86 processor exception is there's overflow. + uint64_t start = Cycles::rdtsc(); + uint64_t numeratorHi = 0x5a5a5a5a5a5UL; + uint64_t numeratorLo = 0x55aa55aa55aa55aaUL; + uint64_t divisor = 0xaa55aa55aa55aa55UL; + uint64_t quotient; + uint64_t remainder; + for (int i = 0; i < count; i++) { + __asm__ __volatile__("divq %4" : + "=a"(quotient), "=d"(remainder) : + "a"(numeratorLo), "d"(numeratorHi), "r"(divisor) : + "cc"); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of calling a non-inlined function. +double function_call() +{ + int count = 1000000; + uint64_t x = 0; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + x = PerfHelper::plus_one(x); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the minimum cost of EventCenter::process_events, when there are no +// Pollers and no Timers. +double eventcenter_poll() +{ + int count = 1000000; + EventCenter center(g_ceph_context); + center.init(1000); + center.set_owner(pthread_self()); + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + center.process_events(0); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +class CenterWorker : public Thread { + CephContext *cct; + bool done; + + public: + EventCenter center; + CenterWorker(CephContext *c): cct(c), done(false), center(c) { + center.init(100); + } + void stop() { + done = true; + center.wakeup(); + } + void* entry() { + center.set_owner(pthread_self()); + bind_thread_to_cpu(2); + while (!done) + center.process_events(1000); + return 0; + } +}; + +class CountEvent: public EventCallback { + atomic_t *count; + + public: + CountEvent(atomic_t *atomic): count(atomic) {} + void do_request(int id) { + count->dec(); + } +}; + +double eventcenter_dispatch() +{ + int count = 100000; + + CenterWorker worker(g_ceph_context); + atomic_t flag(1); + worker.create(); + EventCallbackRef count_event(new CountEvent(&flag)); + + worker.center.dispatch_event_external(count_event); + // Start a new thread and wait for it to ready. + while (flag.read()) + usleep(100); + + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + flag.set(1); + worker.center.dispatch_event_external(count_event); + while (flag.read()) + ; + } + uint64_t stop = Cycles::rdtsc(); + worker.stop(); + worker.join(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of copying a given number of bytes with memcpy. +double memcpy_shared(size_t size) +{ + int count = 1000000; + char src[size], dst[size]; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + memcpy(dst, src, size); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +double memcpy100() +{ + return memcpy_shared(100); +} + +double memcpy1000() +{ + return memcpy_shared(1000); +} + +double memcpy10000() +{ + return memcpy_shared(10000); +} + +// Benchmark rjenkins hashing performance on cached data. +template +double ceph_str_hash_rjenkins() +{ + int count = 100000; + char buf[key_length]; + + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) + ceph_str_hash(CEPH_STR_HASH_RJENKINS, buf, sizeof(buf)); + uint64_t stop = Cycles::rdtsc(); + + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of reading the fine-grain cycle counter. +double rdtsc_test() +{ + int count = 1000000; + uint64_t start = Cycles::rdtsc(); + uint64_t total = 0; + for (int i = 0; i < count; i++) { + total += Cycles::rdtsc(); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of the Cycles::to_seconds method. +double perf_cycles_to_seconds() +{ + int count = 1000000; + double total = 0; + uint64_t cycles = 994261; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + total += Cycles::to_seconds(cycles); + } + uint64_t stop = Cycles::rdtsc(); + // printf("Result: %.4f\n", total/count); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of the Cylcles::toNanoseconds method. +double perf_cycles_to_nanoseconds() +{ + int count = 1000000; + uint64_t total = 0; + uint64_t cycles = 994261; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + total += Cycles::to_nanoseconds(cycles); + } + uint64_t stop = Cycles::rdtsc(); + // printf("Result: %lu\n", total/count); + return Cycles::to_seconds(stop - start)/count; +} + + +/** + * Prefetch the cache lines containing [object, object + numBytes) into the + * processor's caches. + * The best docs for this are in the Intel instruction set reference under + * PREFETCH. + * \param object + * The start of the region of memory to prefetch. + * \param num_bytes + * The size of the region of memory to prefetch. + */ +static inline void prefetch(const void *object, uint64_t num_bytes) +{ + uint64_t offset = reinterpret_cast(object) & 0x3fUL; + const char* p = reinterpret_cast(object) - offset; + for (uint64_t i = 0; i < offset + num_bytes; i += 64) + _mm_prefetch(p + i, _MM_HINT_T0); +} + +// Measure the cost of the prefetch instruction. +double perf_prefetch() +{ + uint64_t total_ticks = 0; + int count = 10; + char buf[16 * 64]; + uint64_t start, stop; + + for (int i = 0; i < count; i++) { + PerfHelper::flush_cache(); + start = Cycles::rdtsc(); + prefetch(&buf[576], 64); + prefetch(&buf[0], 64); + prefetch(&buf[512], 64); + prefetch(&buf[960], 64); + prefetch(&buf[640], 64); + prefetch(&buf[896], 64); + prefetch(&buf[256], 64); + prefetch(&buf[704], 64); + prefetch(&buf[320], 64); + prefetch(&buf[384], 64); + prefetch(&buf[128], 64); + prefetch(&buf[448], 64); + prefetch(&buf[768], 64); + prefetch(&buf[832], 64); + prefetch(&buf[64], 64); + prefetch(&buf[192], 64); + stop = Cycles::rdtsc(); + total_ticks += stop - start; + } + return Cycles::to_seconds(total_ticks) / count / 16; +} + +/** + * This function is used to seralize machine instructions so that no + * instructions that appear after it in the current thread can run before any + * instructions that appear before it. + * + * It is useful for putting around rdpmc instructions (to pinpoint cache + * misses) as well as before rdtsc instructions, to prevent time pollution from + * instructions supposed to be executing before the timer starts. + */ +static inline void serialize() { + uint32_t eax, ebx, ecx, edx; + __asm volatile("cpuid" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "a" (1U)); +} + +// Measure the cost of cpuid +double perf_serialize() { + int count = 1000000; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + serialize(); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of an lfence instruction. +double lfence() +{ + int count = 1000000; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + __asm__ __volatile__("lfence" ::: "memory"); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of an sfence instruction. +double sfence() +{ + int count = 1000000; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + __asm__ __volatile__("sfence" ::: "memory"); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of acquiring and releasing a SpinLock (assuming the +// lock is initially free). +double test_spinlock() +{ + int count = 1000000; + Spinlock lock; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + lock.lock(); + lock.unlock(); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Helper for spawn_thread. This is the main function that the thread executes +// (intentionally empty). +class ThreadHelper : public Thread { + void *entry() { return 0; } +}; + +// Measure the cost of start and joining with a thread. +double spawn_thread() +{ + int count = 10000; + ThreadHelper thread; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + thread.create(); + thread.join(); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +class FakeContext : public Context { + public: + virtual void finish(int r) {} +}; + +// Measure the cost of starting and stopping a Dispatch::Timer. +double perf_timer() +{ + int count = 1000000; + Mutex lock("perf_timer::lock"); + SafeTimer timer(g_ceph_context, lock); + FakeContext **c = new FakeContext*[count]; + for (int i = 0; i < count; i++) { + c[i] = new FakeContext(); + } + uint64_t start = Cycles::rdtsc(); + Mutex::Locker l(lock); + for (int i = 0; i < count; i++) { + timer.add_event_after(12345, c[i]); + timer.cancel_event(c[i]); + } + uint64_t stop = Cycles::rdtsc(); + delete c; + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of throwing and catching an int. This uses an integer as +// the value thrown, which is presumably as fast as possible. +double throw_int() +{ + int count = 10000; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + try { + throw 0; + } catch (int) { // NOLINT + // pass + } + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of throwing and catching an int from a function call. +double throw_int_call() +{ + int count = 10000; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + try { + PerfHelper::throw_int(); + } catch (int) { // NOLINT + // pass + } + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of throwing and catching an Exception. This uses an actual +// exception as the value thrown, which may be slower than throwInt. +double throw_exception() +{ + int count = 10000; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + try { + throw buffer::end_of_buffer(); + } catch (const buffer::end_of_buffer&) { + // pass + } + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of throwing and catching an Exception from a function call. +double throw_exception_call() +{ + int count = 10000; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + try { + PerfHelper::throw_end_of_buffer(); + } catch (const buffer::end_of_buffer&) { + // pass + } + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// Measure the cost of pushing a new element on a std::vector, copying +// from the end to an internal element, and popping the end element. +double vector_push_pop() +{ + int count = 100000; + std::vector vector; + vector.push_back(1); + vector.push_back(2); + vector.push_back(3); + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + vector.push_back(i); + vector.push_back(i+1); + vector.push_back(i+2); + vector[2] = vector.back(); + vector.pop_back(); + vector[0] = vector.back(); + vector.pop_back(); + vector[1] = vector.back(); + vector.pop_back(); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/(count*3); +} + +// Measure the cost of ceph_clock_now +double perf_ceph_clock_now() +{ + int count = 100000; + uint64_t start = Cycles::rdtsc(); + for (int i = 0; i < count; i++) { + ceph_clock_now(g_ceph_context); + } + uint64_t stop = Cycles::rdtsc(); + return Cycles::to_seconds(stop - start)/count; +} + +// The following struct and table define each performance test in terms of +// a string name and a function that implements the test. +struct TestInfo { + const char* name; // Name of the performance test; this is + // what gets typed on the command line to + // run the test. + double (*func)(); // Function that implements the test; + // returns the time (in seconds) for each + // iteration of that test. + const char *description; // Short description of this test (not more + // than about 40 characters, so the entire + // test output fits on a single line). +}; +TestInfo tests[] = { + {"atomic_int_cmp", atomic_int_cmp, + "atomic_t::compare_and_swap"}, + {"atomic_int_inc", atomic_int_inc, + "atomic_t::inc"}, + {"atomic_int_read", atomic_int_read, + "atomic_t::read"}, + {"atomic_int_set", atomic_int_set, + "atomic_t::set"}, + {"mutex_nonblock", mutex_nonblock, + "Mutex lock/unlock (no blocking)"}, + {"buffer_basic", buffer_basic, + "buffer create, add one ptr, delete"}, + {"buffer_encode_decode", buffer_encode_decode, + "buffer create, encode/decode object, delete"}, + {"buffer_basic_copy", buffer_basic_copy, + "buffer create, copy small block, delete"}, + {"buffer_copy", buffer_copy, + "copy out 2 small ptrs from buffer"}, + {"buffer_encode10", buffer_encode, + "buffer encoding 10 structures onto existing ptr"}, + {"buffer_get_contiguous", buffer_get_contiguous, + "Buffer::get_contiguous"}, + {"buffer_iterator", buffer_iterator, + "iterate over buffer with 5 ptrs"}, + {"cond_ping_pong", cond_ping_pong, + "condition variable round-trip"}, + {"div32", div32, + "32-bit integer division instruction"}, + {"div64", div64, + "64-bit integer division instruction"}, + {"function_call", function_call, + "Call a function that has not been inlined"}, + {"eventcenter_poll", eventcenter_poll, + "EventCenter::process_events (no timers or events)"}, + {"eventcenter_dispatch", eventcenter_dispatch, + "EventCenter::dispatch_event_external latency"}, + {"memcpy100", memcpy100, + "Copy 100 bytes with memcpy"}, + {"memcpy1000", memcpy1000, + "Copy 1000 bytes with memcpy"}, + {"memcpy10000", memcpy10000, + "Copy 10000 bytes with memcpy"}, + {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<16>, + "rjenkins hash on 16 byte of data"}, + {"ceph_str_hash_rjenkins", ceph_str_hash_rjenkins<256>, + "rjenkins hash on 256 bytes of data"}, + {"rdtsc", rdtsc_test, + "Read the fine-grain cycle counter"}, + {"cycles_to_seconds", perf_cycles_to_seconds, + "Convert a rdtsc result to (double) seconds"}, + {"cycles_to_seconds", perf_cycles_to_nanoseconds, + "Convert a rdtsc result to (uint64_t) nanoseconds"}, + {"prefetch", perf_prefetch, + "Prefetch instruction"}, + {"serialize", perf_serialize, + "serialize instruction"}, + {"lfence", lfence, + "Lfence instruction"}, + {"sfence", sfence, + "Sfence instruction"}, + {"spin_lock", test_spinlock, + "Acquire/release SpinLock"}, + {"spawn_thread", spawn_thread, + "Start and stop a thread"}, + {"perf_timer", perf_timer, + "Insert and cancel a SafeTimer"}, + {"throw_int", throw_int, + "Throw an int"}, + {"throw_int_call", throw_int_call, + "Throw an int in a function call"}, + {"throw_exception", throw_exception, + "Throw an Exception"}, + {"throw_exception_call", throw_exception_call, + "Throw an Exception in a function call"}, + {"vector_push_pop", vector_push_pop, + "Push and pop a std::vector"}, + {"ceph_clock_now", perf_ceph_clock_now, + "ceph_clock_now function"}, +}; + +/** + * Runs a particular test and prints a one-line result message. + * + * \param info + * Describes the test to run. + */ +void run_test(TestInfo& info) +{ + double secs = info.func(); + int width = printf("%-24s ", info.name); + if (secs < 1.0e-06) { + width += printf("%8.2fns", 1e09*secs); + } else if (secs < 1.0e-03) { + width += printf("%8.2fus", 1e06*secs); + } else if (secs < 1.0) { + width += printf("%8.2fms", 1e03*secs); + } else { + width += printf("%8.2fs", secs); + } + printf("%*s %s\n", 32-width, "", info.description); +} + +int main(int argc, char *argv[]) +{ + vector args; + argv_to_vec(argc, (const char **)argv, args); + + global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + bind_thread_to_cpu(3); + if (argc == 1) { + // No test names specified; run all tests. + for (size_t i = 0; i < sizeof(tests)/sizeof(TestInfo); ++i) { + run_test(tests[i]); + } + } else { + // Run only the tests that were specified on the command line. + for (int i = 1; i < argc; i++) { + bool found_test = false; + for (size_t j = 0; j < sizeof(tests)/sizeof(TestInfo); ++j) { + if (strcmp(argv[i], tests[j].name) == 0) { + found_test = true; + run_test(tests[j]); + break; + } + } + if (!found_test) { + int width = printf("%-24s ??", argv[i]); + printf("%*s No such test\n", 32-width, ""); + } + } + } +}