From: Sage Weil
Date: Thu, 20 Aug 2015 19:50:24 +0000 (-0400)
Subject: buffer: move inline memory ops to inline_memory.h; gcc + x86_64 only
X-Git-Tag: v9.1.0~296^2~3
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=566907779d4e63da89d15aa9d9d2d62ef7ba13d2;p=ceph.git

buffer: move inline memory ops to inline_memory.h; gcc + x86_64 only

Keep the architecture-sensitive code in a separate header. Avoid
duplicating the unrolled memcpy in each buffer.cc method.

Signed-off-by: Sage Weil
---

diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index f2148fdcdb50..eabc9e978a00 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -24,6 +24,7 @@
 #include "common/RWLock.h"
 #include "include/types.h"
 #include "include/compat.h"
+#include "include/inline_memory.h"
 #if defined(HAVE_XIO)
 #include "msg/xio/XioMsg.h"
 #endif
@@ -37,10 +38,6 @@
 #include
 
 namespace ceph {
-#if defined(__GNUC__) && defined(__x86_64__)
-  typedef unsigned uint128_t __attribute__ ((mode (TI)));
-#endif
-
 #ifdef BUFFER_DEBUG
 static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
 # define bdout { simple_spin_lock(&buffer_debug_lock); std::cout
@@ -786,32 +783,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     if (o+l > _len)
       throw end_of_buffer();
     char* src = _raw->data + _off + o;
-    if (l > 8) {
-      memcpy(dest, src, l);
-      return;
-    }
-    switch (l) {
-    case 8:
-      *((uint64_t*)(dest)) = *((uint64_t*)(src));
-      return;
-    case 4:
-      *((uint32_t*)(dest)) = *((uint32_t*)(src));
-      return;
-    case 3:
-      *((uint16_t*)(dest)) = *((uint16_t*)(src));
-      *((uint8_t*)(dest+2)) = *((uint8_t*)(src+2));
-      return;
-    case 2:
-      *((uint16_t*)(dest)) = *((uint16_t*)(src));
-      return;
-    case 1:
-      *((uint8_t*)(dest)) = *((uint8_t*)(src));
-      return;
-    default:
-      memcpy(dest, src, l);
-      return;
-    }
-  }
+    maybe_inline_memcpy(dest, src, l, 8);
+  }
 
   unsigned buffer::ptr::wasted()
   {
@@ -836,50 +809,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
 
   bool buffer::ptr::is_zero() const
   {
-    const char* data = c_str();
-    const char* max = data + _len;
-    const char* max32 = data + (_len / sizeof(uint32_t))*sizeof(uint32_t);
-#if defined(__GNUC__) && defined(__x86_64__)
-    // we do have XMM registers in x86-64, so if we need to check at least
-    // 16 bytes, make use of them
-    int left = _len;
-    if (left / sizeof(uint128_t) > 0) {
-      // align data pointer to 16 bytes, otherwise it'll segfault due to bug
-      // in (at least some) GCC versions (using MOVAPS instead of MOVUPS).
-      // check up to 15 first bytes while at it.
-      while (((unsigned long long)data) & 15) {
-        if (*(uint8_t*)data != 0) {
-          return false;
-        }
-        data += sizeof(uint8_t);
-        left--;
-      }
-
-      const char* max128 = data + (left / sizeof(uint128_t))*sizeof(uint128_t);
-
-      while (data < max128) {
-        if (*(uint128_t*)data != 0) {
-          return false;
-        }
-        data += sizeof(uint128_t);
-      }
-    }
-#endif
-    while (data < max32) {
-      if (*(uint32_t*)data != 0) {
-        return false;
-      }
-      data += sizeof(uint32_t);
-    }
-
-    while (data < max) {
-      if (*(uint8_t*)data != 0) {
-        return false;
-      }
-      data += sizeof(uint8_t);
-    }
-
-    return true;
+    return mem_is_zero(c_str(), _len);
   }
 
   unsigned buffer::ptr::append(char c)
   {
@@ -897,47 +827,8 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     assert(_raw);
     assert(l <= unused_tail_length());
     char* c = _raw->data + _off + _len;
-    if (l <= 32) {
-      _len += l;
-      switch (l) {
-      case 16:
-        *((uint64_t*)(c)) = *((uint64_t*)(p));
-        *((uint64_t*)(c+sizeof(uint64_t))) = *((uint64_t*)(p+sizeof(uint64_t)));
-        return _len + _off;
-      case 8:
-        *((uint64_t*)(c)) = *((uint64_t*)(p));
-        return _len + _off;
-      case 4:
-        *((uint32_t*)(c)) = *((uint32_t*)(p));
-        return _len + _off;
-      case 2:
-        *((uint16_t*)(c)) = *((uint16_t*)(p));
-        return _len + _off;
-      case 1:
-        *((uint8_t*)(c)) = *((uint8_t*)(p));
-        return _len + _off;
-      }
-      int cursor = 0;
-      while (l >= sizeof(uint64_t)) {
-        *((uint64_t*)(c + cursor)) = *((uint64_t*)(p + cursor));
-        cursor += sizeof(uint64_t);
-        l -= sizeof(uint64_t);
-      }
-      while (l >= sizeof(uint32_t)) {
-        *((uint32_t*)(c + cursor)) = *((uint32_t*)(p + cursor));
-        cursor += sizeof(uint32_t);
-        l -= sizeof(uint32_t);
-      }
-      while (l > 0) {
-        *(c+cursor) = *(p+cursor);
-        cursor++;
-        l--;
-      }
-    }
-    else {
-      memcpy(c, p, l);
-      _len += l;
-    }
+    maybe_inline_memcpy(c, p, l, 32);
+    _len += l;
     return _len + _off;
   }
 
@@ -949,46 +840,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     char* dest = _raw->data + _off + o;
     if (crc_reset)
       _raw->invalidate_crc();
-    if (l < 64) {
-      switch (l) {
-      case 1:
-        *((uint8_t*)(dest)) = *((uint8_t*)(src));
-        return;
-      case 2:
-        *((uint16_t*)(dest)) = *((uint16_t*)(src));
-        return;
-      case 3:
-        *((uint16_t*)(dest)) = *((uint16_t*)(src));
-        *((uint8_t*)(dest+2)) = *((uint8_t*)(src+2));
-        return;
-      case 4:
-        *((uint32_t*)(dest)) = *((uint32_t*)(src));
-        return;
-      case 8:
-        *((uint64_t*)(dest)) = *((uint64_t*)(src));
-        return;
-      default:
-        int cursor = 0;
-        while (l >= sizeof(uint64_t)) {
-          *((uint64_t*)(dest + cursor)) = *((uint64_t*)(src + cursor));
-          cursor += sizeof(uint64_t);
-          l -= sizeof(uint64_t);
-        }
-        while (l >= sizeof(uint32_t)) {
-          *((uint32_t*)(dest + cursor)) = *((uint32_t*)(src + cursor));
-          cursor += sizeof(uint32_t);
-          l -= sizeof(uint32_t);
-        }
-        while (l > 0) {
-          *(dest + cursor) = *(src + cursor);
-          cursor++;
-          l--;
-        }
-        return;
-      }
-    } else {
-      memcpy(dest, src, l);
-    }
+    maybe_inline_memcpy(dest, src, l, 64);
   }
 
   void buffer::ptr::zero(bool crc_reset)
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
index b3ceb24bf1a5..56fa49ecf8d0 100644
--- a/src/include/Makefile.am
+++ b/src/include/Makefile.am
@@ -68,6 +68,7 @@ noinst_HEADERS += \
 	include/filepath.h \
 	include/frag.h \
 	include/hash.h \
+	include/inline_memory.h \
 	include/intarith.h \
 	include/interval_set.h \
 	include/int_types.h \
diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h
new file mode 100644
index 000000000000..6e08e420e535
--- /dev/null
+++ b/src/include/inline_memory.h
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_INLINE_MEMORY_H
+#define CEPH_INLINE_MEMORY_H
+
+// only define these for x86_64 for now.
+
+#if defined(__GNUC__) && defined(__x86_64__)
+
+typedef unsigned uint128_t __attribute__ ((mode (TI)));
+
+// optimize for the common case, which is very small copies
+static inline void maybe_inline_memcpy(char *dest, const char *src, size_t l,
+                                       size_t inline_len)
+  __attribute__((always_inline));
+
+void maybe_inline_memcpy(char *dest, const char *src, size_t l,
+                         size_t inline_len)
+{
+  if (l > inline_len) {
+    memcpy(dest, src, l);
+    return;
+  }
+  switch (l) {
+  case 8:
+    *((uint64_t*)(dest)) = *((uint64_t*)(src));
+    return;
+  case 4:
+    *((uint32_t*)(dest)) = *((uint32_t*)(src));
+    return;
+  case 3:
+    *((uint16_t*)(dest)) = *((uint16_t*)(src));
+    *((uint8_t*)(dest+2)) = *((uint8_t*)(src+2));
+    return;
+  case 2:
+    *((uint16_t*)(dest)) = *((uint16_t*)(src));
+    return;
+  case 1:
+    *((uint8_t*)(dest)) = *((uint8_t*)(src));
+    return;
+  default:
+    int cursor = 0;
+    while (l >= sizeof(uint64_t)) {
+      *((uint64_t*)(dest + cursor)) = *((uint64_t*)(src + cursor));
+      cursor += sizeof(uint64_t);
+      l -= sizeof(uint64_t);
+    }
+    while (l >= sizeof(uint32_t)) {
+      *((uint32_t*)(dest + cursor)) = *((uint32_t*)(src + cursor));
+      cursor += sizeof(uint32_t);
+      l -= sizeof(uint32_t);
+    }
+    while (l > 0) {
+      *(dest + cursor) = *(src + cursor);
+      cursor++;
+      l--;
+    }
+  }
+}
+
+static inline bool mem_is_zero(const char *data, size_t len)
+  __attribute__((always_inline));
+
+bool mem_is_zero(const char *data, size_t len)
+{
+  const char *max = data + len;
+  const char* max32 = data + (len / sizeof(uint32_t))*sizeof(uint32_t);
+#if defined(__GNUC__) && defined(__x86_64__)
+  // we do have XMM registers in x86-64, so if we need to check at least
+  // 16 bytes, make use of them
+  int left = len;
+  if (left / sizeof(uint128_t) > 0) {
+    // align data pointer to 16 bytes, otherwise it'll segfault due to bug
+    // in (at least some) GCC versions (using MOVAPS instead of MOVUPS).
+    // check up to 15 first bytes while at it.
+    while (((unsigned long long)data) & 15) {
+      if (*(uint8_t*)data != 0) {
+        return false;
+      }
+      data += sizeof(uint8_t);
+      left--;
+    }
+
+    const char* max128 = data + (left / sizeof(uint128_t))*sizeof(uint128_t);
+
+    while (data < max128) {
+      if (*(uint128_t*)data != 0) {
+        return false;
+      }
+      data += sizeof(uint128_t);
+    }
+  }
+#endif
+  while (data < max32) {
+    if (*(uint32_t*)data != 0) {
+      return false;
+    }
+    data += sizeof(uint32_t);
+  }
+  while (data < max) {
+    if (*(uint8_t*)data != 0) {
+      return false;
+    }
+    data += sizeof(uint8_t);
+  }
+  return true;
+}
+
+#else // x86_64
+
+// on other architectures, default to something simple.
+
+#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l)
+
+static inline bool mem_is_zero(const char *data, size_t len) {
+  const char *end = data + len;
+  while (data < end) {
+    if (*data != 0) {
+      return false;
+    }
+    ++data;
+  }
+  return true;
+}
+
+#endif // !x86_64
+
+#endif
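
Not part of the commit, just a reader's note: a minimal standalone sketch of
how the two new helpers are intended to be called. The file name
check_inline_memory.cc and the build assumption (compiled from the top of the
ceph tree with -Isrc so that "include/inline_memory.h" resolves) are made up
for illustration; <cstring> and <cstdint> are pulled in first because
inline_memory.h itself relies on memcpy() and the fixed-width integer types
already being visible.

  // check_inline_memory.cc -- hypothetical smoke test, not part of the tree
  #include <cstring>
  #include <cstdint>
  #include <cassert>
  #include "include/inline_memory.h"

  int main()
  {
    char src[64], dst[64];
    memset(src, 0, sizeof(src));
    memset(dst, 0xff, sizeof(dst));

    // l <= inline_len: on gcc + x86_64 this takes the unrolled switch;
    // on other architectures the macro fallback is a plain memcpy().
    maybe_inline_memcpy(dst, src, 8, 32);
    assert(memcmp(dst, src, 8) == 0);

    // l > inline_len: falls back to memcpy() on every architecture.
    maybe_inline_memcpy(dst, src, sizeof(src), 32);
    assert(memcmp(dst, src, sizeof(src)) == 0);

    // mem_is_zero() scans in 16-, 4- and 1-byte strides on x86_64,
    // and byte by byte elsewhere.
    assert(mem_is_zero(src, sizeof(src)));
    src[63] = 1;
    assert(!mem_is_zero(src, sizeof(src)));
    return 0;
  }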