${async_rdma_common_srcs}
${dpdk_common_srcs}
msg/msg_types.cc
+ common/reverse.c
common/hobject.cc
osd/OSDMap.cc
osd/OSDMapMapping.cc
elseif(HAVE_POWER8)
list(APPEND libcommon_files
common/crc32c_ppc.c
- common/crc32c_ppc_asm.S)
+ common/crc32c_ppc_asm.S
+ common/crc32c_ppc_fast_zero_asm.S)
endif(HAVE_INTEL)
if(LINUX)
* 2 of the License, or (at your option) any later version.
*/
#define CRC_TABLE
+#define FAST_ZERO_TABLE
+
#include "acconfig.h"
#include "include/int_types.h"
#include "crc32c_ppc_constants.h"
+#include "reverse.h"
#include <stdlib.h>
#include <strings.h>
}
#endif
-
#ifdef HAVE_POWER8
+/*
+ * Multiply a by b with the POWER8 vpmsumw instruction and return
+ * element 0 of the resulting doubleword vector.  Only the first word lane
+ * of each operand vector is populated; the remaining three lanes are zero.
+ */
+static inline unsigned long polynomial_multiply(unsigned int a, unsigned int b)
+{
+  vector unsigned long product;
+  vector unsigned int lhs = {a, 0, 0, 0};
+  vector unsigned int rhs = {b, 0, 0, 0};
+
+  __asm__("vpmsumw %0,%1,%2"
+          : "=v"(product)
+          : "v"(lhs), "v"(rhs));
+
+  return product[0];
+}
+
+unsigned int barrett_reduction(unsigned long val);
+
+/*
+ * Multiply a by b with polynomial_multiply() and pass the product through
+ * barrett_reduction() to obtain the unsigned int result.
+ */
+static inline unsigned int gf_multiply(unsigned int a, unsigned int b)
+{
+  unsigned long product = polynomial_multiply(a, b);
+
+  return barrett_reduction(product);
+}
+
+/*
+ * Walk the bits of length from least to most significant; for every set
+ * bit i, replace crc with gf_multiply(crc, crc_zero[i]).  A length of 0
+ * returns crc unchanged.
+ */
+unsigned int append_zeros(unsigned int crc, unsigned long length)
+{
+  unsigned long bit;
+
+  for (bit = 0; length != 0; bit++, length >>= 1) {
+    if ((length & 1) == 0) {
+      continue;
+    }
+    crc = gf_multiply(crc, crc_zero[bit]);
+  }
+
+  return crc;
+}
+
+
unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p,
unsigned long len);
}
/* This wrapper function works around the fact that crc32_vpmsum
- * does not gracefully handle the case where the data pointer is NULL. There
- * may be room for performance improvement here.
+ * does not gracefully handle the case where the data pointer is NULL.
*/
uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len)
{
- unsigned char *buf2;
-
if (!data) {
- buf2 = malloc(len);
- bzero(buf2, len);
- crc = crc32_vpmsum(crc, buf2, len);
- free(buf2);
+ /* Handle the NULL buffer case. */
+#ifdef REFLECT
+ crc = reverse_bits(crc);
+#endif
+
+ crc = append_zeros(crc, len);
+
+#ifdef REFLECT
+ crc = reverse_bits(crc);
+#endif
} else {
+ /* Handle the valid buffer case. */
crc = crc32_vpmsum(crc, data, (unsigned long)len);
}
return crc;
0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,};
#endif
+
+#ifdef FAST_ZERO_TABLE
+/*
+ * Fast zero table: per-bit multipliers consumed by append_zeros(), which
+ * multiplies the CRC by crc_zero[i] for every set bit i of the length.
+ * The array holds 64 entries; from index 31 onwards the values repeat the
+ * sequence that starts at index 0.
+ * NOTE(review): the leading entries (0x100, 0x10000, 0x1edc6f41) look like
+ * x^8, x^16 and x^32 reduced by the CRC32C polynomial, i.e. entry i would
+ * be x^(8 * 2^i) mod p -- confirm against the table generator.
+ */
+unsigned int crc_zero[] = {
+ 0x100,
+ 0x10000,
+ 0x1edc6f41,
+ 0x3aab4576,
+ 0x18571d18,
+ 0x59a3508a,
+ 0xaa97d41d,
+ 0xe78dbf1d,
+ 0x4ef6a711,
+ 0x2506c32e,
+ 0x68d4e827,
+ 0x546ea6b0,
+ 0x465cebac,
+ 0x26a86214,
+ 0x964aa2fd,
+ 0x3b4c5747,
+ 0x6702ee7f,
+ 0xd086629f,
+ 0xf1f2043c,
+ 0xc761a1ca,
+ 0xa8964e9a,
+ 0x90cab2ce,
+ 0xc6e3583d,
+ 0x3344e0be,
+ 0x7d53914b,
+ 0x3d953297,
+ 0xfcf2eda0,
+ 0x42f878a5,
+ 0x2,
+ 0x4,
+ 0x10,
+ 0x100,
+ 0x10000,
+ 0x1edc6f41,
+ 0x3aab4576,
+ 0x18571d18,
+ 0x59a3508a,
+ 0xaa97d41d,
+ 0xe78dbf1d,
+ 0x4ef6a711,
+ 0x2506c32e,
+ 0x68d4e827,
+ 0x546ea6b0,
+ 0x465cebac,
+ 0x26a86214,
+ 0x964aa2fd,
+ 0x3b4c5747,
+ 0x6702ee7f,
+ 0xd086629f,
+ 0xf1f2043c,
+ 0xc761a1ca,
+ 0xa8964e9a,
+ 0x90cab2ce,
+ 0xc6e3583d,
+ 0x3344e0be,
+ 0x7d53914b,
+ 0x3d953297,
+ 0xfcf2eda0,
+ 0x42f878a5,
+ 0x2,
+ 0x4,
+ 0x10,
+ 0x100,
+ 0x10000
+};
+#endif
+
#else
#define MAX_SIZE 32768
.constants:
--- /dev/null
+/*
+ * Use the fixed point version of Barrett reduction to compute a mod n
+ * over GF(2) for given n using POWER8 instructions. We use k = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of either:
+ *
+ * a) the GNU General Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your option)
+ * any later version, or
+ * b) the Apache License, Version 2.0
+ */
+#include <ppc-asm.h>
+#include "common/ppc-opcode.h"
+
+#undef toc
+
+#ifndef r1
+#define r1 1
+#endif
+
+#ifndef r2
+#define r2 2
+#endif
+
+ .section .data
+.balign 16
+
+.barrett_fz_constants:
+ /* Barrett constant m - (4^32)/n */
+ .octa 0x0000000000000000000000011f91caf6 /* x^64 div p(x) */
+ /* Barrett constant n */
+ .octa 0x0000000000000000000000011edc6f41
+
+.text
+/*
+ * unsigned int barrett_reduction(unsigned long val)
+ *
+ * Takes val in r3, reduces it using the constants m and n stored at
+ * .barrett_fz_constants, and hands the result back in r3.
+ */
+FUNC_START(barrett_reduction)
+ addis r4,r2,.barrett_fz_constants@toc@ha
+ addi r4,r4,.barrett_fz_constants@toc@l /* r4 = TOC-relative address of the constants */
+
+ li r5,16 /* byte offset of the second constant (n), used by the lvx below */
+ vxor v1,v1,v1 /* zero v1 */
+
+ /* Get a into v0 */
+ MTVRD(v0, r3)
+ vsldoi v0,v1,v0,8 /* shift into bottom 64 bits, this is a */
+
+ /* Load constants */
+ lvx v2,0,r4 /* m */
+ lvx v3,r5,r4 /* n */
+
+ /*
+ * Now for the actual algorithm. The idea is to calculate q,
+ * the multiple of our polynomial that we need to subtract. By
+ * doing the computation 2x bits higher (ie 64 bits) and shifting the
+ * result back down 2x bits, we round down to the nearest multiple.
+ */
+ VPMSUMD(v4,v0,v2) /* ma */
+ vsldoi v4,v1,v4,8 /* q = floor(ma/(2^64)) */
+ VPMSUMD(v4,v4,v3) /* qn */
+ vxor v0,v0,v4 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Get the result into r3. We need to shift it left 8 bytes:
+ * V0 [ 0 1 2 X ]
+ * V0 [ 0 X 2 3 ]
+ */
+ vsldoi v0,v0,v1,8 /* shift result into top 64 bits of v0 */
+ MFVRD(r3, v0)
+
+ blr
+FUNC_END(barrett_reduction)
+
#include "json_spirit/json_spirit_value.h"
#include "include/assert.h" // spirit clobbers it!
+#include "reverse.h"
+
namespace ceph {
class Formatter;
}
}
static uint32_t _reverse_bits(uint32_t v) {
- if (v == 0)
- return v;
- // reverse bits
- // swap odd and even bits
- v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
- // swap consecutive pairs
- v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
- // swap nibbles ...
- v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
- // swap bytes
- v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
- // swap 2-byte long pairs
- v = ( v >> 16 ) | ( v << 16);
- return v;
+ return reverse_bits(v); /* forwards to reverse_bits(), which reverse.h declares */
}
static uint32_t _reverse_nibbles(uint32_t retval) {
- // reverse nibbles
- retval = ((retval & 0x0f0f0f0f) << 4) | ((retval & 0xf0f0f0f0) >> 4);
- retval = ((retval & 0x00ff00ff) << 8) | ((retval & 0xff00ff00) >> 8);
- retval = ((retval & 0x0000ffff) << 16) | ((retval & 0xffff0000) >> 16);
- return retval;
+ return reverse_nibbles(retval); /* forwards to reverse_nibbles(), which reverse.h declares */
}
/**
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "reverse.h"
+
+/*
+ * Return v with its 32 bits in reverse order (bit 0 trades places with
+ * bit 31, bit 1 with bit 30, and so on).  Zero is returned as is.
+ */
+uint32_t reverse_bits(uint32_t v)
+{
+  if (v != 0) {
+    /* exchange the two 16-bit halves */
+    v = (v << 16) | (v >> 16);
+    /* exchange the bytes inside each half */
+    v = ((v & 0x00FF00FF) << 8) | ((v & 0xFF00FF00) >> 8);
+    /* exchange the nibbles inside each byte */
+    v = ((v & 0x0F0F0F0F) << 4) | ((v & 0xF0F0F0F0) >> 4);
+    /* exchange the bit pairs inside each nibble */
+    v = ((v & 0x33333333) << 2) | ((v & 0xCCCCCCCC) >> 2);
+    /* exchange neighbouring bits */
+    v = ((v & 0x55555555) << 1) | ((v & 0xAAAAAAAA) >> 1);
+  }
+  return v;
+}
+
+/*
+ * Return retval with its eight 4-bit nibbles in reverse order; the bits
+ * inside each nibble keep their order.
+ */
+uint32_t reverse_nibbles(uint32_t retval)
+{
+  /* swap the 16-bit halves */
+  retval = (retval >> 16) | (retval << 16);
+  /* swap the bytes within each half */
+  retval = ((retval >> 8) & 0x00ff00ff) | ((retval << 8) & 0xff00ff00);
+  /* swap the nibbles within each byte */
+  retval = ((retval >> 4) & 0x0f0f0f0f) | ((retval << 4) & 0xf0f0f0f0);
+  return retval;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __CEPH_OS_REVERSE_H
+#define __CEPH_OS_REVERSE_H
+
+#include "include/int_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t reverse_bits(uint32_t v); /* returns v with its 32 bits in reverse order */
+extern uint32_t reverse_nibbles(uint32_t retval); /* returns retval with its eight 4-bit nibbles in reverse order */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
/**
* calculate crc32c for data that is entirely 0 (ZERO)
*
- * Note: works the same as \ref ceph_crc32c for data == nullptr, but faster
+ * Note: produces the same result as ceph_crc32c_func with data == nullptr,
+ * but is faster than the optimized assembly on certain architectures:
+ * it is faster than the Intel optimized assembly, but not as fast as
+ * the ppc64le optimized assembly.
*
* @param crc initial value
* @param length length of buffer
*/
static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length)
{
+#ifndef HAVE_POWER8 /* when HAVE_POWER8 is defined, every call goes straight to ceph_crc32c_func */
if (!data && length > 16) /* NULL data with length > 16 takes the ceph_crc32c_zeros shortcut */
return ceph_crc32c_zeros(crc, length);
- return ceph_crc32c_func(crc, data, length);
+#endif /* HAVE_POWER8 */
+
+ return ceph_crc32c_func(crc, data, length);
}
#ifdef __cplusplus
pre_start = ceph_clock_now();
start = ceph_clock_now();
+#ifdef HAVE_POWER8
+ uint32_t crc_b = ceph_crc32c_zeros(111, size);
+#else
uint32_t crc_b = ceph_crc32c_func(111, nullptr, size);
+#endif
end = ceph_clock_now();
time_adjusted = (end - start) - (start - pre_start);
+#ifdef HAVE_POWER8
+ std::cout << "ceph_crc32c_zeros method. size=" << size << " time="
+ << (double)(end-start) << " at " << (double)size/(1024*1024)/(time_adjusted)
+ << " MB/sec" << " error=" << resolution / time_adjusted * 100 << "%"
+ << std::endl;
+#else
std::cout << "fallback method. size=" << size << " time=" << (double)(end-start)
<< " at " << (double)size/(1024*1024)/(time_adjusted) << " MB/sec"
<< " error=" << resolution / time_adjusted * 100 << "%" << std::endl;
+#endif
EXPECT_EQ(crc_a, crc_b);
}
}
start = ceph_clock_now();
for (size_t i=0; i<ITER; i++)
- for (size_t scale=1; scale < 31; scale++)
{
- size_t size = (1<<scale) + rand() % (1<<scale);
- ceph_crc32c(rand(), nullptr, size);
+ for (size_t scale=1; scale < 31; scale++)
+ {
+ size_t size = (1<<scale) + rand() % (1<<scale);
+ ceph_crc32c(rand(), nullptr, size);
+ }
}
end = ceph_clock_now();
std::cout << "iterations="<< ITER*31 << " time=" << (double)(end-start) << std::endl;