* http://en.wikipedia.org/wiki/Barrett_reduction
*
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
- * Copyright (C) 2017 International Business Machines Corp.
- * All rights reserved.
*
* This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * modify it under the terms of either:
+ *
+ * a) the GNU General Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your option)
+ * any later version, or
+ * b) the Apache License, Version 2.0
*/
+
+#if defined (__clang__)
+#ifndef __ALTIVEC__
+#define __ALTIVEC__
+#endif
+#include "ppc-asm.h"
+#else
#include <ppc-asm.h>
-#include "common/ppc-opcode.h"
+#endif
+#include "ppc-opcode.h"
#undef toc
/* byte reverse permute constant */
.octa 0x0F0E0D0C0B0A09080706050403020100
-#define __ASSEMBLY__
+#ifdef CRC32_CONSTANTS_HEADER
+#include CRC32_CONSTANTS_HEADER
+#else
#include "crc32c_ppc_constants.h"
+#endif
.text
#define VPERM(A, B, C, D)
#endif
+#ifndef CRC32_FUNCTION_ASM
+#define CRC32_FUNCTION_ASM __crc32_vpmsum
+#endif
+
/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
-FUNC_START(__crc32_vpmsum)
+FUNC_START(CRC32_FUNCTION_ASM)
std r31,-8(r1)
std r30,-16(r1)
std r29,-24(r1)
mr r3,r10
b .Lout
-FUNC_END(__crc32_vpmsum)
+FUNC_END(CRC32_FUNCTION_ASM)
/*
* Use the fixed point version of Barrett reduction to compute a mod n
- * over GF(2) for given n using POWER8 instructions. We use k = 32.
+ * over GF(2) for n = 0x104c11db7 using POWER8 instructions. We use k = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
*
* any later version, or
* b) the Apache License, Version 2.0
*/
-#include <ppc-asm.h>
-#include "common/ppc-opcode.h"
-
-#undef toc
-#ifndef r1
-#define r1 1
+#if defined (__clang__)
+#ifndef __ALTIVEC__
+#define __ALTIVEC__
#endif
-
-#ifndef r2
-#define r2 2
+#include "ppc-asm.h"
+#else
+#include <ppc-asm.h>
#endif
+#include "ppc-opcode.h"
.section .data
.balign 16
-
-.barrett_fz_constants:
+.constants:
/* Barrett constant m - (4^32)/n */
- .octa 0x0000000000000000000000011f91caf6 /* x^64 div p(x) */
+ .octa 0x00000000000000000000000104d101df
+
/* Barrett constant n */
- .octa 0x0000000000000000000000011edc6f41
+ .octa 0x00000000000000000000000104c11db7
+
+.bit_reflected_constants:
+ /* 33 bit reflected Barrett constant m - (4^32)/n */
+ .octa 0x000000000000000000000001f7011641
+
+ /* 33 bit reflected Barrett constant n */
+ .octa 0x000000000000000000000001db710641
+
+ .text
-.text
/* unsigned int barrett_reduction(unsigned long val) */
FUNC_START(barrett_reduction)
- addis r4,r2,.barrett_fz_constants@toc@ha
- addi r4,r4,.barrett_fz_constants@toc@l
+ lis r4,.constants@ha
+ la r4,.constants@l(r4)
li r5,16
vxor v1,v1,v1 /* zero v1 */
blr
FUNC_END(barrett_reduction)
-
+
+/*
+ * unsigned int barrett_reduction_reflected(unsigned long val)
+ *
+ * Bit-reflected Barrett reduction of val, using the two 16-byte constants
+ * (m, then n) stored at .bit_reflected_constants.
+ *
+ * In:   r3 = val
+ * Out:  r3 = value moved out of v0 by MFVRD after the final shift
+ * Uses: r4 (constants pointer), r5 (offset 16), v0-v6
+ */
+FUNC_START(barrett_reduction_reflected)
+ /*
+ * r4 = address of .bit_reflected_constants, built from its @ha/@l halves.
+ * NOTE(review): lis/la forms an absolute address from two 16-bit halves;
+ * confirm this is acceptable for 64-bit and position-independent builds.
+ */
+ lis r4,.bit_reflected_constants@ha
+ la r4,.bit_reflected_constants@l(r4)
+
+ li r5,16 /* byte offset of the second constant (n) */
+ vxor v1,v1,v1 /* zero v1 */
+
+ /* Get a into v0 */
+ MTVRD(v0, r3)
+ vsldoi v0,v1,v0,8 /* shift into bottom 64 bits, this is a */
+
+ /* Load constants */
+ lvx v2,0,r4 /* m */
+ lvx v3,r5,r4 /* n */
+
+ vspltisw v5,-1 /* all ones */
+ vsldoi v6,v1,v5,4 /* bitmask with low 32 bits set */
+
+ /*
+ * Now for the Barrett reduction algorithm. Instead of bit reflecting
+ * our data (which is expensive to do), we bit reflect our constants
+ * and our algorithm, which means the intermediate data in our vector
+ * registers goes from 0-63 instead of 63-0. We can reflect the
+ * algorithm because we don't carry in mod 2 arithmetic.
+ */
+ vand v4,v0,v6 /* bottom 32 bits of a */
+ VPMSUMD(v4,v4,v2) /* ma */
+ vand v4,v4,v6 /* bottom 32 bits of ma */
+ VPMSUMD(v4,v4,v3) /* qn */
+ vxor v0,v0,v4 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Since we are bit reflected, the result (i.e. the low 32 bits) is in
+ * the high 32 bits of the low doubleword. Shifting v0 left by 4 bytes
+ * (with zeroes from v1 shifted in) moves it from word 2 to word 1:
+ * V0 [ 0 1 X 3 ]
+ * V0 [ 0 X 2 3 ]
+ */
+ vsldoi v0,v0,v1,4 /* shift result into top 64 bits of v0 */
+ MFVRD(r3, v0) /* return value: move it out of v0 into r3 */
+
+ blr
+FUNC_END(barrett_reduction_reflected)
--- /dev/null
+/* PowerPC asm definitions for GNU C.
+
+Copyright (C) 2002-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+/* Under winnt, 1) gas supports the following as names and 2) in particular
+ defining "toc" breaks the FUNC_START macro as ".toc" becomes ".2" */
+
+/*
+ * Symbolic names for general purpose registers. Each name expands to the
+ * bare register number. Registers 1 and 2 are named "sp" and "toc" here;
+ * no r1/r2 aliases are defined in this list.
+ */
+#define r0 0
+#define sp 1
+#define toc 2
+#define r3 3
+#define r4 4
+#define r5 5
+#define r6 6
+#define r7 7
+#define r8 8
+#define r9 9
+#define r10 10
+#define r11 11
+#define r12 12
+#define r13 13
+#define r14 14
+#define r15 15
+#define r16 16
+#define r17 17
+#define r18 18
+#define r19 19
+#define r20 20
+#define r21 21
+#define r22 22
+#define r23 23
+#define r24 24
+#define r25 25
+#define r26 26
+#define r27 27
+#define r28 28
+#define r29 29
+#define r30 30
+#define r31 31
+
+/* Condition register fields cr0-cr7. */
+#define cr0 0
+#define cr1 1
+#define cr2 2
+#define cr3 3
+#define cr4 4
+#define cr5 5
+#define cr6 6
+#define cr7 7
+
+/* Floating point registers f0-f31. */
+#define f0 0
+#define f1 1
+#define f2 2
+#define f3 3
+#define f4 4
+#define f5 5
+#define f6 6
+#define f7 7
+#define f8 8
+#define f9 9
+#define f10 10
+#define f11 11
+#define f12 12
+#define f13 13
+#define f14 14
+#define f15 15
+#define f16 16
+#define f17 17
+#define f18 18
+#define f19 19
+#define f20 20
+#define f21 21
+#define f22 22
+#define f23 23
+#define f24 24
+#define f25 25
+#define f26 26
+#define f27 27
+#define f28 28
+#define f29 29
+#define f30 30
+#define f31 31
+
+/*
+ * Extended register names f32-f63, only available when __VSX__ is defined.
+ * Each name expands to the register number it is named after (fN -> N).
+ */
+#ifdef __VSX__
+#define f32 32
+#define f33 33
+#define f34 34
+#define f35 35
+#define f36 36
+#define f37 37
+#define f38 38
+#define f39 39
+#define f40 40
+#define f41 41
+#define f42 42
+#define f43 43
+#define f44 44
+#define f45 45
+#define f46 46
+#define f47 47
+#define f48 48
+#define f49 49
+#define f50 50 /* was 30: typo that made f50 alias register 30 */
+#define f51 51
+#define f52 52
+#define f53 53
+#define f54 54
+#define f55 55
+#define f56 56
+#define f57 57
+#define f58 58
+#define f59 59
+#define f60 60
+#define f61 61
+#define f62 62
+#define f63 63
+#endif
+
+/*
+ * Vector register names v0-v31, only available when __ALTIVEC__ is defined.
+ * Each name expands to the bare register number.
+ */
+#ifdef __ALTIVEC__
+#define v0 0
+#define v1 1
+#define v2 2
+#define v3 3
+#define v4 4
+#define v5 5
+#define v6 6
+#define v7 7
+#define v8 8
+#define v9 9
+#define v10 10
+#define v11 11
+#define v12 12
+#define v13 13
+#define v14 14
+#define v15 15
+#define v16 16
+#define v17 17
+#define v18 18
+#define v19 19
+#define v20 20
+#define v21 21
+#define v22 22
+#define v23 23
+#define v24 24
+#define v25 25
+#define v26 26
+#define v27 27
+#define v28 28
+#define v29 29
+#define v30 30
+#define v31 31
+#endif
+
+/*
+ * VSX register names vs0-vs63, only available when __VSX__ is defined.
+ * Each name expands to the register number it is named after (vsN -> N).
+ */
+#ifdef __VSX__
+#define vs0 0
+#define vs1 1
+#define vs2 2
+#define vs3 3
+#define vs4 4
+#define vs5 5
+#define vs6 6
+#define vs7 7
+#define vs8 8
+#define vs9 9
+#define vs10 10
+#define vs11 11
+#define vs12 12
+#define vs13 13
+#define vs14 14
+#define vs15 15
+#define vs16 16
+#define vs17 17
+#define vs18 18
+#define vs19 19
+#define vs20 20
+#define vs21 21
+#define vs22 22
+#define vs23 23
+#define vs24 24
+#define vs25 25
+#define vs26 26
+#define vs27 27
+#define vs28 28
+#define vs29 29
+#define vs30 30
+#define vs31 31
+#define vs32 32
+#define vs33 33
+#define vs34 34
+#define vs35 35
+#define vs36 36
+#define vs37 37
+#define vs38 38
+#define vs39 39
+#define vs40 40
+#define vs41 41
+#define vs42 42
+#define vs43 43
+#define vs44 44
+#define vs45 45
+#define vs46 46
+#define vs47 47
+#define vs48 48
+#define vs49 49
+#define vs50 50 /* was 30: typo that made vs50 alias vs30 */
+#define vs51 51
+#define vs52 52
+#define vs53 53
+#define vs54 54
+#define vs55 55
+#define vs56 56
+#define vs57 57
+#define vs58 58
+#define vs59 59
+#define vs60 60
+#define vs61 61
+#define vs62 62
+#define vs63 63
+#endif
+
+/*
+ * Macros to glue together two tokens.
+ *
+ * XGLUE does the actual pasting: with "##" when __STDC__ is defined, and
+ * with an empty comment between the tokens otherwise. GLUE goes through
+ * XGLUE so that its arguments are macro-expanded before they are pasted.
+ */
+
+#ifdef __STDC__
+#define XGLUE(a,b) a##b
+#else
+#define XGLUE(a,b) a/**/b
+#endif
+
+#define GLUE(a,b) XGLUE(a,b)
+
+/*
+ * Macros to begin and end a function written in assembler. If -mcall-aixdesc
+ * or -mcall-nt, create a function descriptor with the given name, and create
+ * the real function with one or two leading periods respectively.
+ */
+
+/*
+ * Variant 1: __powerpc64__ with _CALL_ELF == 2.
+ * FUNC_START emits the function label followed by an addis/addi pair that
+ * sets register 2 from register 12 and .TOC., then a .localentry directive
+ * placed after that pair.
+ */
+#if defined(__powerpc64__) && _CALL_ELF == 2
+
+/* Defining "toc" above breaks @toc in assembler code. */
+#undef toc
+
+#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name)
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#define FUNC_START(name) \
+ .type FUNC_NAME(name),@function; \
+ .globl FUNC_NAME(name); \
+FUNC_NAME(name): \
+0: addis 2,12,(.TOC.-0b)@ha; \
+ addi 2,2,(.TOC.-0b)@l; \
+ .localentry FUNC_NAME(name),.-FUNC_NAME(name)
+
+/* Same as FUNC_START, and additionally marks the symbol .hidden. */
+#define HIDDEN_FUNC(name) \
+ FUNC_START(name) \
+ .hidden FUNC_NAME(name);
+
+/* Sets the symbol size to the distance from the function label to here. */
+#define FUNC_END(name) \
+ .size FUNC_NAME(name),.-FUNC_NAME(name)
+
+/*
+ * Variant 2: any other __powerpc64__ configuration.
+ * "name" labels a three-quad entry in the .opd section (address of the
+ * code, .TOC.@tocbase, 0); the code itself is labelled ".name".
+ */
+#elif defined (__powerpc64__)
+
+#define FUNC_NAME(name) GLUE(.,name)
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#define FUNC_START(name) \
+ .section ".opd","aw"; \
+name: \
+ .quad GLUE(.,name); \
+ .quad .TOC.@tocbase; \
+ .quad 0; \
+ .previous; \
+ .type GLUE(.,name),@function; \
+ .globl name; \
+ .globl GLUE(.,name); \
+GLUE(.,name):
+
+/* Same as FUNC_START, with both "name" and ".name" marked .hidden. */
+#define HIDDEN_FUNC(name) \
+ FUNC_START(name) \
+ .hidden name; \
+ .hidden GLUE(.,name);
+
+/* Drops an end label ".Lname" and sizes ".name" up to it. */
+#define FUNC_END(name) \
+GLUE(.L,name): \
+ .size GLUE(.,name),GLUE(.L,name)-GLUE(.,name)
+
+/*
+ * Variant 3: _CALL_AIXDESC.
+ * "name" labels a three-long entry (address of the code,
+ * _GLOBAL_OFFSET_TABLE_, 0) placed in DESC_SECTION; the code itself is
+ * labelled ".name". DESC_SECTION is ".got2" when _RELOCATABLE is defined
+ * and ".got1" otherwise.
+ */
+#elif defined(_CALL_AIXDESC)
+
+#ifdef _RELOCATABLE
+#define DESC_SECTION ".got2"
+#else
+#define DESC_SECTION ".got1"
+#endif
+
+#define FUNC_NAME(name) GLUE(.,name)
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#define FUNC_START(name) \
+ .section DESC_SECTION,"aw"; \
+name: \
+ .long GLUE(.,name); \
+ .long _GLOBAL_OFFSET_TABLE_; \
+ .long 0; \
+ .previous; \
+ .type GLUE(.,name),@function; \
+ .globl name; \
+ .globl GLUE(.,name); \
+GLUE(.,name):
+
+/* Same as FUNC_START, with both "name" and ".name" marked .hidden. */
+#define HIDDEN_FUNC(name) \
+ FUNC_START(name) \
+ .hidden name; \
+ .hidden GLUE(.,name);
+
+/* Drops an end label ".Lname" and sizes ".name" up to it. */
+#define FUNC_END(name) \
+GLUE(.L,name): \
+ .size GLUE(.,name),GLUE(.L,name)-GLUE(.,name)
+
+/*
+ * Variant 4: everything else.
+ * The function is a plain global label (with __USER_LABEL_PREFIX__
+ * prepended). JUMP_TARGET appends @plt to the name when __PIC__ or
+ * __pic__ is defined.
+ */
+#else
+
+#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name)
+#if defined __PIC__ || defined __pic__
+#define JUMP_TARGET(name) FUNC_NAME(name@plt)
+#else
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#endif
+#define FUNC_START(name) \
+ .type FUNC_NAME(name),@function; \
+ .globl FUNC_NAME(name); \
+FUNC_NAME(name):
+
+/* Same as FUNC_START, and additionally marks the symbol .hidden. */
+#define HIDDEN_FUNC(name) \
+ FUNC_START(name) \
+ .hidden FUNC_NAME(name);
+
+/* Drops an end label ".Lname" and sizes the function up to it. */
+#define FUNC_END(name) \
+GLUE(.L,name): \
+ .size FUNC_NAME(name),GLUE(.L,name)-FUNC_NAME(name)
+#endif
+
+/*
+ * CFI_* wrappers, only defined when IN_GCC is set. Each expands to the
+ * matching .cfi_* directive when HAVE_GAS_CFI_DIRECTIVE is defined, and to
+ * nothing otherwise, so callers can use them unconditionally.
+ */
+#ifdef IN_GCC
+/* For HAVE_GAS_CFI_DIRECTIVE. */
+#include "auto-host.h"
+
+#ifdef HAVE_GAS_CFI_DIRECTIVE
+# define CFI_STARTPROC .cfi_startproc
+# define CFI_ENDPROC .cfi_endproc
+# define CFI_OFFSET(reg, off) .cfi_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg
+# define CFI_RESTORE(reg) .cfi_restore reg
+#else
+# define CFI_STARTPROC
+# define CFI_ENDPROC
+# define CFI_OFFSET(reg, off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_RESTORE(reg)
+#endif
+#endif
+
+/*
+ * On Linux, emit an empty .note.GNU-stack section and then switch back to
+ * the previously active section with .previous.
+ * NOTE(review): presumably this marks objects including this header as not
+ * needing an executable stack - confirm against the linker documentation.
+ */
+#if defined __linux__
+ .section .note.GNU-stack
+ .previous
+#endif
+/*
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of either:
+ *
+ * a) the GNU General Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your option)
+ * any later version, or
+ * b) the Apache License, Version 2.0
+ */
+/*
+ * Hand-assembled encodings for vpmsumw, vpmsumd, mfvsrd and mtvsrd.
+ * Each instruction macro emits a raw .long built from a base opcode OR-ed
+ * with the register fields.
+ * NOTE(review): presumably this lets the sources build with assemblers
+ * that do not know these mnemonics - confirm.
+ */
+#ifndef __OPCODES_H
+#define __OPCODES_H
+
+/* 5-bit register number placed at bit 16 (RA) or bit 11 (RB). */
+#define __PPC_RA(a) (((a) & 0x1f) << 16)
+#define __PPC_RB(b) (((b) & 0x1f) << 11)
+/*
+ * 6-bit register number: the low five bits go in the field at bit 16 (XA),
+ * bit 11 (XB) or bit 21 (XS/XT); bit 5 of the number is moved down to
+ * instruction bit 2 (XA), bit 1 (XB) or bit 0 (XS/XT).
+ */
+#define __PPC_XA(a) ((((a) & 0x1f) << 16) | (((a) & 0x20) >> 3))
+#define __PPC_XB(b) ((((b) & 0x1f) << 11) | (((b) & 0x20) >> 4))
+#define __PPC_XS(s) ((((s) & 0x1f) << 21) | (((s) & 0x20) >> 5))
+#define __PPC_XT(s) __PPC_XS(s)
+/* Operand-field combinations used by the instruction macros below. */
+#define VSX_XX3(t, a, b) (__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b))
+#define VSX_XX1(s, a, b) (__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b))
+
+/* Base opcodes with all register fields zero. */
+#define PPC_INST_VPMSUMW 0x10000488
+#define PPC_INST_VPMSUMD 0x100004c8
+#define PPC_INST_MFVSRD 0x7c000066
+#define PPC_INST_MTVSRD 0x7c000166
+
+/*
+ * VPMSUMW/VPMSUMD(t, a, b): target t, sources a and b.
+ * MFVRD(a, t) / MTVRD(t, a): "a" is encoded in the RA field and "t" is a
+ * vector register number; 32 is added to it before it is encoded as a
+ * 6-bit register number.
+ */
+#define VPMSUMW(t, a, b) .long PPC_INST_VPMSUMW | VSX_XX3((t), a, b)
+#define VPMSUMD(t, a, b) .long PPC_INST_VPMSUMD | VSX_XX3((t), a, b)
+#define MFVRD(a, t) .long PPC_INST_MFVSRD | VSX_XX1((t)+32, a, 0)
+#define MTVRD(t, a) .long PPC_INST_MTVSRD | VSX_XX1((t)+32, a, 0)
+
+#endif
/* Copyright (C) 2017 International Business Machines Corp.
* All rights reserved.
*