From e9eb641fc7714c1812fb3ee43fd9909991f79c3c Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Wed, 19 Mar 2014 00:12:43 +0100 Subject: [PATCH] remove gf-complete / jerasure sub modules and add the source files in the tree. Signed-off-by: Loic Dachary --- .gitmodules | 8 - src/erasure-code/jerasure/Makefile.am | 3 - src/erasure-code/jerasure/gf-complete | 1 - .../gf-complete/include/gf_complete.h | 192 ++ .../jerasure/gf-complete/include/gf_general.h | 61 + .../jerasure/gf-complete/include/gf_int.h | 206 ++ .../jerasure/gf-complete/include/gf_method.h | 20 + .../jerasure/gf-complete/include/gf_rand.h | 22 + .../jerasure/gf-complete/src/gf.c | 1039 +++++++ .../jerasure/gf-complete/src/gf_general.c | 538 ++++ .../jerasure/gf-complete/src/gf_method.c | 185 ++ .../jerasure/gf-complete/src/gf_rand.c | 80 + .../jerasure/gf-complete/src/gf_w128.c | 1769 +++++++++++ .../jerasure/gf-complete/src/gf_w16.c | 2489 +++++++++++++++ .../jerasure/gf-complete/src/gf_w32.c | 2741 +++++++++++++++++ .../jerasure/gf-complete/src/gf_w4.c | 2081 +++++++++++++ .../jerasure/gf-complete/src/gf_w64.c | 2244 ++++++++++++++ .../jerasure/gf-complete/src/gf_w8.c | 2456 +++++++++++++++ .../jerasure/gf-complete/src/gf_wgen.c | 1019 ++++++ src/erasure-code/jerasure/jerasure | 1 - .../jerasure/jerasure/include/cauchy.h | 45 + .../jerasure/jerasure/include/galois.h | 99 + .../jerasure/jerasure/include/jerasure.h | 294 ++ .../jerasure/jerasure/include/liberation.h | 47 + .../jerasure/jerasure/include/reed_sol.h | 50 + .../jerasure/jerasure/src/cauchy.c | 405 +++ .../jerasure/jerasure/src/galois.c | 353 +++ .../jerasure/jerasure/src/jerasure.c | 1387 +++++++++ .../jerasure/jerasure/src/liberation.c | 262 ++ .../jerasure/jerasure/src/reed_sol.c | 301 ++ 30 files changed, 20385 insertions(+), 13 deletions(-) delete mode 160000 src/erasure-code/jerasure/gf-complete create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_complete.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_general.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_int.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_method.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_rand.h create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_general.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_method.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_rand.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w128.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w16.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w32.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w4.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w64.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w8.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_wgen.c delete mode 160000 src/erasure-code/jerasure/jerasure create mode 100644 src/erasure-code/jerasure/jerasure/include/cauchy.h create mode 100644 src/erasure-code/jerasure/jerasure/include/galois.h create mode 100644 src/erasure-code/jerasure/jerasure/include/jerasure.h create mode 100644 src/erasure-code/jerasure/jerasure/include/liberation.h create mode 100644 src/erasure-code/jerasure/jerasure/include/reed_sol.h create mode 100644 src/erasure-code/jerasure/jerasure/src/cauchy.c create mode 100644 
src/erasure-code/jerasure/jerasure/src/galois.c create mode 100644 src/erasure-code/jerasure/jerasure/src/jerasure.c create mode 100644 src/erasure-code/jerasure/jerasure/src/liberation.c create mode 100644 src/erasure-code/jerasure/jerasure/src/reed_sol.c diff --git a/.gitmodules b/.gitmodules index 5ed6ff5d028e..55ec09dbd47e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,11 +7,3 @@ [submodule "src/civetweb"] path = src/civetweb url = git://github.com/ceph/civetweb -[submodule "src/erasure-code/jerasure/gf-complete"] - path = src/erasure-code/jerasure/gf-complete - url = https://github.com/ceph/gf-complete.git - branch = v1 -[submodule "src/erasure-code/jerasure/jerasure"] - path = src/erasure-code/jerasure/jerasure - url = https://github.com/ceph/jerasure.git - branch = v2 diff --git a/src/erasure-code/jerasure/Makefile.am b/src/erasure-code/jerasure/Makefile.am index e82a7eb4c1aa..ff8114b80334 100644 --- a/src/erasure-code/jerasure/Makefile.am +++ b/src/erasure-code/jerasure/Makefile.am @@ -7,15 +7,12 @@ libec_jerasure_la_SOURCES = \ erasure-code/jerasure/jerasure/src/reed_sol.c \ erasure-code/jerasure/gf-complete/src/gf_wgen.c \ erasure-code/jerasure/gf-complete/src/gf_method.c \ - erasure-code/jerasure/gf-complete/src/gf_int.h \ erasure-code/jerasure/gf-complete/src/gf_w16.c \ erasure-code/jerasure/gf-complete/src/gf.c \ - erasure-code/jerasure/gf-complete/src/gf_rand.h \ erasure-code/jerasure/gf-complete/src/gf_w32.c \ erasure-code/jerasure/gf-complete/src/gf_w64.c \ erasure-code/jerasure/gf-complete/src/gf_w128.c \ erasure-code/jerasure/gf-complete/src/gf_general.c \ - erasure-code/jerasure/gf-complete/src/gf_general.h \ erasure-code/jerasure/gf-complete/src/gf_w4.c \ erasure-code/jerasure/gf-complete/src/gf_rand.c \ erasure-code/jerasure/gf-complete/src/gf_w8.c \ diff --git a/src/erasure-code/jerasure/gf-complete b/src/erasure-code/jerasure/gf-complete deleted file mode 160000 index a8ca2fb4ba0b..000000000000 --- a/src/erasure-code/jerasure/gf-complete +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a8ca2fb4ba0b5896474e1d809aae7550b79eef10 diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_complete.h b/src/erasure-code/jerasure/gf-complete/include/gf_complete.h new file mode 100644 index 000000000000..57b439e27ee5 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_complete.h @@ -0,0 +1,192 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_complete.h + * + * The main include file for gf_complete. + */ + +#ifndef _GF_COMPLETE_H_ +#define _GF_COMPLETE_H_ +#include + +#ifdef INTEL_SSE4 + #include +#endif + +#ifdef INTEL_SSSE3 + #include +#endif + +#ifdef INTEL_SSE2 + #include +#endif + +#ifdef INTEL_SSE4_PCLMUL + #include +#endif + + +/* These are the different ways to perform multiplication. + Not all are implemented for all values of w. + See the paper for an explanation of how they work. */ + +typedef enum {GF_MULT_DEFAULT, + GF_MULT_SHIFT, + GF_MULT_CARRY_FREE, + GF_MULT_GROUP, + GF_MULT_BYTWO_p, + GF_MULT_BYTWO_b, + GF_MULT_TABLE, + GF_MULT_LOG_TABLE, + GF_MULT_LOG_ZERO, + GF_MULT_LOG_ZERO_EXT, + GF_MULT_SPLIT_TABLE, + GF_MULT_COMPOSITE } gf_mult_type_t; + +/* These are the different ways to optimize region + operations. They are bits because you can compose them. + Certain optimizations only apply to certain gf_mult_type_t's. 
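   For example (illustrative only, not a line from this header), a caller
   wanting the SSE implementation together with the alternate mapping would
   OR the bits into a single region_type value:

     int region_type = GF_REGION_SSE | GF_REGION_ALTMAP;   // compose region flags
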
+ Again, please see documentation for how to use these */ + +#define GF_REGION_DEFAULT (0x0) +#define GF_REGION_DOUBLE_TABLE (0x1) +#define GF_REGION_QUAD_TABLE (0x2) +#define GF_REGION_LAZY (0x4) +#define GF_REGION_SSE (0x8) +#define GF_REGION_NOSSE (0x10) +#define GF_REGION_ALTMAP (0x20) +#define GF_REGION_CAUCHY (0x40) + +typedef uint32_t gf_region_type_t; + +/* These are different ways to implement division. + Once again, it's best to use "DEFAULT". However, + there are times when you may want to experiment + with the others. */ + +typedef enum { GF_DIVIDE_DEFAULT, + GF_DIVIDE_MATRIX, + GF_DIVIDE_EUCLID } gf_division_type_t; + +/* We support w=4,8,16,32,64 and 128 with their own data types and + operations for multiplication, division, etc. We also support + a "gen" type so that you can do general gf arithmetic for any + value of w from 1 to 32. You can perform a "region" operation + on these if you use "CAUCHY" as the mapping. + */ + +typedef uint32_t gf_val_32_t; +typedef uint64_t gf_val_64_t; +typedef uint64_t *gf_val_128_t; + +extern int _gf_errno; +extern void gf_error(); + +typedef struct gf *GFP; + +typedef union gf_func_a_b { + gf_val_32_t (*w32) (GFP gf, gf_val_32_t a, gf_val_32_t b); + gf_val_64_t (*w64) (GFP gf, gf_val_64_t a, gf_val_64_t b); + void (*w128)(GFP gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t c); +} gf_func_a_b; + +typedef union { + gf_val_32_t (*w32) (GFP gf, gf_val_32_t a); + gf_val_64_t (*w64) (GFP gf, gf_val_64_t a); + void (*w128)(GFP gf, gf_val_128_t a, gf_val_128_t b); +} gf_func_a; + +typedef union { + void (*w32) (GFP gf, void *src, void *dest, gf_val_32_t val, int bytes, int add); + void (*w64) (GFP gf, void *src, void *dest, gf_val_64_t val, int bytes, int add); + void (*w128)(GFP gf, void *src, void *dest, gf_val_128_t val, int bytes, int add); +} gf_region; + +typedef union { + gf_val_32_t (*w32) (GFP gf, void *start, int bytes, int index); + gf_val_64_t (*w64) (GFP gf, void *start, int bytes, int index); + void (*w128)(GFP gf, void *start, int bytes, int index, gf_val_128_t rv); +} gf_extract; + +typedef struct gf { + gf_func_a_b multiply; + gf_func_a_b divide; + gf_func_a inverse; + gf_region multiply_region; + gf_extract extract_word; + void *scratch; +} gf_t; + +/* Initializes the GF to defaults. Pass it a pointer to a gf_t. + Returns 0 on failure, 1 on success. */ + +extern int gf_init_easy(GFP gf, int w); + +/* Initializes the GF changing the defaults. + Returns 0 on failure, 1 on success. + Pass it a pointer to a gf_t. + For mult_type and divide_type, use one of gf_mult_type_t gf_divide_type_t . + For region_type, OR together the GF_REGION_xxx's defined above. + Use 0 as prim_poly for defaults. Otherwise, the leading 1 is optional. + Use NULL for scratch_memory to have init_hard allocate memory. Otherwise, + use gf_scratch_size() to determine how big scratch_memory has to be. + */ + +extern int gf_init_hard(GFP gf, + int w, + int mult_type, + int region_type, + int divide_type, + uint64_t prim_poly, + int arg1, + int arg2, + GFP base_gf, + void *scratch_memory); + +/* Determines the size for scratch_memory. + Returns 0 on failure and non-zero on success. */ + +extern int gf_scratch_size(int w, + int mult_type, + int region_type, + int divide_type, + int arg1, + int arg2); + +/* This reports the gf_scratch_size of a gf_t that has already been created */ + +extern int gf_size(GFP gf); + +/* Frees scratch memory if gf_init_easy/gf_init_hard called malloc. + If recursive = 1, then it calls itself recursively on base_gf. 
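   As a rough illustration of the API declared in this header (the buffer
   names and sizes below are assumptions, not part of gf-complete), a typical
   lifecycle is: initialize, multiply single words, multiply a region, free:

     #include "gf_complete.h"
     #include <stdlib.h>

     int example_lifecycle(void)
     {
       gf_t gf;
       gf_val_32_t p;
       int bytes = 1024;
       uint8_t *src = calloc(1, bytes), *dst = calloc(1, bytes);

       if (!gf_init_easy(&gf, 8)) { gf_error(); return 0; }   // GF(2^8), all defaults

       p = gf.multiply.w32(&gf, 3, 7);      // one product in GF(2^8)
       (void) p;

       // dst = 5 * src over the whole region; the final 0 overwrites dst,
       // a 1 would XOR the products into dst instead
       gf.multiply_region.w32(&gf, src, dst, 5, bytes, 0);

       gf_free(&gf, 0);                     // 0: do not recurse into a base field
       free(src);
       free(dst);
       return 1;
     }
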
*/ + +extern int gf_free(GFP gf, int recursive); + +/* This is support for inline single multiplications and divisions. + I know it's yucky, but if you've got to be fast, you've got to be fast. + We support inlining for w=4, w=8 and w=16. + + To use inline multiplication and division with w=4 or 8, you should use the + default gf_t, or one with a single table. Otherwise, gf_w4/8_get_mult_table() + will return NULL. Similarly, with w=16, the gf_t must be LOG */ + +uint8_t *gf_w4_get_mult_table(GFP gf); +uint8_t *gf_w4_get_div_table(GFP gf); + +#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|(b)]) + +uint8_t *gf_w8_get_mult_table(GFP gf); +uint8_t *gf_w8_get_div_table(GFP gf); + +#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) (a))<<8)|(b)]) + +uint16_t *gf_w16_get_log_table(GFP gf); +uint16_t *gf_w16_get_mult_alog_table(GFP gf); +uint16_t *gf_w16_get_div_alog_table(GFP gf); + +#define GF_W16_INLINE_MULT(log, alog, a, b) ((a) == 0 || (b) == 0) ? 0 : (alog[(uint32_t)log[a]+(uint32_t)log[b]]) +#define GF_W16_INLINE_DIV(log, alog, a, b) ((a) == 0 || (b) == 0) ? 0 : (alog[(int)log[a]-(int)log[b]]) +#endif diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_general.h b/src/erasure-code/jerasure/gf-complete/include/gf_general.h new file mode 100644 index 000000000000..9a5de529dc00 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_general.h @@ -0,0 +1,61 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_general.h + * + * This file has helper routines for doing basic GF operations with any + * legal value of w. The problem is that w <= 32, w=64 and w=128 all have + * different data types, which is a pain. The procedures in this file try + * to alleviate that pain. They are used in gf_unit and gf_time. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" + +typedef union { + uint32_t w32; + uint64_t w64; + uint64_t w128[2]; +} gf_general_t; + +void gf_general_set_zero(gf_general_t *v, int w); +void gf_general_set_one(gf_general_t *v, int w); +void gf_general_set_two(gf_general_t *v, int w); + +int gf_general_is_zero(gf_general_t *v, int w); +int gf_general_is_one(gf_general_t *v, int w); +int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w); + +void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex); +int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex); + +void gf_general_set_random(gf_general_t *v, int w, int zero_ok); + +void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); +void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); +void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); +void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b); + +void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, + void *ra, void *rb, + int bytes, int xor); + +void gf_general_do_region_check(gf_t *gf, gf_general_t *a, + void *orig_a, void *orig_target, void *final_target, + int bytes, int xor); + + +/* Which is M, D or I for multiply, divide or inverse. 
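   As a hedged sketch only (assuming an already-initialized gf_t named gf and
   malloc'd buffers, neither of which this header provides), the timing
   helpers are driven like:

     void *ra = malloc(65536), *rb = malloc(65536);
     gf_general_set_up_single_timing_test(8, ra, rb, 65536);
     gf_general_do_single_timing_test(&gf, ra, rb, 65536, 'M');   // 'D' or 'I' select divide/inverse
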
*/ + +void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size); +int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char which); diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_int.h b/src/erasure-code/jerasure/gf-complete/include/gf_int.h new file mode 100644 index 000000000000..2ce3d9817358 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_int.h @@ -0,0 +1,206 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_int.h + * + * Internal code for Galois field routines. This is not meant for + * users to include, but for the internal GF files to use. + */ + +#pragma once + +#include "gf_complete.h" + +#include + +extern void timer_start (double *t); +extern double timer_split (const double *t); +extern void galois_fill_random (void *buf, int len, unsigned int seed); + +#define GF_SSE2 0x01 +#define GF_SSSE3 0x02 +#define GF_SSE4 0x04 +#define GF_SSE4_PCLMUL 0x08 + +typedef struct { + int mult_type; + int region_type; + int divide_type; + int w; + uint64_t prim_poly; + int free_me; + int arg1; + int arg2; + gf_t *base_gf; + void *private; + uint32_t sse; +} gf_internal_t; + +extern int gf_w4_init (gf_t *gf); +extern int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_w8_init (gf_t *gf); +extern int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_w16_init (gf_t *gf); +extern int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_w32_init (gf_t *gf); +extern int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_w64_init (gf_t *gf); +extern int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_w128_init (gf_t *gf); +extern int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_wgen_init (gf_t *gf); +extern int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2); + +void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor); +gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index); + +extern void gf_alignment_error(char *s, int a); + +extern uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp); + +/* This returns the correct default for prim_poly when base is used as the base + field for COMPOSITE. It returns 0 if we don't have a default prim_poly. */ + +extern uint64_t gf_composite_get_default_poly(gf_t *base); + +/* This structure lets you define a region multiply. It helps because you can handle + unaligned portions of the data with the procedures below, which really cleans + up the code. */ + +typedef struct { + gf_t *gf; + void *src; + void *dest; + int bytes; + uint64_t val; + int xor; + int align; /* The number of bytes to which to align. */ + void *s_start; /* The start and the top of the aligned region. */ + void *d_start; + void *s_top; + void *d_top; +} gf_region_data; + +/* This lets you set up one of these in one call. It also sets the start/top pointers. 
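   To sketch the intended usage (the routine name and the simple w=16 loop
   below are illustrative, not part of gf-complete), a region multiply
   typically sets up a gf_region_data, handles the unaligned head and tail
   with the helpers declared below, and runs its fast loop between s_start
   and s_top:

     static void example_w16_multiply_region(gf_t *gf, void *src, void *dest,
                                             gf_val_32_t val, int bytes, int xor)
     {
       gf_region_data rd;
       uint16_t *s16, *d16, *top;

       if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
       if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

       // Align the middle of the region on 8-byte chunks.
       gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
       gf_do_initial_region_alignment(&rd);   // words before s_start, one at a time

       s16 = (uint16_t *) rd.s_start;
       d16 = (uint16_t *) rd.d_start;
       top = (uint16_t *) rd.s_top;
       while (s16 < top) {                    // deliberately simple word-by-word loop
         *d16 = (xor ? *d16 : 0) ^ gf->multiply.w32(gf, val, *s16);
         s16++;
         d16++;
       }

       gf_do_final_region_alignment(&rd);     // words after s_top, one at a time
     }
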
*/ + +void gf_set_region_data(gf_region_data *rd, + gf_t *gf, + void *src, + void *dest, + int bytes, + uint64_t val, + int xor, + int align); + +/* This performs gf->multiply.32() on all of the unaligned bytes in the beginning of the region */ + +extern void gf_do_initial_region_alignment(gf_region_data *rd); + +/* This performs gf->multiply.32() on all of the unaligned bytes in the end of the region */ + +extern void gf_do_final_region_alignment(gf_region_data *rd); + +extern void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base); + +extern void gf_multby_zero(void *dest, int bytes, int xor); +extern void gf_multby_one(void *src, void *dest, int bytes, int xor); + +typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ + GF_E_MDEFREG, /* Reg != Default && Mult == Default */ + GF_E_MDEFARG, /* Args != Default && Mult == Default */ + GF_E_DIVCOMP, /* Mult == Composite && Div != Default */ + GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */ + GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */ + GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */ + GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */ + GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/ + GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */ + GF_E_ARG2SET, /* Arg2 != 0 && Mult \notin SPLIT/GROUP */ + GF_E_MATRIXW, /* Div == MATRIX && w > 32 */ + GF_E_BAD___W, /* Illegal w */ + GF_E_DOUBLET, /* Reg == DOUBLE && Mult != TABLE */ + GF_E_DOUBLEW, /* Reg == DOUBLE && w \notin {4,8} */ + GF_E_DOUBLEJ, /* Reg == DOUBLE && other Reg */ + GF_E_DOUBLEL, /* Reg == DOUBLE & LAZY but w = 4 */ + GF_E_QUAD__T, /* Reg == QUAD && Mult != TABLE */ + GF_E_QUAD__W, /* Reg == QUAD && w != 4 */ + GF_E_QUAD__J, /* Reg == QUAD && other Reg */ + GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/ + GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */ + GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */ + GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */ + GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */ + GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */ + GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */ + GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */ + GF_E_LOGBADW, /* Mult == LOGx, w too big*/ + GF_E_LOG___J, /* Mult == LOGx, && Reg == SSE|ALTMAP|NOSSE */ + GF_E_ZERBADW, /* Mult == LOG_ZERO, w \notin {8,16} */ + GF_E_ZEXBADW, /* Mult == LOG_ZERO_EXT, w != 8 */ + GF_E_LOGPOLY, /* Mult == LOG & poly not primitive */ + GF_E_GR_ARGX, /* Mult == GROUP, Bad arg1/2 */ + GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */ + GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */ + GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */ + GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */ + GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */ + GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */ + GF_E_TABLE_W, /* Mult == TABLE, w too big */ + GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */ + GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */ + GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */ + GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */ + GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */ + GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */ + GF_E_SP128_A, /* Mult == SPLIT, w=128, SSE only with 4/128 */ + GF_E_SP128_S, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */ + GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128) */ + GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */ + GF_E_SP_16_A, /* Mult == SPLIT, 
w=16, ALTMAP only with 4/16 */ + GF_E_SP_16_S, /* Mult == SPLIT, w=16, SSE only with 4/16 */ + GF_E_SP_32AR, /* Mult == SPLIT, w=32, Bad arg1/arg2 */ + GF_E_SP_32AS, /* Mult == SPLIT, w=32, ALTMAP requires SSE */ + GF_E_SP_32_A, /* Mult == SPLIT, w=32, ALTMAP only with 4/32 */ + GF_E_SP_32_S, /* Mult == SPLIT, w=32, SSE only with 4/32 */ + GF_E_SP_64AR, /* Mult == SPLIT, w=64, Bad arg1/arg2 */ + GF_E_SP_64AS, /* Mult == SPLIT, w=64, ALTMAP requires SSE */ + GF_E_SP_64_A, /* Mult == SPLIT, w=64, ALTMAP only with 4/64 */ + GF_E_SP_64_S, /* Mult == SPLIT, w=64, SSE only with 4/64 */ + GF_E_SP_8_AR, /* Mult == SPLIT, w=8, Bad arg1/arg2 */ + GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */ + GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */ + GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */ + GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */ + GF_E_COMP__W, /* Mult == COMP, Bad w. */ + GF_E_UNKFLAG, /* Unknown flag in create_from.... */ + GF_E_UNKNOWN, /* Unknown mult_type. */ + GF_E_UNK_REG, /* Unknown region_type. */ + GF_E_UNK_DIV, /* Unknown divide_type. */ + GF_E_CFM___W, /* Mult == CFM, Bad w. */ + GF_E_CFM4POL, /* Mult == CFM & Prim Poly has high bits set. */ + GF_E_CFM8POL, /* Mult == CFM & Prim Poly has high bits set. */ + GF_E_CF16POL, /* Mult == CFM & Prim Poly has high bits set. */ + GF_E_CF32POL, /* Mult == CFM & Prim Poly has high bits set. */ + GF_E_CF64POL, /* Mult == CFM & Prim Poly has high bits set. */ + GF_E_FEWARGS, /* Too few args in argc/argv. */ + GF_E_BADPOLY, /* Bad primitive polynomial -- too many bits set. */ + GF_E_COMP_PP, /* Bad primitive polynomial -- bigger than sub-field. */ + GF_E_COMPXPP, /* Can't derive a default pp for composite field. */ + GF_E_BASE__W, /* Composite -- Base field is the wrong size. */ + GF_E_TWOMULT, /* In create_from... two -m's. */ + GF_E_TWO_DIV, /* In create_from... two -d's. */ + GF_E_POLYSPC, /* Bad numbera after -p. */ + GF_E_SPLITAR, /* Ran out of arguments in SPLIT */ + GF_E_SPLITNU, /* Arguments not integers in SPLIT. */ + GF_E_GROUPAR, /* Ran out of arguments in GROUP */ + GF_E_GROUPNU, /* Arguments not integers in GROUP. */ + GF_E_DEFAULT } gf_error_type_t; + diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_method.h b/src/erasure-code/jerasure/gf-complete/include/gf_method.h new file mode 100644 index 000000000000..880b349676dc --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_method.h @@ -0,0 +1,20 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_method.h + * + * Parses argv to figure out the flags and arguments. Creates the gf. + */ + +#pragma once + +#include "gf_complete.h" + +/* Parses argv starting at "starting". + + Returns 0 on failure. + On success, it returns one past the last argument it read in argv. */ + +extern int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting); diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_rand.h b/src/erasure-code/jerasure/gf-complete/include/gf_rand.h new file mode 100644 index 000000000000..24294adc704f --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_rand.h @@ -0,0 +1,22 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
+ * + * gf_rand.h + * + * Random number generation, using the "Mother of All" random number generator. */ + +#pragma once +#include +#include +#include + +/* These are all pretty self-explanatory */ +uint32_t MOA_Random_32(); +uint64_t MOA_Random_64(); +void MOA_Random_128(uint64_t *x); +uint32_t MOA_Random_W(int w, int zero_ok); +void MOA_Fill_Random_Region (void *reg, int size); /* reg should be aligned to 4 bytes, but + size can be anything. */ +void MOA_Seed(uint32_t seed); diff --git a/src/erasure-code/jerasure/gf-complete/src/gf.c b/src/erasure-code/jerasure/gf-complete/src/gf.c new file mode 100644 index 000000000000..0e475b93408e --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf.c @@ -0,0 +1,1039 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf.c + * + * Generic routines for Galois fields + */ + +#include "gf_int.h" +#include +#include + +int _gf_errno = GF_E_DEFAULT; + +void gf_error() +{ + char *s; + + switch(_gf_errno) { + case GF_E_DEFAULT: s = "No Error."; break; + case GF_E_TWOMULT: s = "Cannot specify two -m's."; break; + case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break; + case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break; + case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break; + case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break; + case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break; + case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break; + case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break; + case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break; + case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break; + case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break; + case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break; + case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break; + case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break; + case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break; + case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. 
(Prim-poly & 0xfffe000000000000ULL) must equal 0."; break; + case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break; + case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break; + case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break; + case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break; + case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break; + case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break; + case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break; + case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break; + case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break; + case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break; + case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break; + case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break; + case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break; + case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break; + case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break; + case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break; + case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break; + case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break; + case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break; + case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break; + case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break; + case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break; + case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break; + case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break; + case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break; + case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break; + case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break; + case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break; + case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break; + case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break; + case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break; + case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break; + case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break; + case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break; + case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break; + case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break; + case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break; + case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break; + case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break; + case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to 
w=4."; break; + case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break; + case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break; + case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break; + case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break; + case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break; + case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break; + case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; + case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break; + case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break; + case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break; + case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break; + case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break; + case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break; + case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break; + case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break; + case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break; + case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break; + case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break; + case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break; + case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break; + case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break; + case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break; + case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break; + case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break; + case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break; + case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break; + case GF_E_UNKNOWN: s = "Unknown multiplication type."; break; + case GF_E_UNK_REG: s = "Unknown region type."; break; + case GF_E_UNK_DIV: s = "Unknown division type."; break; + default: s = "Undefined error."; + } + + fprintf(stderr, "%s\n", s); +} + +uint64_t gf_composite_get_default_poly(gf_t *base) +{ + gf_internal_t *h; + int rv; + + h = (gf_internal_t *) base->scratch; + if (h->w == 4) { + if (h->mult_type == GF_MULT_COMPOSITE) return 0; + if (h->prim_poly == 0x13) return 2; + return 0; + } + if (h->w == 8) { + if (h->mult_type == GF_MULT_COMPOSITE) return 0; + if (h->prim_poly == 0x11d) return 3; + return 0; + } + if (h->w == 16) { + if (h->mult_type == GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 3) return 0x105; + return 0; + } else { + if (h->prim_poly == 0x1100b) return 2; + if (h->prim_poly == 0x1002d) return 7; + return 0; + } + } + if (h->w == 32) { + if (h->mult_type == GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 2) return 0x10005; + if (rv == 7) return 0x10008; + if (rv == 0x105) return 0x10002; + return 0; + } else { + if (h->prim_poly == 0x400007) return 2; + if (h->prim_poly == 0xc5) return 3; + return 0; + } + } + if (h->w == 64) { + if (h->mult_type 
== GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 3) return 0x100000009ULL; + if (rv == 2) return 0x100000004ULL; + if (rv == 0x10005) return 0x100000003ULL; + if (rv == 0x10002) return 0x100000005ULL; + if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x0x100000003 works too, + but I want to differentiate cases). */ + return 0; + } else { + if (h->prim_poly == 0x1bULL) return 2; + return 0; + } + } + return 0; +} + +int gf_error_check(int w, int mult_type, int region_type, int divide_type, + int arg1, int arg2, uint64_t poly, gf_t *base) +{ + int sse3 = 0; + int sse2 = 0; + int pclmul = 0; + int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp; + gf_internal_t *sub; + + rdouble = (region_type & GF_REGION_DOUBLE_TABLE); + rquad = (region_type & GF_REGION_QUAD_TABLE); + rlazy = (region_type & GF_REGION_LAZY); + rsse = (region_type & GF_REGION_SSE); + rnosse = (region_type & GF_REGION_NOSSE); + raltmap = (region_type & GF_REGION_ALTMAP); + rcauchy = (region_type & GF_REGION_CAUCHY); + + if (divide_type != GF_DIVIDE_DEFAULT && + divide_type != GF_DIVIDE_MATRIX && + divide_type != GF_DIVIDE_EUCLID) { + _gf_errno = GF_E_UNK_DIV; + return 0; + } + + tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY | + GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY ); + if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; } + +#ifdef INTEL_SSE2 + sse2 = 1; +#endif + +#ifdef INTEL_SSSE3 + sse3 = 1; +#endif + +#ifdef INTEL_SSE4_PCLMUL + pclmul = 1; +#endif + + + if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; } + + if (mult_type != GF_MULT_COMPOSITE && w < 64) { + if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; } + } + + if (mult_type == GF_MULT_DEFAULT) { + if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; } + if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; } + if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; } + return 1; + } + + if (rsse && rnosse) { _gf_errno = GF_E_SSE__NO; return 0; } + if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; } + if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; } + if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; } + + if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE && + mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { + _gf_errno = GF_E_ARG1SET; + return 0; + } + + if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { + _gf_errno = GF_E_ARG2SET; + return 0; + } + + if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; } + + if (rdouble) { + if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; } + if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; } + if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; } + if (rsse || rnosse || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; } + if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; } + return 1; + } + + if (rquad) { + if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; } + if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; } + if (rsse || rnosse || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; } + return 1; + } + + if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; } + + if (mult_type == GF_MULT_SHIFT) { + if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; } + if (rsse || 
rnosse) { _gf_errno = GF_E_SSESHIF; return 0; } + return 1; + } + + if (mult_type == GF_MULT_CARRY_FREE) { + if (w != 4 && w != 8 && w != 16 && + w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; } + if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; } + if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; } + if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; } + if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; } + if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; } + if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } + if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; } + if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } + return 1; + } + + if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) { + if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; } + if (rsse && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; } + return 1; + } + + if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO + || mult_type == GF_MULT_LOG_ZERO_EXT ) { + if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; } + if (raltmap || rsse || rnosse) { _gf_errno = GF_E_LOG___J; return 0; } + + if (mult_type == GF_MULT_LOG_TABLE) return 1; + + if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; } + + if (mult_type == GF_MULT_LOG_ZERO) return 1; + + if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; } + return 1; + } + + if (mult_type == GF_MULT_GROUP) { + if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; } + if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; } + if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; } + if (w == 128 && (arg1 != 4 || + (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; } + if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; } + if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; } + if (raltmap || rsse || rnosse) { _gf_errno = GF_E_GR____J; return 0; } + return 1; + } + + if (mult_type == GF_MULT_TABLE) { + if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; } + if (w != 4 && (rsse || rnosse)) { _gf_errno = GF_E_TAB_SSE; return 0; } + if (rsse && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; } + if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; } + return 1; + } + + if (mult_type == GF_MULT_SPLIT_TABLE) { + if (arg1 > arg2) { + tmp = arg1; + arg1 = arg2; + arg2 = tmp; + } + if (w == 8) { + if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; } + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; } + } else if (w == 16) { + if (arg1 == 4 && arg2 == 16) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + } else if (arg1 == 8 && (arg2 == 16 || arg2 == 8)) { + if (rsse || rnosse) { _gf_errno = GF_E_SP_16_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; } + } else { _gf_errno = GF_E_SP_16AR; return 0; } + } else if (w == 32) { + if ((arg1 == 8 && arg2 == 8) || + (arg1 == 8 && arg2 == 32) || + (arg1 == 16 && arg2 == 32)) { + if (rsse || rnosse) { _gf_errno = GF_E_SP_32_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; } + } else if ((arg1 == 4 && arg2 == 32) || + (arg1 == 4 && arg2 == 32)) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && arg1 != 4) { _gf_errno = GF_E_SP_32_A; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; } + if (raltmap && 
rnosse) { _gf_errno = GF_E_SP_32AS; return 0; } + } else { _gf_errno = GF_E_SP_32AR; return 0; } + } else if (w == 64) { + if ((arg1 == 8 && arg2 == 8) || + (arg1 == 8 && arg2 == 64) || + (arg1 == 16 && arg2 == 64)) { + if (rsse || rnosse) { _gf_errno = GF_E_SP_64_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; } + } else if (arg1 == 4 && arg2 == 64) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; } + if (raltmap && rnosse) { _gf_errno = GF_E_SP_64AS; return 0; } + } else { _gf_errno = GF_E_SP_64AR; return 0; } + } else if (w == 128) { + if (arg1 == 8 && arg2 == 128) { + if (rsse || rnosse) { _gf_errno = GF_E_SP128_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; } + } else if (arg1 == 4 && arg2 == 128) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; } + if (raltmap && rnosse) { _gf_errno = GF_E_SP128AS; return 0; } + } else { _gf_errno = GF_E_SP128AR; return 0; } + } else { _gf_errno = GF_E_SPLIT_W; return 0; } + return 1; + } + + if (mult_type == GF_MULT_COMPOSITE) { + if (w != 8 && w != 16 && w != 32 + && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; } + if ((poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; } + if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; } + if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; } + if (rsse || rnosse) { _gf_errno = GF_E_COMP_SS; return 0; } + if (base != NULL) { + sub = (gf_internal_t *) base->scratch; + if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; } + if (poly == 0) { + if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; } + } + } + return 1; + } + + _gf_errno = GF_E_UNKNOWN; + return 0; +} + +int gf_scratch_size(int w, + int mult_type, + int region_type, + int divide_type, + int arg1, + int arg2) +{ + if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0; + + switch(w) { + case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); + } +} + +extern int gf_size(gf_t *gf) +{ + gf_internal_t *h; + int s; + + s = sizeof(gf_t); + h = (gf_internal_t *) gf->scratch; + s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2); + if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf); + return s; +} + + +int gf_init_easy(gf_t *gf, int w) +{ + return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + 0, 0, 0, NULL, NULL); +} + +/* Allen: What's going on here is this function is putting info into the + scratch mem of gf, and then calling the relevant REAL init + func for the word size. Probably done this way to consolidate + those aspects of initialization that don't rely on word size, + and then take care of word-size-specific stuff. 
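   As a hedged caller-side sketch (the choice of GF_MULT_LOG_TABLE for w=16 is
   only an example), the scratch_memory path pairs gf_scratch_size() with
   gf_init_hard() and leaves the allocation to the caller:

     gf_t gf;
     void *scratch;
     int sz = gf_scratch_size(16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT,
                              GF_DIVIDE_DEFAULT, 0, 0);
     if (sz > 0) {
       scratch = malloc(sz);
       if (gf_init_hard(&gf, 16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT,
                        GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, scratch)) {
         // ... use gf ...
         gf_free(&gf, 0);   // frees nothing here; the caller still owns scratch
       }
       free(scratch);
     } else {
       gf_error();
     }
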
*/ + +int gf_init_hard(gf_t *gf, int w, int mult_type, + int region_type, + int divide_type, + uint64_t prim_poly, + int arg1, int arg2, + gf_t *base_gf, + void *scratch_memory) +{ + int sz; + gf_internal_t *h; + + if (gf_error_check(w, mult_type, region_type, divide_type, + arg1, arg2, prim_poly, base_gf) == 0) return 0; + + sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); + if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught + in gf_error_check() */ + + if (scratch_memory == NULL) { + h = (gf_internal_t *) malloc(sz); + h->free_me = 1; + } else { + h = scratch_memory; + h->free_me = 0; + } + gf->scratch = (void *) h; + h->mult_type = mult_type; + h->region_type = region_type; + h->divide_type = divide_type; + h->w = w; + h->prim_poly = prim_poly; + h->arg1 = arg1; + h->arg2 = arg2; + h->base_gf = base_gf; + h->private = (void *) gf->scratch; + h->private = (char*)h->private + (sizeof(gf_internal_t)); + h->sse = 0x00; +#ifdef INTEL_SSE2 + h->sse |= GF_SSE2; +#endif +#ifdef INTEL_SSSE3 + h->sse |= GF_SSSE3; +#endif +#ifdef INTEL_SSE4 + h->sse |= GF_SSE4; +#endif +#ifdef INTEL_SSE4_PCLMUL + h->sse |= GF_SSE4_PCLMUL; +#endif + gf->extract_word.w32 = NULL; + + switch(w) { + case 4: return gf_w4_init(gf); + case 8: return gf_w8_init(gf); + case 16: return gf_w16_init(gf); + case 32: return gf_w32_init(gf); + case 64: return gf_w64_init(gf); + case 128: return gf_w128_init(gf); + default: return gf_wgen_init(gf); + } +} + +int gf_free(gf_t *gf, int recursive) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + if (recursive && h->base_gf != NULL) { + gf_free(h->base_gf, 1); + free(h->base_gf); + } + if (h->free_me) free(h); + return 0; /* Making compiler happy */ +} + +void gf_alignment_error(char *s, int a) +{ + fprintf(stderr, "Alignment error in %s:\n", s); + fprintf(stderr, " The source and destination buffers must be aligned to each other,\n"); + fprintf(stderr, " and they must be aligned to a %d-byte address.\n", a); + exit(1); +} + +static +void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) { + int cols, i, j; + uint32_t tmp; + + cols = rows; + + for (i = 0; i < rows; i++) inv[i] = (1 << i); + + /* First -- convert into upper triangular */ + + for (i = 0; i < cols; i++) { + + /* Swap rows if we ave a zero i,i element. If we can't swap, then the + matrix was not invertible */ + + if ((mat[i] & (1 << i)) == 0) { + for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ; + if (j == rows) { + fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n"); + exit(1); + } + tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp; + tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp; + } + + /* Now for each j>i, add A_ji*Ai to Aj */ + for (j = i+1; j != rows; j++) { + if ((mat[j] & (1 << i)) != 0) { + mat[j] ^= mat[i]; + inv[j] ^= inv[i]; + } + } + } + + /* Now the matrix is upper triangular. Start at the top and multiply down */ + + for (i = rows-1; i >= 0; i--) { + for (j = 0; j < i; j++) { + if (mat[j] & (1 << i)) { + /* mat[j] ^= mat[i]; */ + inv[j] ^= inv[i]; + } + } + } +} + +uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp) +{ + uint32_t mat[32], inv[32], mask; + int i; + + mask = (w == 32) ? 
0xffffffff : (1 << w) - 1; + for (i = 0; i < w; i++) { + mat[i] = y; + + if (y & (1 << (w-1))) { + y = y << 1; + y = ((y ^ pp) & mask); + } else { + y = y << 1; + } + } + + gf_invert_binary_matrix(mat, inv, w); + return inv[0]; +} + +void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) +{ + uint64_t a, prod; + int xor; + uint64_t *s64, *d64, *top; + + s64 = rd->s_start; + d64 = rd->d_start; + top = rd->d_top; + xor = rd->xor; + + if (xor) { + while (d64 != top) { + a = *s64; + prod = base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + prod ^= *d64; + *d64 = prod; + s64++; + d64++; + } + } else { + while (d64 != top) { + a = *s64; + prod = base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + *d64 = prod; + s64++; + d64++; + } + } +} + +static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top) +{ + uint8_t *s8, *d8; + uint16_t *s16, *d16; + uint32_t *s32, *d32; + uint64_t *s64, *d64; + gf_internal_t *h; + int wb; + uint32_t p, a; + + h = rd->gf->scratch; + wb = (h->w)/8; + if (wb == 0) wb = 1; + + while (src < s_top) { + switch (h->w) { + case 8: + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) : + rd->gf->multiply.w32(rd->gf, rd->val, *s8); + break; + case 4: + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + a = *s8; + p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf); + p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4); + if (rd->xor) p ^= *d8; + *d8 = p; + break; + case 16: + s16 = (uint16_t *) src; + d16 = (uint16_t *) dest; + *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) : + rd->gf->multiply.w32(rd->gf, rd->val, *s16); + break; + case 32: + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) : + rd->gf->multiply.w32(rd->gf, rd->val, *s32); + break; + case 64: + s64 = (uint64_t *) src; + d64 = (uint64_t *) dest; + *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) : + rd->gf->multiply.w64(rd->gf, rd->val, *s64); + break; + default: + fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w); + exit(1); + } + src = (char*)src + wb; + dest = (char*)dest + wb; + } +} + +/* JSP - The purpose of this procedure is to error check alignment, + and to set up the region operation so that it can best leverage + large words. + + It stores its information in rd. + + Assuming you're not doing Cauchy coding, (see below for that), + then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably + should change that). + + src and dest must then be aligned on ceil(w/8)-byte boundaries. + Moreover, bytes must be a multiple of ceil(w/8). If the variable + align is equal to ceil(w/8), then we will set s_start = src, + d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes). + And we return -- the implementation will go ahead and do the + multiplication on individual words (e.g. using discrete logs). + + If align is greater than ceil(w/8), then the implementation needs + to work on groups of "align" bytes. For example, suppose you are + implementing BYTWO, without SSE. Then you will be doing the region + multiplication in units of 8 bytes, so align = 8. 
Or, suppose you + are doing a Quad table in GF(2^4). You will be doing the region + multiplication in units of 2 bytes, so align = 2. Or, suppose you + are doing split multiplication with SSE operations in GF(2^8). + Then align = 16. Worse yet, suppose you are doing split + multiplication with SSE operations in GF(2^16), with or without + ALTMAP. Then, you will be doing the multiplication on 256 bits at + a time. So align = 32. + + When align does not equal ceil(w/8), we split the region + multiplication into three parts. We are going to make s_start be + the first address greater than or equal to src that is a multiple + of align. s_top is going to be the largest address >= src+bytes + such that (s_top - s_start) is a multiple of align. We do the + same with d_start and d_top. When we say that "src and dest must + be aligned with respect to each other, we mean that s_start-src + must equal d_start-dest. + + Now, the region multiplication is done in three parts -- the part + between src and s_start must be done using single words. + Similarly, the part between s_top and src+bytes must also be done + using single words. The part between s_start and s_top will be + done in chunks of "align" bytes. + + One final thing -- if align > 16, then s_start and d_start will be + aligned on a 16 byte boundary. Perhaps we should have two + variables: align and chunksize. Then we'd have s_start & d_start + aligned to "align", and have s_top-s_start be a multiple of + chunksize. That may be less confusing, but it would be a big + change. + + Finally, if align = -1, then we are doing Cauchy multiplication, + using only XOR's. In this case, we're not going to care about + alignment because we are just doing XOR's. Instead, the only + thing we care about is that bytes must be a multiple of w. + + This is not to say that alignment doesn't matter in performance + with XOR's. See that discussion in gf_multby_one(). + + After you call gf_set_region_data(), the procedure + gf_do_initial_region_alignment() calls gf->multiply.w32() on + everything between src and s_start. The procedure + gf_do_final_region_alignment() calls gf->multiply.w32() on + everything between s_top and src+bytes. + */ + +void gf_set_region_data(gf_region_data *rd, + gf_t *gf, + void *src, + void *dest, + int bytes, + uint64_t val, + int xor, + int align) +{ + gf_internal_t *h = NULL; + int wb; + uint32_t a; + unsigned long uls, uld; + + if (gf == NULL) { /* JSP - Can be NULL if you're just doing XOR's */ + wb = 1; + } else { + h = gf->scratch; + wb = (h->w)/8; + if (wb == 0) wb = 1; + } + + rd->gf = gf; + rd->src = src; + rd->dest = dest; + rd->bytes = bytes; + rd->val = val; + rd->xor = xor; + rd->align = align; + + uls = (unsigned long) src; + uld = (unsigned long) dest; + + a = (align <= 16) ? align : 16; + + if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers + so that there are no alignment regions. */ + if (h != NULL && bytes % h->w != 0) { + fprintf(stderr, "Error in region multiply operation.\n"); + fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w); + exit(1); + } + + rd->s_start = src; + rd->d_start = dest; + rd->s_top = (char*)src + bytes; + rd->d_top = (char*)src + bytes; + return; + } + + if (uls % a != uld % a) { + fprintf(stderr, "Error in region multiply operation.\n"); + fprintf(stderr, "The source & destination pointers must be aligned with respect\n"); + fprintf(stderr, "to each other along a %d byte boundary.\n", a); + fprintf(stderr, "Src = 0x%lx. 
Dest = 0x%lx\n", (unsigned long) src, + (unsigned long) dest); + exit(1); + } + + if (uls % wb != 0) { + fprintf(stderr, "Error in region multiply operation.\n"); + fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb); + fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, + (unsigned long) dest); + exit(1); + } + + if (bytes % wb != 0) { + fprintf(stderr, "Error in region multiply operation.\n"); + fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb); + exit(1); + } + + uls %= a; + if (uls != 0) uls = (a-uls); + rd->s_start = (char*)rd->src + uls; + rd->d_start = (char*)rd->dest + uls; + bytes -= uls; + bytes -= (bytes % align); + rd->s_top = (char*)rd->s_start + bytes; + rd->d_top = (char*)rd->d_start + bytes; + +} + +void gf_do_initial_region_alignment(gf_region_data *rd) +{ + gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start); +} + +void gf_do_final_region_alignment(gf_region_data *rd) +{ + gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (char*)rd->src+rd->bytes); +} + +void gf_multby_zero(void *dest, int bytes, int xor) +{ + if (xor) return; + bzero(dest, bytes); + return; +} + +/* JSP - gf_multby_one tries to do this in the most efficient way + possible. If xor = 0, then simply call memcpy() since that + should be optimized by the system. Otherwise, try to do the xor + in the following order: + + If src and dest are aligned with respect to each other on 16-byte + boundaries and you have SSE instructions, then use aligned SSE + instructions. + + If they aren't but you still have SSE instructions, use unaligned + SSE instructions. + + If there are no SSE instructions, but they are aligned with + respect to each other on 8-byte boundaries, then do them with + uint64_t's. + + Otherwise, call gf_unaligned_xor(), which does the following: + align a destination pointer along an 8-byte boundary, and then + memcpy 32 bytes at a time from the src pointer to an array of + doubles. I'm not sure if that's the best -- probably needs + testing, but this seems like it could be a black hole. 
+ */ + +static void gf_unaligned_xor(void *src, void *dest, int bytes); + +void gf_multby_one(void *src, void *dest, int bytes, int xor) +{ +#ifdef INTEL_SSE2 + __m128i ms, md; +#endif + unsigned long uls, uld; + uint8_t *s8, *d8; + uint64_t *s64, *d64, *dtop64; + gf_region_data rd; + + if (!xor) { + memcpy(dest, src, bytes); + return; + } + uls = (unsigned long) src; + uld = (unsigned long) dest; + +#ifdef INTEL_SSE2 + int abytes; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + if (uls % 16 == uld % 16) { + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); + while (s8 != rd.s_start) { + *d8 ^= *s8; + d8++; + s8++; + } + while (s8 < (uint8_t *) rd.s_top) { + ms = _mm_load_si128 ((__m128i *)(s8)); + md = _mm_load_si128 ((__m128i *)(d8)); + md = _mm_xor_si128(md, ms); + _mm_store_si128((__m128i *)(d8), md); + s8 += 16; + d8 += 16; + } + while (s8 != (uint8_t *) src + bytes) { + *d8 ^= *s8; + d8++; + s8++; + } + return; + } + + abytes = (bytes & 0xfffffff0); + + while (d8 < (uint8_t *) dest + abytes) { + ms = _mm_loadu_si128 ((__m128i *)(s8)); + md = _mm_loadu_si128 ((__m128i *)(d8)); + md = _mm_xor_si128(md, ms); + _mm_storeu_si128((__m128i *)(d8), md); + s8 += 16; + d8 += 16; + } + while (d8 != (uint8_t *) dest+bytes) { + *d8 ^= *s8; + d8++; + s8++; + } + return; +#endif + + if (uls % 8 != uld % 8) { + gf_unaligned_xor(src, dest, bytes); + return; + } + + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8); + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + while (d8 != rd.d_start) { + *d8 ^= *s8; + d8++; + s8++; + } + dtop64 = (uint64_t *) rd.d_top; + + d64 = (uint64_t *) rd.d_start; + s64 = (uint64_t *) rd.s_start; + + while (d64 < dtop64) { + *d64 ^= *s64; + d64++; + s64++; + } + + s8 = (uint8_t *) rd.s_top; + d8 = (uint8_t *) rd.d_top; + + while (d8 != (uint8_t *) dest+bytes) { + *d8 ^= *s8; + d8++; + s8++; + } + return; +} + +#define UNALIGNED_BUFSIZE (8) + +static void gf_unaligned_xor(void *src, void *dest, int bytes) +{ + uint64_t scopy[UNALIGNED_BUFSIZE], *d64; + int i; + gf_region_data rd; + uint8_t *s8, *d8; + + /* JSP - call gf_set_region_data(), but use dest in both places. This is + because I only want to set up dest. If I used src, gf_set_region_data() + would fail because src and dest are not aligned to each other wrt + 8-byte pointers. I know this will actually align d_start to 16 bytes. + If I change gf_set_region_data() to split alignment & chunksize, then + I could do this correctly. */ + + gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE); + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + while (d8 < (uint8_t *) rd.d_start) { + *d8 ^= *s8; + d8++; + s8++; + } + + d64 = (uint64_t *) d8; + while (d64 < (uint64_t *) rd.d_top) { + memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE); + s8 += 8*UNALIGNED_BUFSIZE; + for (i = 0; i < UNALIGNED_BUFSIZE; i++) { + *d64 ^= scopy[i]; + d64++; + } + } + + d8 = (uint8_t *) d64; + while (d8 < (uint8_t *) ((char*)dest+bytes)) { + *d8 ^= *s8; + d8++; + s8++; + } +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_general.c b/src/erasure-code/jerasure/gf-complete/src/gf_general.c new file mode 100644 index 000000000000..c41059815351 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_general.c @@ -0,0 +1,538 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
+ * + * gf_general.c + * + * This file has helper routines for doing basic GF operations with any + * legal value of w. The problem is that w <= 32, w=64 and w=128 all have + * different data types, which is a pain. The procedures in this file try + * to alleviate that pain. They are used in gf_unit and gf_time. + */ + +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_int.h" +#include "gf_method.h" +#include "gf_rand.h" +#include "gf_general.h" + +void gf_general_set_zero(gf_general_t *v, int w) +{ + if (w <= 32) { + v->w32 = 0; + } else if (w <= 64) { + v->w64 = 0; + } else { + v->w128[0] = 0; + v->w128[1] = 0; + } +} + +void gf_general_set_one(gf_general_t *v, int w) +{ + if (w <= 32) { + v->w32 = 1; + } else if (w <= 64) { + v->w64 = 1; + } else { + v->w128[0] = 0; + v->w128[1] = 1; + } +} + +void gf_general_set_two(gf_general_t *v, int w) +{ + if (w <= 32) { + v->w32 = 2; + } else if (w <= 64) { + v->w64 = 2; + } else { + v->w128[0] = 0; + v->w128[1] = 2; + } +} + +int gf_general_is_zero(gf_general_t *v, int w) +{ + if (w <= 32) { + return (v->w32 == 0); + } else if (w <= 64) { + return (v->w64 == 0); + } else { + return (v->w128[0] == 0 && v->w128[1] == 0); + } +} + +int gf_general_is_one(gf_general_t *v, int w) +{ + if (w <= 32) { + return (v->w32 == 1); + } else if (w <= 64) { + return (v->w64 == 1); + } else { + return (v->w128[0] == 0 && v->w128[1] == 1); + } +} + +void gf_general_set_random(gf_general_t *v, int w, int zero_ok) +{ + if (w <= 32) { + v->w32 = MOA_Random_W(w, zero_ok); + } else if (w <= 64) { + while (1) { + v->w64 = MOA_Random_64(); + if (v->w64 != 0 || zero_ok) return; + } + } else { + while (1) { + MOA_Random_128(v->w128); + if (v->w128[0] != 0 || v->w128[1] != 0 || zero_ok) return; + } + } +} + +void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex) +{ + if (w <= 32) { + if (hex) { + sprintf(s, "%x", v->w32); + } else { + sprintf(s, "%u", v->w32); + } + } else if (w <= 64) { + if (hex) { + sprintf(s, "%llx", (long long unsigned int) v->w64); + } else { + sprintf(s, "%lld", (long long unsigned int) v->w64); + } + } else { + if (v->w128[0] == 0) { + sprintf(s, "%llx", (long long unsigned int) v->w128[1]); + } else { + sprintf(s, "%llx%016llx", (long long unsigned int) v->w128[0], + (long long unsigned int) v->w128[1]); + } + } +} + +int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex) +{ + int l; + int save; + + if (w <= 32) { + if (hex) { + if (sscanf(s, "%x", &(v->w32)) == 0) return 0; + } else { + if (sscanf(s, "%u", &(v->w32)) == 0) return 0; + } + if (w == 32) return 1; + if (w == 31) { + if (v->w32 & (1 << 31)) return 0; + return 1; + } + if (v->w32 & ~((1 << w)-1)) return 0; + return 1; + } else if (w <= 64) { + if (hex) return (sscanf(s, "%llx", (long long unsigned int *) (&(v->w64))) == 1); + return (sscanf(s, "%lld", (long long int *) (&(v->w64))) == 1); + } else { + if (!hex) return 0; + l = strlen(s); + if (l <= 16) { + v->w128[0] = 0; + return (sscanf(s, "%llx", (long long unsigned int *) (&(v->w128[1]))) == 1); + } else { + if (l > 32) return 0; + save = s[l-16]; + s[l-16] = '\0'; + if (sscanf(s, "%llx", (long long unsigned int *) (&(v->w128[0]))) == 0) { + s[l-16] = save; + return 0; + } + return (sscanf(s+(l-16), "%llx", (long long unsigned int *) (&(v->w128[1]))) == 1); + } + } +} + +void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) +{ + gf_internal_t *h; + int w; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + c->w32 
= a->w32 ^ b->w32; + } else if (w <= 64) { + c->w64 = a->w64 ^ b->w64; + } else { + c->w128[0] = a->w128[0] ^ b->w128[0]; + c->w128[1] = a->w128[1] ^ b->w128[1]; + } +} + +void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) +{ + gf_internal_t *h; + int w; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + c->w32 = gf->multiply.w32(gf, a->w32, b->w32); + } else if (w <= 64) { + c->w64 = gf->multiply.w64(gf, a->w64, b->w64); + } else { + gf->multiply.w128(gf, a->w128, b->w128, c->w128); + } +} + +void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) +{ + gf_internal_t *h; + int w; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + c->w32 = gf->divide.w32(gf, a->w32, b->w32); + } else if (w <= 64) { + c->w64 = gf->divide.w64(gf, a->w64, b->w64); + } else { + gf->divide.w128(gf, a->w128, b->w128, c->w128); + } +} + +void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b) +{ + gf_internal_t *h; + int w; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + b->w32 = gf->inverse.w32(gf, a->w32); + } else if (w <= 64) { + b->w64 = gf->inverse.w64(gf, a->w64); + } else { + gf->inverse.w128(gf, a->w128, b->w128); + } +} + +int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w) +{ + if (w <= 32) { + return (v1->w32 == v2->w32); + } else if (w <= 64) { + return (v1->w64 == v2->w64); + } else { + return (v1->w128[0] == v2->w128[0] && + v1->w128[0] == v2->w128[0]); + } +} + +void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, void *ra, void *rb, int bytes, int xor) +{ + gf_internal_t *h; + int w; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + gf->multiply_region.w32(gf, ra, rb, a->w32, bytes, xor); + } else if (w <= 64) { + gf->multiply_region.w64(gf, ra, rb, a->w64, bytes, xor); + } else { + gf->multiply_region.w128(gf, ra, rb, a->w128, bytes, xor); + } +} + +void gf_general_do_region_check(gf_t *gf, gf_general_t *a, void *orig_a, void *orig_target, void *final_target, int bytes, int xor) +{ + gf_internal_t *h; + int w, words, i; + gf_general_t oa, ot, ft, sb; + char sa[50], soa[50], sot[50], sft[50], ssb[50]; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + words = (bytes * 8) / w; + for (i = 0; i < words; i++) { + if (w <= 32) { + oa.w32 = gf->extract_word.w32(gf, orig_a, bytes, i); + ot.w32 = gf->extract_word.w32(gf, orig_target, bytes, i); + ft.w32 = gf->extract_word.w32(gf, final_target, bytes, i); + sb.w32 = gf->multiply.w32(gf, a->w32, oa.w32); + if (xor) sb.w32 ^= ot.w32; + } else if (w <= 64) { + oa.w64 = gf->extract_word.w64(gf, orig_a, bytes, i); + ot.w64 = gf->extract_word.w64(gf, orig_target, bytes, i); + ft.w64 = gf->extract_word.w64(gf, final_target, bytes, i); + sb.w64 = gf->multiply.w64(gf, a->w64, oa.w64); + if (xor) sb.w64 ^= ot.w64; + } else { + gf->extract_word.w128(gf, orig_a, bytes, i, oa.w128); + gf->extract_word.w128(gf, orig_target, bytes, i, ot.w128); + gf->extract_word.w128(gf, final_target, bytes, i, ft.w128); + gf->multiply.w128(gf, a->w128, oa.w128, sb.w128); + if (xor) { + sb.w128[0] ^= ot.w128[0]; + sb.w128[1] ^= ot.w128[1]; + } + } + + if (!gf_general_are_equal(&ft, &sb, w)) { + + fprintf(stderr,"Problem with region multiply (all values in hex):\n"); + fprintf(stderr," Target address base: 0x%lx. Word 0x%x of 0x%x. 
Xor: %d\n", + (unsigned long) final_target, i, words, xor); + gf_general_val_to_s(a, w, sa, 1); + gf_general_val_to_s(&oa, w, soa, 1); + gf_general_val_to_s(&ot, w, sot, 1); + gf_general_val_to_s(&ft, w, sft, 1); + gf_general_val_to_s(&sb, w, ssb, 1); + fprintf(stderr," Value: %s\n", sa); + fprintf(stderr," Original source word: %s\n", soa); + if (xor) fprintf(stderr," XOR with target word: %s\n", sot); + fprintf(stderr," Product word: %s\n", sft); + fprintf(stderr," It should be: %s\n", ssb); + exit(0); + } + } +} + +void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) +{ + void *top; + gf_general_t g; + uint8_t *r8, *r8a; + uint16_t *r16; + uint32_t *r32; + uint64_t *r64; + int i; + + top = (char*)rb+size; + + /* If w is 8, 16, 32, 64 or 128, fill the regions with random bytes. + However, don't allow for zeros in rb, because that will screw up + division. + + When w is 4, you fill the regions with random 4-bit words in each byte. + + Otherwise, treat every four bytes as an uint32_t + and fill it with a random value mod (1 << w). + */ + + if (w == 8 || w == 16 || w == 32 || w == 64 || w == 128) { + MOA_Fill_Random_Region (ra, size); + while (rb < top) { + gf_general_set_random(&g, w, 0); + switch (w) { + case 8: + r8 = (uint8_t *) rb; + *r8 = g.w32; + break; + case 16: + r16 = (uint16_t *) rb; + *r16 = g.w32; + break; + case 32: + r32 = (uint32_t *) rb; + *r32 = g.w32; + break; + case 64: + r64 = (uint64_t *) rb; + *r64 = g.w64; + break; + case 128: + r64 = (uint64_t *) rb; + r64[0] = g.w128[0]; + r64[1] = g.w128[1]; + break; + } + rb = (char*)rb + (w/8); + } + } else if (w == 4) { + r8a = (uint8_t *) ra; + r8 = (uint8_t *) rb; + while (r8 < (uint8_t *) top) { + gf_general_set_random(&g, w, 1); + *r8a = g.w32; + gf_general_set_random(&g, w, 0); + *r8 = g.w32; + r8a++; + r8++; + } + } else { + r32 = (uint32_t *) ra; + for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 1); + r32 = (uint32_t *) rb; + for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 0); + } +} + +/* This sucks, but in order to time, you really need to avoid putting ifs in + the inner loops. So, I'm doing a separate timing test for each w: + (4 & 8), 16, 32, 64, 128 and everything else. Fortunately, the "everything else" + tests can be equivalent to w=32. + + I'm also putting the results back into ra, because otherwise, the optimizer might + figure out that we're not really doing anything in the inner loops and it + will chuck that. 
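+   One more note: each test below returns a count of the words it actually
+   processed, which the timing harness presumably uses to normalize the
+   elapsed time.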
*/ + +int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char test) +{ + gf_internal_t *h; + void *top; + uint8_t *r8a, *r8b, *top8; + uint16_t *r16a, *r16b, *top16; + uint32_t *r32a, *r32b, *top32; + uint64_t *r64a, *r64b, *top64, *r64c; + int w, rv; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + top = (char*)ra + size; + + if (w == 8 || w == 4) { + r8a = (uint8_t *) ra; + r8b = (uint8_t *) rb; + top8 = (uint8_t *) top; + if (test == 'M') { + while (r8a < top8) { + *r8a = gf->multiply.w32(gf, *r8a, *r8b); + r8a++; + r8b++; + } + } else if (test == 'D') { + while (r8a < top8) { + *r8a = gf->divide.w32(gf, *r8a, *r8b); + r8a++; + r8b++; + } + } else if (test == 'I') { + while (r8a < top8) { + *r8a = gf->inverse.w32(gf, *r8a); + r8a++; + } + } + return (top8 - (uint8_t *) ra); + } + + if (w == 16) { + r16a = (uint16_t *) ra; + r16b = (uint16_t *) rb; + top16 = (uint16_t *) top; + if (test == 'M') { + while (r16a < top16) { + *r16a = gf->multiply.w32(gf, *r16a, *r16b); + r16a++; + r16b++; + } + } else if (test == 'D') { + while (r16a < top16) { + *r16a = gf->divide.w32(gf, *r16a, *r16b); + r16a++; + r16b++; + } + } else if (test == 'I') { + while (r16a < top16) { + *r16a = gf->inverse.w32(gf, *r16a); + r16a++; + } + } + return (top16 - (uint16_t *) ra); + } + if (w <= 32) { + r32a = (uint32_t *) ra; + r32b = (uint32_t *) rb; + top32 = (uint32_t *) ra + (size/4); /* This is for the "everything elses" */ + + if (test == 'M') { + while (r32a < top32) { + *r32a = gf->multiply.w32(gf, *r32a, *r32b); + r32a++; + r32b++; + } + } else if (test == 'D') { + while (r32a < top32) { + *r32a = gf->divide.w32(gf, *r32a, *r32b); + r32a++; + r32b++; + } + } else if (test == 'I') { + while (r32a < top32) { + *r32a = gf->inverse.w32(gf, *r32a); + r32a++; + } + } + return (top32 - (uint32_t *) ra); + } + if (w == 64) { + r64a = (uint64_t *) ra; + r64b = (uint64_t *) rb; + top64 = (uint64_t *) top; + if (test == 'M') { + while (r64a < top64) { + *r64a = gf->multiply.w64(gf, *r64a, *r64b); + r64a++; + r64b++; + } + } else if (test == 'D') { + while (r64a < top64) { + *r64a = gf->divide.w64(gf, *r64a, *r64b); + r64a++; + r64b++; + } + } else if (test == 'I') { + while (r64a < top64) { + *r64a = gf->inverse.w64(gf, *r64a); + r64a++; + } + } + return (top64 - (uint64_t *) ra); + } + if (w == 128) { + r64a = (uint64_t *) ra; + r64c = r64a; + r64a += 2; + r64b = (uint64_t *) rb; + top64 = (uint64_t *) top; + rv = (top64 - r64a)/2; + if (test == 'M') { + while (r64a < top64) { + gf->multiply.w128(gf, r64a, r64b, r64c); + r64a += 2; + r64b += 2; + } + } else if (test == 'D') { + while (r64a < top64) { + gf->divide.w128(gf, r64a, r64b, r64c); + r64a += 2; + r64b += 2; + } + } else if (test == 'I') { + while (r64a < top64) { + gf->inverse.w128(gf, r64a, r64c); + r64a += 2; + } + } + return rv; + } + return 0; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_method.c b/src/erasure-code/jerasure/gf-complete/src/gf_method.c new file mode 100644 index 000000000000..a7bcacff9767 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_method.c @@ -0,0 +1,185 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_method.c + * + * Parses argv to figure out the mult_type and arguments. Returns the gf. 
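+ *
+ * For illustration (hypothetical values), an argv tail such as
+ *
+ *   -m SPLIT 16 4 -r ALTMAP -
+ *
+ * would select GF_MULT_SPLIT_TABLE with arg1=16 and arg2=4, OR
+ * GF_REGION_ALTMAP into region_type, and stop at the bare "-", at which
+ * point gf_init_hard() is called and the index just past the "-" is
+ * returned (0 is returned on any error).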
+ */ + +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_int.h" +#include "gf_method.h" + +int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting) +{ + int mult_type, divide_type, region_type; + int arg1, arg2; + uint64_t prim_poly; + gf_t *base; + + mult_type = GF_MULT_DEFAULT; + region_type = GF_REGION_DEFAULT; + divide_type = GF_DIVIDE_DEFAULT; + prim_poly = 0; + base = NULL; + arg1 = 0; + arg2 = 0; + while (1) { + if (argc > starting) { + if (strcmp(argv[starting], "-m") == 0) { + starting++; + if (mult_type != GF_MULT_DEFAULT) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_TWOMULT; + return 0; + } + if (strcmp(argv[starting], "SHIFT") == 0) { + mult_type = GF_MULT_SHIFT; + starting++; + } else if (strcmp(argv[starting], "CARRY_FREE") == 0) { + mult_type = GF_MULT_CARRY_FREE; + starting++; + } else if (strcmp(argv[starting], "GROUP") == 0) { + mult_type = GF_MULT_GROUP; + if (argc < starting + 3) { + _gf_errno = GF_E_GROUPAR; + return 0; + } + if (sscanf(argv[starting+1], "%d", &arg1) == 0 || + sscanf(argv[starting+2], "%d", &arg2) == 0) { + _gf_errno = GF_E_GROUPNU; + return 0; + } + starting += 3; + } else if (strcmp(argv[starting], "BYTWO_p") == 0) { + mult_type = GF_MULT_BYTWO_p; + starting++; + } else if (strcmp(argv[starting], "BYTWO_b") == 0) { + mult_type = GF_MULT_BYTWO_b; + starting++; + } else if (strcmp(argv[starting], "TABLE") == 0) { + mult_type = GF_MULT_TABLE; + starting++; + } else if (strcmp(argv[starting], "LOG") == 0) { + mult_type = GF_MULT_LOG_TABLE; + starting++; + } else if (strcmp(argv[starting], "LOG_ZERO") == 0) { + mult_type = GF_MULT_LOG_ZERO; + starting++; + } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) { + mult_type = GF_MULT_LOG_ZERO_EXT; + starting++; + } else if (strcmp(argv[starting], "SPLIT") == 0) { + mult_type = GF_MULT_SPLIT_TABLE; + if (argc < starting + 3) { + _gf_errno = GF_E_SPLITAR; + return 0; + } + if (sscanf(argv[starting+1], "%d", &arg1) == 0 || + sscanf(argv[starting+2], "%d", &arg2) == 0) { + _gf_errno = GF_E_SPLITNU; + return 0; + } + starting += 3; + } else if (strcmp(argv[starting], "COMPOSITE") == 0) { + mult_type = GF_MULT_COMPOSITE; + if (argc < starting + 2) { _gf_errno = GF_E_FEWARGS; return 0; } + if (sscanf(argv[starting+1], "%d", &arg1) == 0) { + _gf_errno = GF_E_COMP_A2; + return 0; + } + starting += 2; + base = (gf_t *) malloc(sizeof(gf_t)); + starting = create_gf_from_argv(base, w/arg1, argc, argv, starting); + if (starting == 0) { + free(base); + return 0; + } + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_UNKNOWN; + return 0; + } + } else if (strcmp(argv[starting], "-r") == 0) { + starting++; + if (strcmp(argv[starting], "DOUBLE") == 0) { + region_type |= GF_REGION_DOUBLE_TABLE; + starting++; + } else if (strcmp(argv[starting], "QUAD") == 0) { + region_type |= GF_REGION_QUAD_TABLE; + starting++; + } else if (strcmp(argv[starting], "LAZY") == 0) { + region_type |= GF_REGION_LAZY; + starting++; + } else if (strcmp(argv[starting], "SSE") == 0) { + region_type |= GF_REGION_SSE; + starting++; + } else if (strcmp(argv[starting], "NOSSE") == 0) { + region_type |= GF_REGION_NOSSE; + starting++; + } else if (strcmp(argv[starting], "CAUCHY") == 0) { + region_type |= GF_REGION_CAUCHY; + starting++; + } else if (strcmp(argv[starting], "ALTMAP") == 0) { + region_type |= GF_REGION_ALTMAP; + starting++; + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_UNK_REG; + return 0; + } + } else if (strcmp(argv[starting], 
"-p") == 0) { + starting++; + if (sscanf(argv[starting], "%llx", (long long unsigned int *)(&prim_poly)) == 0) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_POLYSPC; + return 0; + } + starting++; + } else if (strcmp(argv[starting], "-d") == 0) { + starting++; + if (divide_type != GF_DIVIDE_DEFAULT) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_TWO_DIV; + return 0; + } else if (strcmp(argv[starting], "EUCLID") == 0) { + divide_type = GF_DIVIDE_EUCLID; + starting++; + } else if (strcmp(argv[starting], "MATRIX") == 0) { + divide_type = GF_DIVIDE_MATRIX; + starting++; + } else { + _gf_errno = GF_E_UNK_DIV; + return 0; + } + } else if (strcmp(argv[starting], "-") == 0) { + /* + printf("Scratch size: %d\n", gf_scratch_size(w, + mult_type, region_type, divide_type, arg1, arg2)); + */ + if (gf_init_hard(gf, w, mult_type, region_type, divide_type, + prim_poly, arg1, arg2, base, NULL) == 0) { + if (base != NULL) gf_free(base, 1); + return 0; + } else + return starting + 1; + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_UNKFLAG; + return 0; + } + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_FEWARGS; + return 0; + } + } +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_rand.c b/src/erasure-code/jerasure/gf-complete/src/gf_rand.c new file mode 100644 index 000000000000..a9aa7ad3605c --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_rand.c @@ -0,0 +1,80 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_rand.c -- Random number generator. + */ + +#include +#include +#include +#include "gf_rand.h" + +/* Lifted the "Mother of All" random number generator from http://www.agner.org/random/ */ + +static uint32_t MOA_X[5]; + +uint32_t MOA_Random_32() { + uint64_t sum; + sum = (uint64_t)2111111111UL * (uint64_t)MOA_X[3] + + (uint64_t)1492 * (uint64_t)(MOA_X[2]) + + (uint64_t)1776 * (uint64_t)(MOA_X[1]) + + (uint64_t)5115 * (uint64_t)(MOA_X[0]) + + (uint64_t)MOA_X[4]; + MOA_X[3] = MOA_X[2]; MOA_X[2] = MOA_X[1]; MOA_X[1] = MOA_X[0]; + MOA_X[4] = (uint32_t)(sum >> 32); + MOA_X[0] = (uint32_t)sum; + return MOA_X[0]; +} + +uint64_t MOA_Random_64() { + uint64_t sum; + + sum = MOA_Random_32(); + sum <<= 32; + sum |= MOA_Random_32(); + return sum; +} + +void MOA_Random_128(uint64_t *x) { + x[0] = MOA_Random_64(); + x[1] = MOA_Random_64(); + return; +} + +uint32_t MOA_Random_W(int w, int zero_ok) +{ + uint32_t b; + + do { + b = MOA_Random_32(); + if (w == 31) b &= 0x7fffffff; + if (w < 31) b %= (1 << w); + } while (!zero_ok && b == 0); + return b; +} + +void MOA_Seed(uint32_t seed) { + int i; + uint32_t s = seed; + for (i = 0; i < 5; i++) { + s = s * 29943829 - 1; + MOA_X[i] = s; + } + for (i=0; i<19; i++) MOA_Random_32(); +} + + +void MOA_Fill_Random_Region (void *reg, int size) +{ + uint32_t *r32; + uint8_t *r8; + int i; + + r32 = (uint32_t *) reg; + r8 = (uint8_t *) reg; + for (i = 0; i < size/4; i++) r32[i] = MOA_Random_32(); + for (i *= 4; i < size; i++) r8[i] = MOA_Random_W(8, 1); +} + diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w128.c b/src/erasure-code/jerasure/gf-complete/src/gf_w128.c new file mode 100644 index 000000000000..d4336ae5ee6e --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w128.c @@ -0,0 +1,1769 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. 
Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w128.c + * + * Routines for 128-bit Galois fields + */ + +#include "gf_int.h" +#include +#include + +#define GF_FIELD_WIDTH (128) + +#define two_x(a) {\ + a[0] <<= 1; \ + if (a[1] & 1ULL << 63) a[0] ^= 1; \ + a[1] <<= 1; } + +#define a_get_b(a, i, b, j) {\ + a[i] = b[j]; \ + a[i + 1] = b[j + 1];} + +#define set_zero(a, i) {\ + a[i] = 0; \ + a[i + 1] = 0;} + +struct gf_w128_split_4_128_data { + uint64_t last_value[2]; + uint64_t tables[2][32][16]; +}; + +struct gf_w128_split_8_128_data { + uint64_t last_value[2]; + uint64_t tables[2][16][256]; +}; + +typedef struct gf_group_tables_s { + gf_val_128_t m_table; + gf_val_128_t r_table; +} gf_group_tables_t; + +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); } + +static +void +gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, +int xor) +{ + int i; + gf_val_128_t s128; + gf_val_128_t d128; + uint64_t c128[2]; + gf_region_data rd; + + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + set_zero(c128, 0); + + s128 = (gf_val_128_t) src; + d128 = (gf_val_128_t) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + gf->multiply.w128(gf, &s128[i], val, c128); + d128[i] ^= c128[0]; + d128[i+1] ^= c128[1]; + } + } else { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + gf->multiply.w128(gf, &s128[i], val, &d128[i]); + } + } +} + +static +void +gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, +int xor) +{ + gf_internal_t * h = gf->scratch; + if ((h->sse & GF_SSE4_PCLMUL) == 0) + return; +#if defined(INTEL_SSE4_PCLMUL) + int i; + gf_val_128_t s128; + gf_val_128_t d128; + gf_region_data rd; + __m128i a,b; + __m128i result0,result1; + __m128i prim_poly; + __m128i c,d,e,f; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + /* We only do this to check on alignment. 
*/ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + s128 = (gf_val_128_t) src; + d128 = (gf_val_128_t) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0); + b = _mm_insert_epi64 (a, val[1], 0); + a = _mm_insert_epi64 (a, s128[i], 1); + b = _mm_insert_epi64 (b, val[0], 1); + + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. */ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1); + d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0); + } + } else { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0); + b = _mm_insert_epi64 (a, val[1], 0); + a = _mm_insert_epi64 (a, s128[i], 1); + b = _mm_insert_epi64 (b, val[0], 1); + + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + d128[i] = (uint64_t)_mm_extract_epi64(result1,1); + d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0); + } + } +#endif +} + +/* + * Some w128 notes: + * --Big Endian + * --return values allocated beforehand + */ + +#define GF_W128_IS_ZERO(val) (val[0] == 0 && 
val[1] == 0) + +void +gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + /* ordered highest bit to lowest l[0] l[1] r[0] r[1] */ + uint64_t pl[2], pr[2], ppl[2], ppr[2], i, a[2], bl[2], br[2], one, lbit; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if (GF_W128_IS_ZERO(a128) || GF_W128_IS_ZERO(b128)) { + set_zero(c128, 0); + return; + } + + a_get_b(a, 0, a128, 0); + a_get_b(br, 0, b128, 0); + set_zero(bl, 0); + + one = 1; + lbit = (one << 63); + + set_zero(pl, 0); + set_zero(pr, 0); + + /* Allen: a*b for right half of a */ + for (i = 0; i < GF_FIELD_WIDTH/2; i++) { + if (a[1] & (one << i)) { + pl[1] ^= bl[1]; + pr[0] ^= br[0]; + pr[1] ^= br[1]; + } + bl[1] <<= 1; + if (br[0] & lbit) bl[1] ^= 1; + br[0] <<= 1; + if (br[1] & lbit) br[0] ^= 1; + br[1] <<= 1; + } + + /* Allen: a*b for left half of a */ + for (i = 0; i < GF_FIELD_WIDTH/2; i++) { + if (a[0] & (one << i)) { + pl[0] ^= bl[0]; + pl[1] ^= bl[1]; + pr[0] ^= br[0]; + } + bl[0] <<= 1; + if (bl[1] & lbit) bl[0] ^= 1; + bl[1] <<= 1; + if (br[0] & lbit) bl[1] ^= 1; + br[0] <<= 1; + } + + /* Allen: do first half of reduction (based on left quarter of initial product) */ + one = lbit >> 1; + ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */ + ppl[1] = h->prim_poly >> 2; + ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2); + ppr[1] = 0; + while (one != 0) { + if (pl[0] & one) { + pl[0] ^= ppl[0]; + pl[1] ^= ppl[1]; + pr[0] ^= ppr[0]; + pr[1] ^= ppr[1]; + } + one >>= 1; + ppr[1] >>= 1; + if (ppr[0] & 1) ppr[1] ^= lbit; + ppr[0] >>= 1; + if (ppl[1] & 1) ppr[0] ^= lbit; + ppl[1] >>= 1; + if (ppl[0] & 1) ppl[1] ^= lbit; + ppl[0] >>= 1; + } + + /* Allen: final half of reduction */ + one = lbit; + while (one != 0) { + if (pl[1] & one) { + pl[1] ^= ppl[1]; + pr[0] ^= ppr[0]; + pr[1] ^= ppr[1]; + } + one >>= 1; + ppr[1] >>= 1; + if (ppr[0] & 1) ppr[1] ^= lbit; + ppr[0] >>= 1; + if (ppl[1] & 1) ppr[0] ^= lbit; + ppl[1] >>= 1; + } + + /* Allen: if we really want to optimize this we can just be using c128 instead of pr all along */ + c128[0] = pr[0]; + c128[1] = pr[1]; + + return; +} + +void +gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + gf_internal_t * h = gf->scratch; + if ((h->sse & GF_SSE4_PCLMUL) == 0) + return; +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a,b; + __m128i result0,result1; + __m128i prim_poly; + __m128i c,d,e,f; + + a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0); + b = _mm_insert_epi64 (a, b128[1], 0); + a = _mm_insert_epi64 (a, a128[0], 1); + b = _mm_insert_epi64 (b, b128[0], 1); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + + /* we need to test algorithm 2 later*/ + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry 
bits, and we have to reduce.*/ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + + c128[0] = (uint64_t)_mm_extract_epi64(result1,1); + c128[1] = (uint64_t)_mm_extract_epi64(result1,0); +#endif +return; +} + +void +gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + uint64_t amask[2], pmask, pp, prod[2]; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/ + uint64_t topbit; /* this is used as a boolean value */ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + prod[0] = 0; + prod[1] = 0; + pmask = 0x8000000000000000ULL; + amask[0] = 0x8000000000000000ULL; + amask[1] = 0; + + while (amask[1] != 0 || amask[0] != 0) { + topbit = (prod[0] & pmask); + prod[0] <<= 1; + if (prod[1] & pmask) prod[0] ^= 1; + prod[1] <<= 1; + if (topbit) prod[1] ^= pp; + if ((a128[0] & amask[0]) || (a128[1] & amask[1])) { + prod[0] ^= b128[0]; + prod[1] ^= b128[1]; + } + amask[1] >>= 1; + if (amask[0] & 1) amask[1] ^= pmask; + amask[0] >>= 1; + } + c128[0] = prod [0]; + c128[1] = prod [1]; + return; +} + +void +gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + gf_internal_t * h = gf->scratch; + if ((h->sse & GF_SSE4) == 0) + return; +#if defined(INTEL_SSE4) + int i; + __m128i a, b, pp, prod, amask, u_middle_one; + /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/ + uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */ + + + h = (gf_internal_t *) gf->scratch; + pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + prod = _mm_setzero_si128(); + a = _mm_insert_epi64(prod, a128[1], 0x0); + a = _mm_insert_epi64(a, a128[0], 0x1); + b = _mm_insert_epi64(prod, b128[1], 0x0); + b = _mm_insert_epi64(b, b128[0], 0x1); + pmask = 0x80000000; + amask = _mm_insert_epi32(prod, 0x80000000, 0x3); + u_middle_one = _mm_insert_epi32(prod, 1, 0x2); + + for (i = 0; i < 64; i++) { + topbit = (_mm_extract_epi32(prod, 0x3) & pmask); + middlebit = (_mm_extract_epi32(prod, 0x1) & pmask); + prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */ + if (middlebit) { + prod = _mm_xor_si128(prod, u_middle_one); + } + if (topbit) { + prod = _mm_xor_si128(prod, pp); + } + if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) { + prod = _mm_xor_si128(prod, b); + } + amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/ + } + amask = _mm_insert_epi32(amask, 1 << 31, 0x1); + for (i = 64; i < 128; i++) { + topbit = (_mm_extract_epi32(prod, 0x3) & pmask); + middlebit = (_mm_extract_epi32(prod, 0x1) & pmask); + prod = _mm_slli_epi64(prod, 1); + if (middlebit) prod = _mm_xor_si128(prod, u_middle_one); + if (topbit) prod = _mm_xor_si128(prod, pp); + if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) { + prod = _mm_xor_si128(prod, b); + } + amask = _mm_srli_epi64(amask, 1); + } + c128[0] = (uint64_t)_mm_extract_epi64(prod, 1); + c128[1] = (uint64_t)_mm_extract_epi64(prod, 0); +#endif + return; +} + + +/* Ben: This slow function implements sse instrutions for bytwo_b because why not */ +void +gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, 
gf_val_128_t b128, gf_val_128_t c128) +{ + gf_internal_t * h = gf->scratch; + if ((h->sse & GF_SSE4) == 0) + return; +#if defined(INTEL_SSE4) + __m128i a, b, lmask, hmask, pp, c, middle_one; + uint64_t topbit, middlebit; + + + c = _mm_setzero_si128(); + lmask = _mm_insert_epi64(c, 1ULL << 63, 0); + hmask = _mm_insert_epi64(c, 1ULL << 63, 1); + b = _mm_insert_epi64(c, a128[0], 1); + b = _mm_insert_epi64(b, a128[1], 0); + a = _mm_insert_epi64(c, b128[0], 1); + a = _mm_insert_epi64(a, b128[1], 0); + pp = _mm_insert_epi64(c, h->prim_poly, 0); + middle_one = _mm_insert_epi64(c, 1, 0x1); + + while (1) { + if (_mm_extract_epi32(a, 0x0) & 1) { + c = _mm_xor_si128(c, b); + } + middlebit = (_mm_extract_epi32(a, 0x2) & 1); + a = _mm_srli_epi64(a, 1); + if (middlebit) a = _mm_xor_si128(a, lmask); + if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){ + c128[0] = _mm_extract_epi64(c, 0x1); + c128[1] = _mm_extract_epi64(c, 0x0); + return; + } + topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1)); + middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0)); + b = _mm_slli_epi64(b, 1); + if (middlebit) b = _mm_xor_si128(b, middle_one); + if (topbit) b = _mm_xor_si128(b, pp); + } +#endif +} + +void +gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + uint64_t bmask, pp; + gf_internal_t *h; + uint64_t a[2], b[2], c[2]; + + h = (gf_internal_t *) gf->scratch; + + bmask = (1ULL << 63); + set_zero(c, 0); + b[0] = a128[0]; + b[1] = a128[1]; + a[0] = b128[0]; + a[1] = b128[1]; + + while (1) { + if (a[1] & 1) { + c[0] ^= b[0]; + c[1] ^= b[1]; + } + a[1] >>= 1; + if (a[0] & 1) a[1] ^= bmask; + a[0] >>= 1; + if (a[1] == 0 && a[0] == 0) { + c128[0] = c[0]; + c128[1] = c[1]; + return; + } + pp = (b[0] & bmask); + b[0] <<= 1; + if (b[1] & bmask) b[0] ^= 1; + b[1] <<= 1; + if (pp) b[1] ^= h->prim_poly; + } +} + +static +void +gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + int i, j, k; + uint64_t pp; + gf_internal_t *h; + uint64_t *s64, *d64, *top; + gf_region_data rd; + uint64_t v[2], s; + struct gf_w128_split_4_128_data *ld; + + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + ld = (struct gf_w128_split_4_128_data *) h->private; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 32; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + +/* + for (i = 0; i < 32; i++) { + for (j = 0; j < 16; j++) { + printf("%2d %2d %016llx %016llx\n", i, j, ld->tables[0][i][j], ld->tables[1][i][j]); + } + printf("\n"); + } + */ + i = 0; + while (d64 < top) { + v[0] = (xor) ? d64[0] : 0; + v[1] = (xor) ? 
d64[1] : 0; + s = s64[1]; + i = 0; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xf]; + v[1] ^= ld->tables[1][i][s&0xf]; + s >>= 4; + i++; + } + s = s64[0]; + i = 16; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xf]; + v[1] ^= ld->tables[1][i][s&0xf]; + s >>= 4; + i++; + } + d64[0] = v[0]; + d64[1] = v[1]; + s64 += 2; + d64 += 2; + } +} + +static +void +gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + gf_internal_t * h = gf->scratch; + if ((h->sse & GF_SSSE3) == 0) + return; + +#ifdef INTEL_SSSE3 + int i, j, k; + uint64_t pp, v[2], s, *s64, *d64, *top; + __m128i p, tables[32][16]; + struct gf_w128_split_4_128_data *ld; + gf_region_data rd; + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 16); + + /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ + + gf_w128_multiply_region_from_single(gf, src, dest, val, ((char*)rd.s_start-(char*)src), xor); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_w128_split_4_128_data *) h->private; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 32; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + + for (i = 0; i < 32; i++) { + for (j = 0; j < 16; j++) { + v[0] = ld->tables[0][i][j]; + v[1] = ld->tables[1][i][j]; + tables[i][j] = _mm_loadu_si128((__m128i *) v); + +/* + printf("%2d %2d: ", i, j); + MM_PRINT8("", tables[i][j]); */ + } + } + + while (d64 != top) { + + if (xor) { + p = _mm_load_si128 ((__m128i *) d64); + } else { + p = _mm_setzero_si128(); + } + s = *s64; + s64++; + for (i = 0; i < 16; i++) { + j = (s&0xf); + s >>= 4; + p = _mm_xor_si128(p, tables[16+i][j]); + } + s = *s64; + s64++; + for (i = 0; i < 16; i++) { + j = (s&0xf); + s >>= 4; + p = _mm_xor_si128(p, tables[i][j]); + } + _mm_store_si128((__m128i *) d64, p); + d64 += 2; + } + + /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ + + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((char*)src+bytes)-(char*)rd.s_top, xor); +#endif +} + +static +void +gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + gf_internal_t * h = gf->scratch; + if ((h->sse & GF_SSSE3) == 0) + return; + +#ifdef INTEL_SSSE3 + int i, j, k; + uint64_t pp, v[2], *s64, *d64, *top; + __m128i si, tables[32][16], p[16], v0, mask1; + struct gf_w128_split_4_128_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + pp = h->prim_poly; + + /* We only do this to check on alignment. 
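+     The 256-byte chunk size matches the ALTMAP loop below, which consumes
+     sixteen 16-byte source vectors (sixteen 128-bit words) per iteration.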
*/ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256); + + /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ + + gf_w128_multiply_region_from_single(gf, src, dest, val, ((char*)rd.s_start-(char*)src), xor); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_w128_split_4_128_data *) h->private; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 32; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + + for (i = 0; i < 32; i++) { + for (j = 0; j < 16; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k]; + ld->tables[1-(j/8)][i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); +/* + printf("%2d %2d: ", i, j); + MM_PRINT8("", tables[i][j]); + */ + } + } + + + mask1 = _mm_set1_epi8(0xf); + + while (d64 != top) { + + if (xor) { + for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2)); + } else { + for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128(); + } + i = 0; + for (k = 0; k < 16; k++) { + v0 = _mm_load_si128((__m128i *) s64); + s64 += 2; + + si = _mm_and_si128(v0, mask1); + + for (j = 0; j < 16; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + for (j = 0; j < 16; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + } + for (i = 0; i < 16; i++) { + _mm_store_si128((__m128i *) d64, p[i]); + d64 += 2; + } + } + /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ + + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((char*)src+bytes)-(char*)rd.s_top, xor); +#endif +} + +static +void +gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + int i, j, k; + uint64_t pp; + gf_internal_t *h; + uint64_t *s64, *d64, *top; + gf_region_data rd; + uint64_t v[2], s; + struct gf_w128_split_8_128_data *ld; + + /* Check on alignment. Ignore it otherwise. 
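+     The approach below: lazily rebuild sixteen pairs of 256-entry tables
+     (one pair per byte of the 128-bit source word, one table for each
+     64-bit half of the product) whenever val changes, then form each
+     product by XORing the table entries selected by the source word's
+     bytes.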
*/ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + ld = (struct gf_w128_split_8_128_data *) h->private; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 16; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < (1 << 8); j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + + while (d64 < top) { + v[0] = (xor) ? d64[0] : 0; + v[1] = (xor) ? d64[1] : 0; + s = s64[1]; + i = 0; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xff]; + v[1] ^= ld->tables[1][i][s&0xff]; + s >>= 8; + i++; + } + s = s64[0]; + i = 8; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xff]; + v[1] ^= ld->tables[1][i][s&0xff]; + s >>= 8; + i++; + } + d64[0] = v[0]; + d64[1] = v[1]; + s64 += 2; + d64 += 2; + } +} + +void +gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + uint64_t bmask, pp; + gf_internal_t *h; + uint64_t a[2], c[2], b[2], *s64, *d64, *top; + gf_region_data rd; + + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + bmask = (1ULL << 63); + + while (d64 < top) { + set_zero(c, 0); + b[0] = s64[0]; + b[1] = s64[1]; + a[0] = val[0]; + a[1] = val[1]; + + while (a[0] != 0) { + if (a[1] & 1) { + c[0] ^= b[0]; + c[1] ^= b[1]; + } + a[1] >>= 1; + if (a[0] & 1) a[1] ^= bmask; + a[0] >>= 1; + pp = (b[0] & bmask); + b[0] <<= 1; + if (b[1] & bmask) b[0] ^= 1; + b[1] <<= 1; + if (pp) b[1] ^= h->prim_poly; + } + while (1) { + if (a[1] & 1) { + c[0] ^= b[0]; + c[1] ^= b[1]; + } + a[1] >>= 1; + if (a[1] == 0) break; + pp = (b[0] & bmask); + b[0] <<= 1; + if (b[1] & bmask) b[0] ^= 1; + b[1] <<= 1; + if (pp) b[1] ^= h->prim_poly; + } + if (xor) { + d64[0] ^= c[0]; + d64[1] ^= c[1]; + } else { + d64[0] = c[0]; + d64[1] = c[1]; + } + s64 += 2; + d64 += 2; + } +} + +static +void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128) +{ + int i, j; + int g_m; + uint64_t prim_poly, lbit; + gf_internal_t *scratch; + gf_group_tables_t *gt; + uint64_t a128[2]; + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_m = scratch->arg1; + prim_poly = scratch->prim_poly; + + + set_zero(gt->m_table, 0); + a_get_b(gt->m_table, 2, b128, 0); + lbit = 1; + lbit <<= 63; + + for (i = 2; i < (1 << g_m); i <<= 1) { + a_get_b(a128, 0, gt->m_table, 2 * (i >> 1)); + two_x(a128); + a_get_b(gt->m_table, 2 * i, a128, 0); + if (gt->m_table[2 * (i >> 1)] & lbit) gt->m_table[(2 * i) + 1] ^= prim_poly; + for (j = 0; j < i; j++) { + gt->m_table[(2 * i) + (2 * j)] = gt->m_table[(2 * i)] ^ gt->m_table[(2 * j)]; + gt->m_table[(2 * i) + (2 * j) 
+ 1] = gt->m_table[(2 * i) + 1] ^ gt->m_table[(2 * j) + 1]; + } + } + return; +} + +void +gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + int i; + /* index_r, index_m, total_m (if g_r > g_m) */ + int i_r, i_m, t_m; + int mask_m, mask_r; + int g_m, g_r; + uint64_t p_i[2], a[2]; + gf_internal_t *scratch; + gf_group_tables_t *gt; + + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_m = scratch->arg1; + g_r = scratch->arg2; + + mask_m = (1 << g_m) - 1; + mask_r = (1 << g_r) - 1; + + if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) { + gf_w128_group_m_init(gf, b128); + } + + p_i[0] = 0; + p_i[1] = 0; + a[0] = a128[0]; + a[1] = a128[1]; + + t_m = 0; + i_r = 0; + + /* Top 64 bits */ + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[0] >> (i * g_m)) & mask_m; + i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; + p_i[0] <<= g_m; + p_i[0] ^= (p_i[1] >> (64-g_m)); + p_i[1] <<= g_m; + p_i[0] ^= gt->m_table[2 * i_m]; + p_i[1] ^= gt->m_table[(2 * i_m) + 1]; + t_m += g_m; + if (t_m == g_r) { + p_i[1] ^= gt->r_table[i_r]; + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[1] >> (i * g_m)) & mask_m; + i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; + p_i[0] <<= g_m; + p_i[0] ^= (p_i[1] >> (64-g_m)); + p_i[1] <<= g_m; + p_i[0] ^= gt->m_table[2 * i_m]; + p_i[1] ^= gt->m_table[(2 * i_m) + 1]; + t_m += g_m; + if (t_m == g_r) { + p_i[1] ^= gt->r_table[i_r]; + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + c128[0] = p_i[0]; + c128[1] = p_i[1]; +} + +static +void +gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + int i; + int i_r, i_m, t_m; + int mask_m, mask_r; + int g_m, g_r; + uint64_t p_i[2], a[2]; + gf_internal_t *scratch; + gf_group_tables_t *gt; + gf_region_data rd; + uint64_t *a128, *c128, *top; + + /* We only do this to check on alignment. 
*/ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_m = scratch->arg1; + g_r = scratch->arg2; + + mask_m = (1 << g_m) - 1; + mask_r = (1 << g_r) - 1; + + if (val[0] != gt->m_table[2] || val[1] != gt->m_table[3]) { + gf_w128_group_m_init(gf, val); + } + + a128 = (uint64_t *) src; + c128 = (uint64_t *) dest; + top = (uint64_t *) rd.d_top; + + while (c128 < top) { + p_i[0] = 0; + p_i[1] = 0; + a[0] = a128[0]; + a[1] = a128[1]; + + t_m = 0; + i_r = 0; + + /* Top 64 bits */ + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[0] >> (i * g_m)) & mask_m; + i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; + p_i[0] <<= g_m; + p_i[0] ^= (p_i[1] >> (64-g_m)); + p_i[1] <<= g_m; + + p_i[0] ^= gt->m_table[2 * i_m]; + p_i[1] ^= gt->m_table[(2 * i_m) + 1]; + t_m += g_m; + if (t_m == g_r) { + p_i[1] ^= gt->r_table[i_r]; + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[1] >> (i * g_m)) & mask_m; + i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; + p_i[0] <<= g_m; + p_i[0] ^= (p_i[1] >> (64-g_m)); + p_i[1] <<= g_m; + p_i[0] ^= gt->m_table[2 * i_m]; + p_i[1] ^= gt->m_table[(2 * i_m) + 1]; + t_m += g_m; + if (t_m == g_r) { + p_i[1] ^= gt->r_table[i_r]; + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + if (xor) { + c128[0] ^= p_i[0]; + c128[1] ^= p_i[1]; + } else { + c128[0] = p_i[0]; + c128[1] = p_i[1]; + } + a128 += 2; + c128 += 2; + } +} + +/* a^-1 -> b */ + void +gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) +{ + uint64_t e_i[2], e_im1[2], e_ip1[2]; + uint64_t d_i, d_im1, d_ip1; + uint64_t y_i[2], y_im1[2], y_ip1[2]; + uint64_t c_i[2]; + uint64_t *b; + uint64_t one = 1; + + /* This needs to return some sort of error (in b128?) */ + if (a128[0] == 0 && a128[1] == 0) return; + + b = (uint64_t *) b128; + + e_im1[0] = 0; + e_im1[1] = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i[0] = a128[0]; + e_i[1] = a128[1]; + d_im1 = 128; + + //Allen: I think d_i starts at 63 here, and checks each bit of a, starting at MSB, looking for the first nonzero bit + //so d_i should be 0 if this half of a is all 0s, otherwise it should be the position from right of the first-from-left zero bit of this half of a. + //BUT if d_i is 0 at end we won't know yet if the rightmost bit of this half is 1 or not + + for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ; + + //Allen: this is testing just the first half of the stop condition above, so if it holds we know we did not find a nonzero bit yet + + if (!((one << d_i) & e_i[0])) { + + //Allen: this is doing the same thing on the other half of a. In other words, we're still searching for a nonzero bit of a. + // but not bothering to test if d_i hits zero, which is fine because we've already tested for a=0. + + for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ; + + } else { + + //Allen: if a 1 was found in more-significant half of a, make d_i the ACTUAL index of the first nonzero bit in the entire a. 
+ + d_i += 64; + } + y_i[0] = 0; + y_i[1] = 1; + y_im1[0] = 0; + y_im1[1] = 0; + + while (!(e_i[0] == 0 && e_i[1] == 1)) { + + e_ip1[0] = e_im1[0]; + e_ip1[1] = e_im1[1]; + d_ip1 = d_im1; + c_i[0] = 0; + c_i[1] = 0; + + while (d_ip1 >= d_i) { + if ((d_ip1 - d_i) >= 64) { + c_i[0] ^= (one << ((d_ip1 - d_i) - 64)); + e_ip1[0] ^= (e_i[1] << ((d_ip1 - d_i) - 64)); + } else { + c_i[1] ^= (one << (d_ip1 - d_i)); + e_ip1[0] ^= (e_i[0] << (d_ip1 - d_i)); + if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i))); + e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i)); + } + d_ip1--; + if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; } + while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--; + while (d_ip1 < 64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--; + } + gf->multiply.w128(gf, c_i, y_i, y_ip1); + y_ip1[0] ^= y_im1[0]; + y_ip1[1] ^= y_im1[1]; + + y_im1[0] = y_i[0]; + y_im1[1] = y_i[1]; + + y_i[0] = y_ip1[0]; + y_i[1] = y_ip1[1]; + + e_im1[0] = e_i[0]; + e_im1[1] = e_i[1]; + d_im1 = d_i; + e_i[0] = e_ip1[0]; + e_i[1] = e_ip1[1]; + d_i = d_ip1; + } + + b[0] = y_i[0]; + b[1] = y_i[1]; + return; +} + + void +gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + uint64_t d[2]; + gf->inverse.w128(gf, b128, d); + gf->multiply.w128(gf, a128, d, c128); + return; +} + + void +gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128) +{ + uint64_t one128[2]; + one128[0] = 0; + one128[1] = 1; + gf->divide.w128(gf, one128, a128, b128); + return; +} + + +static + void +gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t a0 = a[1]; + uint64_t a1 = a[0]; + uint64_t c0, c1, d, tmp; + uint64_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w64(base_gf, a1); + c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w64(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w64(base_gf, a1); + a0inv = base_gf->inverse.w64(base_gf, a0); + + d = base_gf->multiply.w64(base_gf, a1, a0inv); + + tmp = (base_gf->multiply.w64(base_gf, a1, a0inv) ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly); + tmp = base_gf->inverse.w64(base_gf, tmp); + + d = base_gf->multiply.w64(base_gf, d, tmp); + + c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv); + c1 = base_gf->multiply.w64(base_gf, d, a1inv); + } + inv[0] = c1; + inv[1] = c0; +} + +static + void +gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t b0 = b[1]; + uint64_t b1 = b[0]; + uint64_t a0 = a[1]; + uint64_t a1 = a[0]; + uint64_t a1b1; + + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + rv[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + rv[0] = base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + base_gf->multiply.w64(base_gf, a1b1, h->prim_poly); +} + +static + void +gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t b0 = val[1]; + uint64_t b1 = val[0]; + uint64_t *s64, *d64; + uint64_t *top; + uint64_t a0, a1, a1b1; + gf_region_data rd; + + if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, 
bytes, 0, xor, 8); + + s64 = rd.s_start; + d64 = rd.d_start; + top = rd.d_top; + + if (xor) { + while (d64 < top) { + a1 = s64[0]; + a0 = s64[1]; + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + d64[1] ^= (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + d64[0] ^= (base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + base_gf->multiply.w64(base_gf, a1b1, h->prim_poly)); + s64 += 2; + d64 += 2; + } + } else { + while (d64 < top) { + a1 = s64[0]; + a0 = s64[1]; + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + d64[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + d64[0] = (base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + base_gf->multiply.w64(base_gf, a1b1, h->prim_poly)); + s64 += 2; + d64 += 2; + } + } +} + +static +void +gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int + xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; + gf_val_64_t val0 = val[1]; + gf_val_64_t val1 = val[0]; + uint8_t *slow, *shigh; + uint8_t *dlow, *dhigh, *top; + int sub_reg_size; + gf_region_data rd; + + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64); + gf_w128_multiply_region_from_single(gf, src, dest, val, ((char*)rd.s_start-(char*)src), xor); + + slow = (uint8_t *) rd.s_start; + dlow = (uint8_t *) rd.d_start; + top = (uint8_t*) rd.d_top; + sub_reg_size = (top - dlow)/2; + shigh = slow + sub_reg_size; + dhigh = dlow + sub_reg_size; + + base_gf->multiply_region.w64(base_gf, slow, dlow, val0, sub_reg_size, xor); + base_gf->multiply_region.w64(base_gf, shigh, dlow, val1, sub_reg_size, 1); + base_gf->multiply_region.w64(base_gf, slow, dhigh, val1, sub_reg_size, xor); + base_gf->multiply_region.w64(base_gf, shigh, dhigh, val0, sub_reg_size, 1); + base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1 + ), sub_reg_size, 1); + + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((char*)src+bytes)-(char*)rd.s_top, xor); +} + + + static +int gf_w128_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) { + gf->multiply_region.w128 = gf_w128_composite_multiply_region_alt; + } else { + gf->multiply_region.w128 = gf_w128_composite_multiply_region; + } + + gf->multiply.w128 = gf_w128_composite_multiply; + gf->divide.w128 = gf_w128_divide_from_inverse; + gf->inverse.w128 = gf_w128_composite_inverse; + + return 1; +} + +static +int gf_w128_cfm_init(gf_t *gf) +{ + gf_internal_t * h = gf->scratch; + if (h->sse & GF_SSE4_PCLMUL) { + return 0; + } else { + gf->inverse.w128 = gf_w128_euclid; + gf->multiply.w128 = gf_w128_clm_multiply; + gf->multiply_region.w128 = gf_w128_clm_multiply_region_from_single; + return 1; + } +} + +static +int gf_w128_shift_init(gf_t *gf) +{ + gf->multiply.w128 = gf_w128_shift_multiply; + gf->inverse.w128 = gf_w128_euclid; + gf->multiply_region.w128 = gf_w128_multiply_region_from_single; + return 1; +} + + static +int gf_w128_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + h = (gf_internal_t *) gf->scratch; + + if (h->mult_type == GF_MULT_BYTWO_p) { + gf->multiply.w128 = gf_w128_bytwo_p_multiply; + /*gf->multiply.w128 = gf_w128_sse_bytwo_p_multiply;*/ + /* John: the sse function is slower.*/ + } else { + gf->multiply.w128 = gf_w128_bytwo_b_multiply; + /*gf->multiply.w128 = gf_w128_sse_bytwo_b_multiply; +Ben: This sse function is also slower. 
*/ + } + gf->inverse.w128 = gf_w128_euclid; + gf->multiply_region.w128 = gf_w128_bytwo_b_multiply_region; + return 1; +} + +/* + * Because the prim poly is only 8 bits and we are limiting g_r to 16, I do not need the high 64 + * bits in all of these numbers. + */ + static +void gf_w128_group_r_init(gf_t *gf) +{ + int i, j; + int g_r; + uint64_t pp; + gf_internal_t *scratch; + gf_group_tables_t *gt; + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_r = scratch->arg2; + pp = scratch->prim_poly; + + gt->r_table[0] = 0; + for (i = 1; i < (1 << g_r); i++) { + gt->r_table[i] = 0; + for (j = 0; j < g_r; j++) { + if (i & (1 << j)) { + gt->r_table[i] ^= (pp << j); + } + } + } + return; +} + + static +int gf_w128_split_init(gf_t *gf) +{ + struct gf_w128_split_4_128_data *sd4; + struct gf_w128_split_8_128_data *sd8; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + gf->multiply.w128 = gf_w128_bytwo_p_multiply; + if((h->sse & GF_SSE4_PCLMUL) && !(h->region_type & GF_REGION_NOSSE)){ + gf->multiply.w128 = gf_w128_clm_multiply; + } + + gf->inverse.w128 = gf_w128_euclid; + + if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) { + sd8 = (struct gf_w128_split_8_128_data *) h->private; + sd8->last_value[0] = 0; + sd8->last_value[1] = 0; + gf->multiply_region.w128 = gf_w128_split_8_128_multiply_region; + } else { + sd4 = (struct gf_w128_split_4_128_data *) h->private; + sd4->last_value[0] = 0; + sd4->last_value[1] = 0; + if((h->region_type & GF_REGION_ALTMAP)) + { + #ifdef INTEL_SSE4 + if(!(h->region_type & GF_REGION_NOSSE)) + gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region; + else + return 0; + #else + return 0; + #endif + } + else { + if(h->sse & GF_SSE4) { + if(!(h->region_type & GF_REGION_NOSSE)) + gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region; + else + gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; + } else { + gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; + } + } + } + return 1; +} + + +static +int gf_w128_group_init(gf_t *gf) +{ + gf_internal_t *scratch; + gf_group_tables_t *gt; + int g_r, size_r; + + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_r = scratch->arg2; + size_r = (1 << g_r); + + gt->r_table = scratch->private + (2 * sizeof(uint64_t *)); + gt->m_table = gt->r_table + size_r; + gt->m_table[2] = 0; + gt->m_table[3] = 0; + + gf->multiply.w128 = gf_w128_group_multiply; + gf->inverse.w128 = gf_w128_euclid; + gf->multiply_region.w128 = gf_w128_group_multiply_region; + + gf_w128_group_r_init(gf); + + return 1; +} + +void gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) +{ + gf_val_128_t s; + + s = (gf_val_128_t) start; + s += (index * 2); + memcpy(rv, s, 16); +} + +static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) +{ + int i, blocks; + uint64_t *r64, tmp; + uint8_t *r8; + gf_region_data rd; + + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256); + r64 = (uint64_t *) start; + if ((r64 + index*2 < (uint64_t *) rd.d_start) || + (r64 + index*2 >= (uint64_t *) rd.d_top)) { + memcpy(rv, r64+(index*2), 16); + return; + } + + index -= (((uint64_t *) rd.d_start) - r64)/2; + r64 = (uint64_t *) rd.d_start; + + blocks = index/16; + r64 += (blocks*32); + index %= 16; + r8 = (uint8_t *) r64; + r8 += index; + rv[0] = 0; + rv[1] = 0; + + for (i = 0; i < 8; i++) { + tmp = *r8; + rv[1] |= (tmp << (i*8)); + r8 += 16; + } + + for (i = 0; i < 8; i++) { + tmp = *r8; + 
rv[0] |= (tmp << (i*8)); + r8 += 16; + } + return; +} + + static +void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint64_t *r64; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64); + r64 = (uint64_t *) start; + if ((r64 + index*2 < (uint64_t *) rd.d_start) || + (r64 + index*2 >= (uint64_t *) rd.d_top)) { + memcpy(rv, r64+(index*2), 16); + return; + } + index -= (((uint64_t *) rd.d_start) - r64)/2; + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + rv[1] = h->base_gf->extract_word.w64(h->base_gf, r8, sub_size, index); + rv[0] = h->base_gf->extract_word.w64(h->base_gf, r8+sub_size, sub_size, index); + + return; +} + +int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + int size_m, size_r; + if (divide_type==GF_DIVIDE_MATRIX) return 0; + + switch(mult_type) + { + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_SHIFT: + return sizeof(gf_internal_t); + break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t); + break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: + if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64; + } else if ((arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8) || mult_type == GF_MULT_DEFAULT) { + return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64; + } + return 0; + break; + case GF_MULT_GROUP: + /* JSP We've already error checked the arguments. */ + size_m = (1 << arg1) * 2 * sizeof(uint64_t); + size_r = (1 << arg2) * 2 * sizeof(uint64_t); + /* + * two pointers prepend the table data for structure + * because the tables are of dynamic size + */ + return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *); + break; + case GF_MULT_COMPOSITE: + if (arg1 == 2) { + return sizeof(gf_internal_t) + 4; + } else { + return 0; + } + break; + + default: + return 0; + } +} + +int gf_w128_init(gf_t *gf) +{ + gf_internal_t *h; + int no_default_flag = 0; + + h = (gf_internal_t *) gf->scratch; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */ + } + if (no_default_flag == 1) { + fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n"); + return 0; + } + } + + gf->multiply.w128 = NULL; + gf->divide.w128 = NULL; + gf->inverse.w128 = NULL; + gf->multiply_region.w128 = NULL; + switch(h->mult_type) { + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if (gf_w128_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w128_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w128_shift_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_w128_group_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (gf_w128_split_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w128_composite_init(gf) == 0) return 0; break; + default: return 0; + } + + /* Ben: Used to be h->region_type == GF_REGION_ALTMAP, but failed since there + are 
multiple flags in h->region_type */ + if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) { + gf->extract_word.w128 = gf_w128_split_extract_word; + } else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) { + gf->extract_word.w128 = gf_w128_composite_extract_word; + } else { + gf->extract_word.w128 = gf_w128_extract_word; + } + + if (h->divide_type == GF_DIVIDE_EUCLID) { + gf->divide.w128 = gf_w128_divide_from_inverse; + } + + if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) { + gf->divide.w128 = gf_w128_divide_from_inverse; + } + if (gf->inverse.w128 == NULL && gf->divide.w128 != NULL) { + gf->inverse.w128 = gf_w128_inverse_from_divide; + } + return 1; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w16.c b/src/erasure-code/jerasure/gf-complete/src/gf_w16.c new file mode 100644 index 000000000000..f1fb6501269c --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w16.c @@ -0,0 +1,2489 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w16.c + * + * Routines for 16-bit Galois fields + */ + +#include "gf_int.h" +#include +#include + +#define GF_FIELD_WIDTH (16) +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) +#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1 + +#define GF_BASE_FIELD_WIDTH (8) +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) + +struct gf_w16_logtable_data { + uint16_t log_tbl[GF_FIELD_SIZE]; + uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; + uint16_t inv_tbl[GF_FIELD_SIZE]; + uint16_t *d_antilog; +}; + +struct gf_w16_zero_logtable_data { + int log_tbl[GF_FIELD_SIZE]; + uint16_t _antilog_tbl[GF_FIELD_SIZE * 4]; + uint16_t *antilog_tbl; + uint16_t inv_tbl[GF_FIELD_SIZE]; +}; + +struct gf_w16_lazytable_data { + uint16_t log_tbl[GF_FIELD_SIZE]; + uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; + uint16_t inv_tbl[GF_FIELD_SIZE]; + uint16_t *d_antilog; + uint16_t lazytable[GF_FIELD_SIZE]; +}; + +struct gf_w16_bytwo_data { + uint64_t prim_poly; + uint64_t mask1; + uint64_t mask2; +}; + +struct gf_w16_split_8_8_data { + uint16_t tables[3][256][256]; +}; + +struct gf_w16_group_4_4_data { + uint16_t reduce[16]; + uint16_t shift[16]; +}; + +struct gf_w16_composite_data { + uint8_t *mult_table; +}; + +#define AB2(ip, am1 ,am2, b, t1, t2) {\ + t1 = (b << 1) & am1;\ + t2 = b & am2; \ + t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ + b = (t1 ^ (t2 & ip));} + +#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ + t2 = _mm_and_si128(va, m2); \ + t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } + +#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); } + +#define GF_FIRST_BIT (1 << 15) +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? 
(((p) << 1) ^ h->prim_poly) : (p) << 1) + +static +inline +gf_val_32_t gf_w16_inverse_from_divide (gf_t *gf, gf_val_32_t a) +{ + return gf->divide.w32(gf, 1, a); +} + +static +inline +gf_val_32_t gf_w16_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + b = gf->inverse.w32(gf, b); + return gf->multiply.w32(gf, a, b); +} + +static +void +gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + *d16 ^= gf->multiply.w32(gf, val, *s16); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + *d16 = gf->multiply.w32(gf, val, *s16); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +} + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, 
src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = 
_mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +static +inline +gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b) +{ + gf_val_32_t e_i, e_im1, e_ip1; + gf_val_32_t d_i, d_im1, d_ip1; + gf_val_32_t y_i, y_im1, y_ip1; + gf_val_32_t c_i; + + if (b == 0) return -1; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i = b; + d_im1 = 16; + for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ; + y_i = 1; + y_im1 = 0; + + while (e_i != 1) { + + e_ip1 = e_im1; + d_ip1 = d_im1; + c_i = 0; + + while (d_ip1 >= d_i) { + c_i ^= (1 << (d_ip1 - d_i)); + e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; + while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; + } + + y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); + y_im1 = y_i; + y_i = y_ip1; + + e_im1 = e_i; + d_im1 = d_i; + e_i = e_ip1; + d_i = d_ip1; + } + + return y_i; +} + +static +gf_val_32_t gf_w16_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint16_t *r16, rv; + + r16 = (uint16_t *) start; + rv = r16[index]; + return rv; +} + +static +gf_val_32_t gf_w16_composite_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint16_t a, b, *r16; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r16 = (uint16_t *) start; + if (r16 + index < (uint16_t *) rd.d_start) return r16[index]; + if (r16 + index >= (uint16_t *) rd.d_top) return r16[index]; + index -= (((uint16_t *) rd.d_start) - r16); + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); + b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); + return (a | (b << 8)); +} + +static +gf_val_32_t gf_w16_split_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint16_t *r16, rv; + uint8_t *r8; + gf_region_data rd; + + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r16 = (uint16_t *) start; + if (r16 + index < (uint16_t *) rd.d_start) return r16[index]; + if (r16 + index >= (uint16_t *) rd.d_top) return r16[index]; + index -= (((uint16_t *) rd.d_start) - r16); + r8 = (uint8_t *) rd.d_start; + r8 += ((index & 0xfffffff0)*2); + r8 += (index & 0xf); + rv = (*r8 << 8); + r8 += 16; + rv |= *r8; + return rv; +} + +static +inline +gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b) +{ + return gf_bitmatrix_inverse(b, 16, ((gf_internal_t *) (gf->scratch))->prim_poly); +} + +/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only + include it for completeness. It does have the feature that it requires no + extra memory. + */ + +static +inline +gf_val_32_t +gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2. 
Where + z is equal to the number of zeros after the leading 1 + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 2 bytes. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result.*/ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
*/ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + +#endif + return rv; +} + + +static +inline + gf_val_32_t +gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t product, i, pp, a, b; + gf_internal_t *h; + + a = a16; + b = b16; + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + product = 0; + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (1 << i)) product ^= (b << i); + } + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { + if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); + } + return product; +} + +static +int gf_w16_shift_init(gf_t *gf) +{ + gf->multiply.w32 = gf_w16_shift_multiply; + return 1; +} + +static +int gf_w16_cfm_init(gf_t *gf) +{ +#if defined(INTEL_SSE4_PCLMUL) + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /*Ben: Determining how many reductions to do */ + + if ((0xfe00 & h->prim_poly) == 0) { + gf->multiply.w32 = gf_w16_clm_multiply_2; + gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_2; + } else if((0xf000 & h->prim_poly) == 0) { + gf->multiply.w32 = gf_w16_clm_multiply_3; + gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_3; + } else if ((0xe000 & h->prim_poly) == 0) { + gf->multiply.w32 = gf_w16_clm_multiply_4; + gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_4; + } else { + return 0; + } + return 1; +#endif + + return 0; +} + +/* KMG: GF_MULT_LOGTABLE: */ + +static +void +gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint16_t *s16, *d16; + int lv; + struct gf_w16_logtable_data *ltd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + lv = ltd->log_tbl[val]; + + if (xor) { + while (d16 < (uint16_t *) rd.d_top) { + *d16 ^= (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]); + d16++; + s16++; + } + } else { + while (d16 < (uint16_t *) rd.d_top) { + *d16 = (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +} + +static +inline +gf_val_32_t +gf_w16_log_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w16_logtable_data *ltd; + + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (a == 0 || b == 0) ? 
0 : ltd->antilog_tbl[(int) ltd->log_tbl[a] + (int) ltd->log_tbl[b]]; +} + +static +inline +gf_val_32_t +gf_w16_log_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int log_sum = 0; + struct gf_w16_logtable_data *ltd; + + if (a == 0 || b == 0) return 0; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + + log_sum = (int) ltd->log_tbl[a] - (int) ltd->log_tbl[b]; + return (ltd->d_antilog[log_sum]); +} + +static +gf_val_32_t +gf_w16_log_inverse(gf_t *gf, gf_val_32_t a) +{ + struct gf_w16_logtable_data *ltd; + + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (ltd->inv_tbl[a]); +} + +static +int gf_w16_log_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w16_logtable_data *ltd; + int i, b; + int check = 0; + + h = (gf_internal_t *) gf->scratch; + ltd = h->private; + + for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) + ltd->log_tbl[i] = 0; + ltd->d_antilog = ltd->antilog_tbl + GF_MULT_GROUP_SIZE; + + b = 1; + for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { + if (ltd->log_tbl[b] != 0) check = 1; + ltd->log_tbl[b] = i; + ltd->antilog_tbl[i] = b; + ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = b; + b <<= 1; + if (b & GF_FIELD_SIZE) { + b = b ^ h->prim_poly; + } + } + + /* If you can't construct the log table, there's a problem. This code is used for + some other implementations (e.g. in SPLIT), so if the log table doesn't work in + that instance, use CARRY_FREE / SHIFT instead. */ + + if (check) { + if (h->mult_type != GF_MULT_LOG_TABLE) { + +#if defined(INTEL_SSE4_PCLMUL) + return gf_w16_cfm_init(gf); +#endif + return gf_w16_shift_init(gf); + } else { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + } + + ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ + ltd->inv_tbl[1] = 1; + for (i = 2; i < GF_FIELD_SIZE; i++) { + ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]]; + } + + gf->inverse.w32 = gf_w16_log_inverse; + gf->divide.w32 = gf_w16_log_divide; + gf->multiply.w32 = gf_w16_log_multiply; + gf->multiply_region.w32 = gf_w16_log_multiply_region; + + return 1; +} + +/* JSP: GF_MULT_SPLIT_TABLE: Using 8 multiplication tables to leverage SSE instructions. +*/ + + +/* Ben: Does alternate mapping multiplication using a split table in the + lazy method without sse instructions*/ + +static +void +gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t i, j, c, prod; + uint8_t *s8, *d8, *top; + uint16_t table[4][16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + /*Ben: Constructs lazy multiplication table*/ + + for (j = 0; j < 16; j++) { + for (i = 0; i < 4; i++) { + c = (j << (i*4)); + table[i][j] = gf->multiply.w32(gf, c, val); + } + } + + /*Ben: s8 is the start of source, d8 is the start of dest, top is end of dest region. */ + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + + + while (d8 < top) { + + /*Ben: Multiplies across 16 two byte quantities using alternate mapping + high bits are on the left, low bits are on the right. */ + + for (j=0;j<16;j++) { + + /*Ben: If the xor flag is set, the product should include what is in dest */ + prod = (xor) ? 
((uint16_t)(*d8)<<8) ^ *(d8+16) : 0; + + /*Ben: xors all 4 table lookups into the product variable*/ + + prod ^= ((table[0][*(s8+16)&0xf]) ^ + (table[1][(*(s8+16)&0xf0)>>4]) ^ + (table[2][*(s8)&0xf]) ^ + (table[3][(*(s8)&0xf0)>>4])); + + /*Ben: Stores product in the destination and moves on*/ + + *d8 = (uint8_t)(prod >> 8); + *(d8+16) = (uint8_t)(prod & 0x00ff); + s8++; + d8++; + } + s8+=16; + d8+=16; + } + gf_do_final_region_alignment(&rd); +} + +static + void +gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t i, j, a, c, prod; + uint16_t *s16, *d16, *top; + uint16_t table[4][16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + for (j = 0; j < 16; j++) { + for (i = 0; i < 4; i++) { + c = (j << (i*4)); + table[i][j] = gf->multiply.w32(gf, c, val); + } + } + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + top = (uint16_t *) rd.d_top; + + while (d16 < top) { + a = *s16; + prod = (xor) ? *d16 : 0; + for (i = 0; i < 4; i++) { + prod ^= table[i][a&0xf]; + a >>= 4; + } + *d16 = prod; + s16++; + d16++; + } +} + +static +void +gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t j, k, v, a, prod, *s64, *d64, *top64; + gf_internal_t *h; + uint64_t htable[256], ltable[256]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + + v = val; + ltable[0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) ltable[k^j] = (v ^ ltable[k]); + v = GF_MULTBY_TWO(v); + } + htable[0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) htable[k^j] = (v ^ htable[k]); + v = GF_MULTBY_TWO(v); + } + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top64 = (uint64_t *) rd.d_top; + +/* Does Unrolling Matter? -- Doesn't seem to. + while (d64 != top64) { + a = *s64; + + prod = htable[a >> 56]; + a <<= 8; + prod ^= ltable[a >> 56]; + a <<= 8; + prod <<= 16; + + prod ^= htable[a >> 56]; + a <<= 8; + prod ^= ltable[a >> 56]; + a <<= 8; + prod <<= 16; + + prod ^= htable[a >> 56]; + a <<= 8; + prod ^= ltable[a >> 56]; + a <<= 8; + prod <<= 16; + + prod ^= htable[a >> 56]; + a <<= 8; + prod ^= ltable[a >> 56]; + prod ^= ((xor) ? *d64 : 0); + *d64 = prod; + s64++; + d64++; + } +*/ + + while (d64 != top64) { + a = *s64; + + prod = 0; + for (j = 0; j < 4; j++) { + prod <<= 16; + prod ^= htable[a >> 56]; + a <<= 8; + prod ^= ltable[a >> 56]; + a <<= 8; + } + + //JSP: We can move the conditional outside the while loop, but we need to fully test it to understand which is better. + + prod ^= ((xor) ? 
*d64 : 0); + *d64 = prod; + s64++; + d64++; + } + gf_do_final_region_alignment(&rd); +} + +static void +gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t c; + gf_internal_t *h; + struct gf_w16_lazytable_data *ltd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + ltd = (struct gf_w16_lazytable_data *) h->private; + + ltd->lazytable[0] = 0; + + /* + a = val; + c = 1; + pp = h->prim_poly; + + do { + ltd->lazytable[c] = a; + c <<= 1; + if (c & (1 << GF_FIELD_WIDTH)) c ^= pp; + a <<= 1; + if (a & (1 << GF_FIELD_WIDTH)) a ^= pp; + } while (c != 1); + */ + + for (c = 1; c < GF_FIELD_SIZE; c++) { + ltd->lazytable[c] = gf_w16_shift_multiply(gf, c, val); + } + + gf_two_byte_region_table_multiply(&rd, ltd->lazytable); + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ +#ifdef INTEL_SSSE3 + uint64_t i, j, *s64, *d64, *top64;; + uint64_t c, prod; + uint8_t low[4][16]; + uint8_t high[4][16]; + gf_region_data rd; + + __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4], tta, ttb, lmask; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + for (j = 0; j < 16; j++) { + for (i = 0; i < 4; i++) { + c = (j << (i*4)); + prod = gf->multiply.w32(gf, c, val); + low[i][j] = (prod & 0xff); + high[i][j] = (prod >> 8); + } + } + + for (i = 0; i < 4; i++) { + tlow[i] = _mm_loadu_si128((__m128i *)low[i]); + thigh[i] = _mm_loadu_si128((__m128i *)high[i]); + } + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top64 = (uint64_t *) rd.d_top; + + mask = _mm_set1_epi8 (0x0f); + lmask = _mm_set1_epi16 (0xff); + + if (xor) { + while (d64 != top64) { + + ta = _mm_load_si128((__m128i *) s64); + tb = _mm_load_si128((__m128i *) (s64+2)); + + tta = _mm_srli_epi16(ta, 8); + ttb = _mm_srli_epi16(tb, 8); + tpl = _mm_and_si128(tb, lmask); + tph = _mm_and_si128(ta, lmask); + + tb = _mm_packus_epi16(tpl, tph); + ta = _mm_packus_epi16(ttb, tta); + + ti = _mm_and_si128 (mask, tb); + tph = _mm_shuffle_epi8 (thigh[0], ti); + tpl = _mm_shuffle_epi8 (tlow[0], ti); + + tb = _mm_srli_epi16(tb, 4); + ti = _mm_and_si128 (mask, tb); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); + + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); + + ta = _mm_srli_epi16(ta, 4); + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); + + ta = _mm_unpackhi_epi8(tpl, tph); + tb = _mm_unpacklo_epi8(tpl, tph); + + tta = _mm_load_si128((__m128i *) d64); + ta = _mm_xor_si128(ta, tta); + ttb = _mm_load_si128((__m128i *) (d64+2)); + tb = _mm_xor_si128(tb, ttb); + _mm_store_si128 ((__m128i *)d64, ta); + _mm_store_si128 ((__m128i *)(d64+2), tb); + + d64 += 4; + s64 += 4; + + } + } else { + while (d64 != top64) { + + ta = _mm_load_si128((__m128i *) s64); + tb = 
_mm_load_si128((__m128i *) (s64+2)); + + tta = _mm_srli_epi16(ta, 8); + ttb = _mm_srli_epi16(tb, 8); + tpl = _mm_and_si128(tb, lmask); + tph = _mm_and_si128(ta, lmask); + + tb = _mm_packus_epi16(tpl, tph); + ta = _mm_packus_epi16(ttb, tta); + + ti = _mm_and_si128 (mask, tb); + tph = _mm_shuffle_epi8 (thigh[0], ti); + tpl = _mm_shuffle_epi8 (tlow[0], ti); + + tb = _mm_srli_epi16(tb, 4); + ti = _mm_and_si128 (mask, tb); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); + + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); + + ta = _mm_srli_epi16(ta, 4); + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); + + ta = _mm_unpackhi_epi8(tpl, tph); + tb = _mm_unpacklo_epi8(tpl, tph); + + _mm_store_si128 ((__m128i *)d64, ta); + _mm_store_si128 ((__m128i *)(d64+2), tb); + + d64 += 4; + s64 += 4; + } + } + + gf_do_final_region_alignment(&rd); +#endif +} + +static +void +gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ +#ifdef INTEL_SSSE3 + uint64_t i, j, *s64, *d64, *top64;; + uint64_t c, prod; + uint8_t low[4][16]; + uint8_t high[4][16]; + gf_region_data rd; + __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4]; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + for (j = 0; j < 16; j++) { + for (i = 0; i < 4; i++) { + c = (j << (i*4)); + prod = gf->multiply.w32(gf, c, val); + low[i][j] = (prod & 0xff); + high[i][j] = (prod >> 8); + } + } + + for (i = 0; i < 4; i++) { + tlow[i] = _mm_loadu_si128((__m128i *)low[i]); + thigh[i] = _mm_loadu_si128((__m128i *)high[i]); + } + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top64 = (uint64_t *) rd.d_top; + + mask = _mm_set1_epi8 (0x0f); + + if (xor) { + while (d64 != top64) { + + ta = _mm_load_si128((__m128i *) s64); + tb = _mm_load_si128((__m128i *) (s64+2)); + + ti = _mm_and_si128 (mask, tb); + tph = _mm_shuffle_epi8 (thigh[0], ti); + tpl = _mm_shuffle_epi8 (tlow[0], ti); + + tb = _mm_srli_epi16(tb, 4); + ti = _mm_and_si128 (mask, tb); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); + + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); + + ta = _mm_srli_epi16(ta, 4); + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); + + ta = _mm_load_si128((__m128i *) d64); + tph = _mm_xor_si128(tph, ta); + _mm_store_si128 ((__m128i *)d64, tph); + tb = _mm_load_si128((__m128i *) (d64+2)); + tpl = _mm_xor_si128(tpl, tb); + _mm_store_si128 ((__m128i *)(d64+2), tpl); + + d64 += 4; + s64 += 4; + } + } else { + while (d64 != top64) { + + ta = _mm_load_si128((__m128i *) s64); + tb = _mm_load_si128((__m128i *) (s64+2)); + + ti = _mm_and_si128 (mask, tb); + tph = _mm_shuffle_epi8 (thigh[0], ti); + tpl = _mm_shuffle_epi8 (tlow[0], ti); + + tb = _mm_srli_epi16(tb, 4); + ti = _mm_and_si128 (mask, tb); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); + 
tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); + + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); + + ta = _mm_srli_epi16(ta, 4); + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); + + _mm_store_si128 ((__m128i *)d64, tph); + _mm_store_si128 ((__m128i *)(d64+2), tpl); + + d64 += 4; + s64 += 4; + + } + } + gf_do_final_region_alignment(&rd); + +#endif +} + +uint32_t +gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t alow, blow; + struct gf_w16_split_8_8_data *d8; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + d8 = (struct gf_w16_split_8_8_data *) h->private; + + alow = a & 0xff; + blow = b & 0xff; + a >>= 8; + b >>= 8; + + return d8->tables[0][alow][blow] ^ + d8->tables[1][alow][b] ^ + d8->tables[1][a][blow] ^ + d8->tables[2][a][b]; +} + +static +int gf_w16_split_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w16_split_8_8_data *d8; + int i, j, exp, issse3; + uint32_t p, basep; + + h = (gf_internal_t *) gf->scratch; + +issse3 = 0; +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif + + if (h->arg1 == 8 && h->arg2 == 8) { + d8 = (struct gf_w16_split_8_8_data *) h->private; + basep = 1; + for (exp = 0; exp < 3; exp++) { + for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; + for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0; + d8->tables[exp][1][1] = basep; + for (i = 2; i < 256; i++) { + if (i&1) { + p = d8->tables[exp][i^1][1]; + d8->tables[exp][i][1] = p ^ basep; + } else { + p = d8->tables[exp][i>>1][1]; + d8->tables[exp][i][1] = GF_MULTBY_TWO(p); + } + } + for (i = 1; i < 256; i++) { + p = d8->tables[exp][i][1]; + for (j = 1; j < 256; j++) { + if (j&1) { + d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p; + } else { + d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]); + } + } + } + for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); + } + gf->multiply.w32 = gf_w16_split_8_8_multiply; + gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; + return 1; + + } + + /* We'll be using LOG for multiplication, unless the pp isn't primitive. + In that case, we'll be using SHIFT. 
*/ + + gf_w16_log_init(gf); + + /* Defaults */ + + if (issse3) { + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; + } else { + gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; + } + + + if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) { + gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; + + } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { + if (issse3) { + if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; + else if(h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; + else if(h->region_type & GF_REGION_ALTMAP) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region; + } else { + if(h->region_type & GF_REGION_SSE) + return 0; + else if(h->region_type & GF_REGION_ALTMAP) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; + else + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; + } + } + + return 1; +} + +static +int gf_w16_table_init(gf_t *gf) +{ + gf_w16_log_init(gf); + + gf->multiply_region.w32 = gf_w16_table_lazy_multiply_region; + return 1; +} + +static +void +gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint16_t lv; + int i; + uint16_t *s16, *d16, *top16; + struct gf_w16_zero_logtable_data *ltd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + ltd = (struct gf_w16_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private; + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + top16 = (uint16_t *) rd.d_top; + bytes = top16 - d16; + + lv = ltd->log_tbl[val]; + + if (xor) { + for (i = 0; i < bytes; i++) { + d16[i] ^= (ltd->antilog_tbl[lv + ltd->log_tbl[s16[i]]]); + } + } else { + for (i = 0; i < bytes; i++) { + d16[i] = (ltd->antilog_tbl[lv + ltd->log_tbl[s16[i]]]); + } + } + + /* This isn't necessary. 
*/ + + gf_do_final_region_alignment(&rd); +} + +/* Here -- double-check Kevin */ + +static +inline +gf_val_32_t +gf_w16_log_zero_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w16_zero_logtable_data *ltd; + + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]]; +} + +static +inline +gf_val_32_t +gf_w16_log_zero_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int log_sum = 0; + struct gf_w16_zero_logtable_data *ltd; + + if (a == 0 || b == 0) return 0; + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + + log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE); + return (ltd->antilog_tbl[log_sum]); +} + +static +gf_val_32_t +gf_w16_log_zero_inverse (gf_t *gf, gf_val_32_t a) +{ + struct gf_w16_zero_logtable_data *ltd; + + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (ltd->inv_tbl[a]); +} + +static +inline +gf_val_32_t +gf_w16_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + + prod = 0; + pmask = 0x8000; + amask = 0x8000; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + +static +inline +gf_val_32_t +gf_w16_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, bmask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + bmask = 0x8000; + + while (1) { + if (a & 1) prod ^= b; + a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else { + b <<= 1; + } + } +} + +static +void +gf_w16_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, prod, amask; + gf_region_data rd; + struct gf_w16_bytwo_data *btd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x8000; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x8000; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 = prod; + d64++; + s64++; + } + } + gf_do_final_region_alignment(&rd); +} + +#define BYTWO_P_ONESTEP {\ + SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi16(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } + +#ifdef INTEL_SSE2 +static +void +gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + uint32_t vrev; + __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; + struct gf_w16_bytwo_data *btd; + 
gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + vrev = 0; + for (i = 0; i < 16; i++) { + vrev <<= 1; + if (!(val & (1 << i))) vrev |= 1; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi16(btd->prim_poly&0xffff); + m1 = _mm_set1_epi16((btd->mask1)&0xffff); + m2 = _mm_set1_epi16((btd->mask2)&0xffff); + one = _mm_set1_epi16(1); + + while (d8 < (uint8_t *) rd.d_top) { + prod = _mm_setzero_si128(); + v = _mm_set1_epi16(vrev); + ta = _mm_load_si128((__m128i *) s8); + tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); + d8 += 16; + s8 += 16; + } + gf_do_final_region_alignment(&rd); +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi16(btd->prim_poly&0xffff); + m1 = _mm_set1_epi16((btd->mask1)&0xffff); + m2 = _mm_set1_epi16((btd->mask2)&0xffff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi16(btd->prim_poly&0xffff); + m1 = _mm_set1_epi16((btd->mask1)&0xffff); + m2 = _mm_set1_epi16((btd->mask2)&0xffff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + + +#ifdef INTEL_SSE2 +static +void +gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int itb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + struct gf_w16_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + if (val == 2) { + if (xor) { + gf_w16_bytwo_b_sse_region_2_xor(&rd, btd); + } else { + gf_w16_bytwo_b_sse_region_2_noxor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi16(btd->prim_poly&0xffff); + m1 = _mm_set1_epi16((btd->mask1)&0xffff); + m2 = _mm_set1_epi16((btd->mask2)&0xffff); + + while (d8 < (uint8_t *) rd.d_top) { + va = 
_mm_load_si128 ((__m128i *)(s8)); + vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); + itb = val; + while (1) { + if (itb & 1) vb = _mm_xor_si128(vb, va); + itb >>= 1; + if (itb == 0) break; + SSE_AB2(pp, m1, m2, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + + gf_do_final_region_alignment(&rd); +} +#endif + +static +void +gf_w16_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, tb, prod; + struct gf_w16_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + switch (val) { + case 2: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 3: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 4: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 5: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + default: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + prod = *d64 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + prod = 0 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } + break; + } + gf_do_final_region_alignment(&rd); +} + +static +int gf_w16_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + uint64_t ip, m1, m2; + struct gf_w16_bytwo_data *btd; + + h = (gf_internal_t *) gf->scratch; + btd = (struct gf_w16_bytwo_data *) (h->private); + ip = h->prim_poly & 0xffff; + m1 = 0xfffe; + m2 = 0x8000; + 
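  /* The loop below replicates the 16-bit polynomial and the two masks into every
     16-bit lane of a 64-bit word (mask1 becomes 0xfffefffefffefffe, mask2 becomes
     0x8000800080008000).  AB2()/SSE_AB2() can then multiply four (or eight, with
     SSE) packed 16-bit field elements by two at once: mask2 isolates each lane's
     top bit, mask1 keeps the left shift from spilling into the neighboring lane,
     and the (t2 << 1) - (t2 >> 15) step expands each isolated top bit into a
     full-lane mask that selects the primitive polynomial for reduction. */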
btd->prim_poly = 0; + btd->mask1 = 0; + btd->mask2 = 0; + + while (ip != 0) { + btd->prim_poly |= ip; + btd->mask1 |= m1; + btd->mask2 |= m2; + ip <<= GF_FIELD_WIDTH; + m1 <<= GF_FIELD_WIDTH; + m2 <<= GF_FIELD_WIDTH; + } + + if (h->mult_type == GF_MULT_BYTWO_p) { + gf->multiply.w32 = gf_w16_bytwo_p_multiply; + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region; + #else + gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif + } else { + gf->multiply.w32 = gf_w16_bytwo_b_multiply; + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region; + #else + gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif + } + + return 1; +} + +static +int gf_w16_log_zero_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w16_zero_logtable_data *ltd; + int i, b; + + h = (gf_internal_t *) gf->scratch; + ltd = h->private; + + ltd->log_tbl[0] = (-GF_MULT_GROUP_SIZE) + 1; + + bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl)); + + ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_FIELD_SIZE * 2]); + + b = 1; + for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { + ltd->log_tbl[b] = (uint16_t)i; + ltd->antilog_tbl[i] = (uint16_t)b; + ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = (uint16_t)b; + b <<= 1; + if (b & GF_FIELD_SIZE) { + b = b ^ h->prim_poly; + } + } + ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ + ltd->inv_tbl[1] = 1; + for (i = 2; i < GF_FIELD_SIZE; i++) { + ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]]; + } + + gf->inverse.w32 = gf_w16_log_zero_inverse; + gf->divide.w32 = gf_w16_log_zero_divide; + gf->multiply.w32 = gf_w16_log_zero_multiply; + gf->multiply_region.w32 = gf_w16_log_zero_multiply_region; + return 1; +} + +static +gf_val_32_t +gf_w16_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t b0 = b & 0x00ff; + uint8_t b1 = (b & 0xff00) >> 8; + uint8_t a0 = a & 0x00ff; + uint8_t a1 = (a & 0xff00) >> 8; + uint8_t a1b1; + uint16_t rv; + + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); + return rv; +} + +static +gf_val_32_t +gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + uint8_t b0 = b & 0x00ff; + uint8_t b1 = (b & 0xff00) >> 8; + uint8_t a0 = a & 0x00ff; + uint8_t a1 = (a & 0xff00) >> 8; + uint8_t a1b1, *mt; + uint16_t rv; + struct gf_w16_composite_data *cd; + + cd = (struct gf_w16_composite_data *) h->private; + mt = cd->mult_table; + + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); + + rv = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); + return rv; +} + +/* + * Composite field division trick (explained in 2007 tech report) + * + * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 + * + * let c = b^-1 + * + * c*b = 
(s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) + * + * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 + * + * let d = b1c1 and d+1 = b0c0 + * + * solve s*b1c1+b1c0+b0c1 = 0 + * + * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 + * + * c0 = (d+1)b0^-1 + * c1 = d*b1^-1 + * + * a / b = a * c + */ + +static +gf_val_32_t +gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t a0 = a & 0x00ff; + uint8_t a1 = (a & 0xff00) >> 8; + uint8_t c0, c1, d, tmp; + uint16_t c; + uint8_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w32(base_gf, a1); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w32(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w32(base_gf, a1); + a0inv = base_gf->inverse.w32(base_gf, a0); + + d = base_gf->multiply.w32(base_gf, a1, a0inv); + + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); + tmp = base_gf->inverse.w32(base_gf, tmp); + + d = base_gf->multiply.w32(base_gf, d, tmp); + + c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv); + c1 = base_gf->multiply.w32(base_gf, d, a1inv); + } + + c = c0 | (c1 << 8); + + return c; +} + +static +void +gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t b0 = val & 0x00ff; + uint8_t b1 = (val & 0xff00) >> 8; + uint16_t *s16, *d16, *top; + uint8_t a0, a1, a1b1, *mt; + gf_region_data rd; + struct gf_w16_composite_data *cd; + + cd = (struct gf_w16_composite_data *) h->private; + mt = cd->mult_table; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + + s16 = rd.s_start; + d16 = rd.d_start; + top = rd.d_top; + + if (mt == NULL) { + if (xor) { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } else { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } + } else { + if (xor) { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); + + (*d16) ^= ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } else { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); + + (*d16) = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } + } +} + +static +void +gf_w16_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + 
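+  /* Composite multiplication over GF((2^8)^2) with p(x) = x^2 + s*x + 1:
+     (a1*x + a0) * (b1*x + b0) = (a1*b0 + a0*b1 + s*a1*b1)*x + (a0*b0 + a1*b1),
+     where s is h->prim_poly, an element of the base field.  The five
+     base-field region multiplies below combine the low and high halves of the
+     alternate-mapped region according to that formula. */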
gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t val0 = val & 0x00ff; + uint8_t val1 = (val & 0xff00) >> 8; + gf_region_data rd; + int sub_reg_size; + uint8_t *slow, *shigh; + uint8_t *dlow, *dhigh, *top;; + + /* JSP: I want the two pointers aligned wrt each other on 16 byte + boundaries. So I'm going to make sure that the area on + which the two operate is a multiple of 32. Of course, that + junks up the mapping, but so be it -- that's why we have extract_word.... */ + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + slow = (uint8_t *) rd.s_start; + dlow = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_reg_size = (top - dlow)/2; + shigh = slow + sub_reg_size; + dhigh = dlow + sub_reg_size; + + base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + + gf_do_final_region_alignment(&rd); +} + +static +int gf_w16_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + struct gf_w16_composite_data *cd; + + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w16_composite_data *) h->private; + cd->mult_table = gf_w8_get_mult_table(h->base_gf); + + if (h->region_type & GF_REGION_ALTMAP) { + gf->multiply_region.w32 = gf_w16_composite_multiply_region_alt; + } else { + gf->multiply_region.w32 = gf_w16_composite_multiply_region; + } + + if (cd->mult_table == NULL) { + gf->multiply.w32 = gf_w16_composite_multiply_recursive; + } else { + gf->multiply.w32 = gf_w16_composite_multiply_inline; + } + gf->divide.w32 = NULL; + gf->inverse.w32 = gf_w16_composite_inverse; + + return 1; +} + +static +void +gf_w16_group_4_set_shift_tables(uint16_t *shift, uint16_t val, gf_internal_t *h) +{ + int i, j; + + shift[0] = 0; + for (i = 0; i < 16; i += 2) { + j = (shift[i>>1] << 1); + if (j & (1 << 16)) j ^= h->prim_poly; + shift[i] = j; + shift[i^1] = j^val; + } +} + +static +inline +gf_val_32_t +gf_w16_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint16_t p, l, ind, r, a16; + + struct gf_w16_group_4_4_data *d44; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + d44 = (struct gf_w16_group_4_4_data *) h->private; + gf_w16_group_4_set_shift_tables(d44->shift, b, h); + + a16 = a; + ind = a16 >> 12; + a16 <<= 4; + p = d44->shift[ind]; + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + a16 <<= 4; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + a16 <<= 4; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + return p; +} + +static +void gf_w16_group_4_4_region_multiply(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint16_t p, l, ind, r, a16, p16; + struct gf_w16_group_4_4_data *d44; + gf_region_data rd; + uint16_t *s16, *d16, *top; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + d44 = (struct gf_w16_group_4_4_data *) h->private; + 
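+  /* GROUP 4,4: d44->shift caches val times every 4-bit value and is rebuilt on
+     each call; d44->reduce folds the 4 bits that overflow the top of the
+     16-bit accumulator back through the primitive polynomial.  The loop below
+     then consumes each source word one nibble at a time. */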
gf_w16_group_4_set_shift_tables(d44->shift, val, h); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + top = (uint16_t *) rd.d_top; + + while (d16 < top) { + p = 0; + a16 = *s16; + p16 = (xor) ? *d16 : 0; + ind = a16 >> 12; + a16 <<= 4; + p = d44->shift[ind]; + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + a16 <<= 4; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + a16 <<= 4; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + p ^= p16; + *d16 = p; + d16++; + s16++; + } + gf_do_final_region_alignment(&rd); +} + +static +int gf_w16_group_init(gf_t *gf) +{ + int i, j, p; + struct gf_w16_group_4_4_data *d44; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + d44 = (struct gf_w16_group_4_4_data *) h->private; + d44->reduce[0] = 0; + for (i = 0; i < 16; i++) { + p = 0; + for (j = 0; j < 4; j++) { + if (i & (1 << j)) p ^= (h->prim_poly << j); + } + d44->reduce[p>>16] = (p&0xffff); + } + + gf->multiply.w32 = gf_w16_group_4_4_multiply; + gf->divide.w32 = NULL; + gf->inverse.w32 = NULL; + gf->multiply_region.w32 = gf_w16_group_4_4_region_multiply; + + return 1; +} + +int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + switch(mult_type) + { + case GF_MULT_TABLE: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_lazytable_data) + 64; + break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_bytwo_data); + break; + case GF_MULT_LOG_ZERO: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_zero_logtable_data) + 64; + break; + case GF_MULT_LOG_TABLE: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; + break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: + if (arg1 == 8 && arg2 == 8) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_split_8_8_data) + 64; + } else if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; + } else if (mult_type == GF_MULT_DEFAULT || + (arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; + } + return 0; + break; + case GF_MULT_GROUP: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64; + break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_SHIFT: + return sizeof(gf_internal_t); + break; + case GF_MULT_COMPOSITE: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_composite_data) + 64; + break; + + default: + return 0; + } + return 0; +} + +int gf_w16_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; + } else { + + /* Allen: use the following primitive polynomial to make + carryless multiply work more efficiently for GF(2^16). 
+ + h->prim_poly = 0x1002d; + + The following is the traditional primitive polynomial for GF(2^16) */ + + h->prim_poly = 0x1100b; + } + } + + if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16); + + gf->multiply.w32 = NULL; + gf->divide.w32 = NULL; + gf->inverse.w32 = NULL; + gf->multiply_region.w32 = NULL; + + switch(h->mult_type) { + case GF_MULT_LOG_ZERO: if (gf_w16_log_zero_init(gf) == 0) return 0; break; + case GF_MULT_LOG_TABLE: if (gf_w16_log_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (gf_w16_split_init(gf) == 0) return 0; break; + case GF_MULT_TABLE: if (gf_w16_table_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w16_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w16_shift_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w16_composite_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if (gf_w16_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_w16_group_init(gf) == 0) return 0; break; + default: return 0; + } + if (h->divide_type == GF_DIVIDE_EUCLID) { + gf->divide.w32 = gf_w16_divide_from_inverse; + gf->inverse.w32 = gf_w16_euclid; + } else if (h->divide_type == GF_DIVIDE_MATRIX) { + gf->divide.w32 = gf_w16_divide_from_inverse; + gf->inverse.w32 = gf_w16_matrix; + } + + if (gf->divide.w32 == NULL) { + gf->divide.w32 = gf_w16_divide_from_inverse; + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_euclid; + } + + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_inverse_from_divide; + + if (h->region_type & GF_REGION_ALTMAP) { + if (h->mult_type == GF_MULT_COMPOSITE) { + gf->extract_word.w32 = gf_w16_composite_extract_word; + } else { + gf->extract_word.w32 = gf_w16_split_extract_word; + } + } else if (h->region_type == GF_REGION_CAUCHY) { + gf->multiply_region.w32 = gf_wgen_cauchy_region; + gf->extract_word.w32 = gf_wgen_extract_word; + } else { + gf->extract_word.w32 = gf_w16_extract_word; + } + if (gf->multiply_region.w32 == NULL) { + gf->multiply_region.w32 = gf_w16_multiply_region_from_single; + } + return 1; +} + +/* Inline setup functions */ + +uint16_t *gf_w16_get_log_table(gf_t *gf) +{ + struct gf_w16_logtable_data *ltd; + + if (gf->multiply.w32 == gf_w16_log_multiply) { + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (uint16_t *) ltd->log_tbl; + } + return NULL; +} + +uint16_t *gf_w16_get_mult_alog_table(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w16_logtable_data *ltd; + + h = (gf_internal_t *) gf->scratch; + if (gf->multiply.w32 == gf_w16_log_multiply) { + ltd = (struct gf_w16_logtable_data *) h->private; + return (uint16_t *) ltd->antilog_tbl; + } + return NULL; +} + +uint16_t *gf_w16_get_div_alog_table(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w16_logtable_data *ltd; + + h = (gf_internal_t *) gf->scratch; + if (gf->multiply.w32 == gf_w16_log_multiply) { + ltd = (struct gf_w16_logtable_data *) h->private; + return (uint16_t *) ltd->d_antilog; + } + return NULL; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w32.c b/src/erasure-code/jerasure/gf-complete/src/gf_w32.c new file mode 100644 index 000000000000..1503c72dce90 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w32.c @@ -0,0 +1,2741 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
+ * + * gf_w32.c + * + * Routines for 32-bit Galois fields + */ + + +#include "gf_int.h" +#include +#include + +#define GF_FIELD_WIDTH (32) +#define GF_FIRST_BIT (1 << 31) + +#define GF_BASE_FIELD_WIDTH (16) +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1) + +struct gf_split_2_32_lazy_data { + uint32_t tables[16][4]; + uint32_t last_value; +}; + +struct gf_w32_split_8_8_data { + uint32_t tables[7][256][256]; + uint32_t region_tables[4][256]; + uint32_t last_value; +}; + +struct gf_w32_group_data { + uint32_t *reduce; + uint32_t *shift; + int tshift; + uint64_t rmask; + uint32_t *memory; +}; + +struct gf_split_16_32_lazy_data { + uint32_t tables[2][(1<<16)]; + uint32_t last_value; +}; + +struct gf_split_8_32_lazy_data { + uint32_t tables[4][256]; + uint32_t last_value; +}; + +struct gf_split_4_32_lazy_data { + uint32_t tables[8][16]; + uint32_t last_value; +}; + +struct gf_w32_bytwo_data { + uint64_t prim_poly; + uint64_t mask1; + uint64_t mask2; +}; + +struct gf_w32_composite_data { + uint16_t *log; + uint16_t *alog; +}; + +#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); } + +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); } + +#define AB2(ip, am1 ,am2, b, t1, t2) {\ + t1 = (b << 1) & am1;\ + t2 = b & am2; \ + t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ + b = (t1 ^ (t2 & ip));} + +#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ + t2 = _mm_and_si128(va, m2); \ + t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } + +static +inline +uint32_t gf_w32_inverse_from_divide (gf_t *gf, uint32_t a) +{ + return gf->divide.w32(gf, 1, a); +} + +static +inline +uint32_t gf_w32_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b) +{ + b = gf->inverse.w32(gf, b); + return gf->multiply.w32(gf, a, b); +} + +static +void +gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int +xor) +{ + int i; + uint32_t *s32; + uint32_t *d32; + + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + d32[i] ^= gf->multiply.w32(gf, val, s32[i]); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + d32[i] = gf->multiply.w32(gf, val, s32[i]); + } + } +} + +#if defined(INTEL_SSE4_PCLMUL) + +static +void +gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + + int i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = 
_mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) + +static +void +gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + + int i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + int i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 
(result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +} +#endif + +static +inline +uint32_t gf_w32_euclid (gf_t *gf, uint32_t b) +{ + uint32_t e_i, e_im1, e_ip1; + uint32_t d_i, d_im1, d_ip1; + uint32_t y_i, y_im1, y_ip1; + uint32_t c_i; + + if (b == 0) return -1; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i = b; + d_im1 = 32; + for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ; + y_i = 1; + y_im1 = 0; + + while (e_i != 1) { + + e_ip1 = e_im1; + d_ip1 = d_im1; + c_i = 0; + + while (d_ip1 >= d_i) { + c_i ^= (1 << (d_ip1 - d_i)); + e_ip1 ^= (e_i << (d_ip1 - d_i)); + d_ip1--; + if (e_ip1 == 0) return 0; + while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; + } + + y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); + y_im1 = y_i; + y_i = y_ip1; + + e_im1 = e_i; + d_im1 = d_i; + e_i = e_ip1; + d_i = d_ip1; + } + + return y_i; +} + +static +gf_val_32_t gf_w32_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint32_t *r32, rv; + + r32 = (uint32_t *) start; + rv = r32[index]; + return rv; +} + +static +gf_val_32_t gf_w32_composite_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint32_t a, b, *r32; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r32 = (uint32_t *) start; + if (r32 + index < (uint32_t *) rd.d_start) return r32[index]; + if (r32 + index >= (uint32_t *) rd.d_top) return r32[index]; + index -= (((uint32_t *) rd.d_start) - r32); + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); + b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); + return (a | (b << 16)); +} + +static +gf_val_32_t gf_w32_split_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int i; + uint32_t *r32, rv; + uint8_t *r8; + gf_region_data rd; + + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64); + r32 = (uint32_t *) start; + if (r32 + index < (uint32_t *) rd.d_start) return r32[index]; + if (r32 + index >= (uint32_t *) rd.d_top) return r32[index]; + index -= (((uint32_t *) rd.d_start) - r32); + r8 = (uint8_t *) rd.d_start; + r8 += ((index & 0xfffffff0)*4); + r8 += (index & 0xf); + r8 += 48; + rv =0; + for (i = 0; i < 4; i++) { + rv <<= 8; + rv |= *r8; + r8 -= 16; + } + return rv; +} + + +static +inline +uint32_t gf_w32_matrix (gf_t *gf, uint32_t b) +{ + return gf_bitmatrix_inverse(b, 32, ((gf_internal_t *) (gf->scratch))->prim_poly); +} + +/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only + include it for completeness. It does have the feature that it requires no + extra memory. 
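+   Each set bit of a adds a shifted copy of b into a 64-bit product, which is
+   then reduced from the top bit down with the primitive polynomial.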
+*/ + + + + +static +inline +gf_val_32_t +gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2. Where + z is equal to the number of zeros after the leading 1 + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 4 bytes. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result.*/ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} +static +inline +gf_val_32_t +gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
*/ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} + + +static +inline +uint32_t +gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) +{ + uint64_t product, i, pp, a, b, one; + gf_internal_t *h; + + a = a32; + b = b32; + h = (gf_internal_t *) gf->scratch; + one = 1; + pp = h->prim_poly | (one << 32); + + product = 0; + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (one << i)) product ^= (b << i); + } + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { + if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); + } + return product; +} + + static +int gf_w32_cfm_init(gf_t *gf) +{ + gf->inverse.w32 = gf_w32_euclid; + gf->multiply_region.w32 = gf_w32_multiply_region_from_single; + + /*Ben: We also check to see if the prim poly will work for pclmul */ + /*Ben: Check to see how many reduction steps it will take*/ + +#if defined(INTEL_SSE4_PCLMUL) + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if ((0xfffe0000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_2; + gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_2; + }else if ((0xffc00000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_3; + gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_3; + }else if ((0xfe000000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_4; + gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_4; + } else { + return 0; + } + return 1; + #endif + + return 0; +} + + static +int gf_w32_shift_init(gf_t *gf) +{ + gf->inverse.w32 = gf_w32_euclid; + gf->multiply_region.w32 = gf_w32_multiply_region_from_single; + gf->multiply.w32 = gf_w32_shift_multiply; + return 1; +} + +static + void +gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) +{ + int i; + uint32_t j; + + shift[0] = 0; + + for (i = 1; i < (1 << h->arg1); i <<= 1) { + for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; + if (val & GF_FIRST_BIT) { + val <<= 1; + val ^= h->prim_poly; + } else { + val <<= 1; + } + } +} + + static +void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int leftover, rs; + uint32_t p, l, ind, a32; + int bits_left; + int g_s; + gf_region_data rd; + uint32_t *s32, *d32, *top; + struct gf_w32_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gd = (struct gf_w32_group_data *) h->private; + g_s = h->arg1; + gf_w32_group_set_shift_tables(gd->shift, val, h); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + leftover = 32 % g_s; + if (leftover == 0) leftover = g_s; + + while (d32 < top) { + rs = 32 - leftover; + a32 = *s32; + ind = a32 >> rs; + a32 <<= leftover; + p = gd->shift[ind]; + + bits_left = rs; + rs = 32 - g_s; + + while (bits_left > 0) { + bits_left -= g_s; + ind = a32 >> rs; + a32 <<= g_s; + l = p >> rs; + p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); + } + if (xor) p ^= *d32; + *d32 = p; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + + static +void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint32_t *s32, *d32, *top; + int i; + int leftover; + uint64_t p, l, r; + uint32_t a32, ind; + int g_s, g_r; + 
struct gf_w32_group_data *gd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + g_r = h->arg2; + gd = (struct gf_w32_group_data *) h->private; + gf_w32_group_set_shift_tables(gd->shift, val, h); + + leftover = GF_FIELD_WIDTH % g_s; + if (leftover == 0) leftover = g_s; + + gd = (struct gf_w32_group_data *) h->private; + gf_w32_group_set_shift_tables(gd->shift, val, h); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + while (d32 < top) { + a32 = *s32; + ind = a32 >> (GF_FIELD_WIDTH - leftover); + p = gd->shift[ind]; + p <<= g_s; + a32 <<= leftover; + + i = (GF_FIELD_WIDTH - leftover); + while (i > g_s) { + ind = a32 >> (GF_FIELD_WIDTH-g_s); + p ^= gd->shift[ind]; + a32 <<= g_s; + p <<= g_s; + i -= g_s; + } + + ind = a32 >> (GF_FIELD_WIDTH-g_s); + p ^= gd->shift[ind]; + + for (i = gd->tshift ; i >= 0; i -= g_r) { + l = p & (gd->rmask << i); + r = gd->reduce[l >> (i+32)]; + r <<= (i); + p ^= r; + } + + if (xor) p ^= *d32; + *d32 = p; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + +static +inline +gf_val_32_t +gf_w32_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int leftover, rs; + uint32_t p, l, ind, a32; + int bits_left; + int g_s; + + struct gf_w32_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + + gd = (struct gf_w32_group_data *) h->private; + gf_w32_group_set_shift_tables(gd->shift, b, h); + + leftover = 32 % g_s; + if (leftover == 0) leftover = g_s; + + rs = 32 - leftover; + a32 = a; + ind = a32 >> rs; + a32 <<= leftover; + p = gd->shift[ind]; + + bits_left = rs; + rs = 32 - g_s; + + while (bits_left > 0) { + bits_left -= g_s; + ind = a32 >> rs; + a32 <<= g_s; + l = p >> rs; + p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); + } + return p; +} + +static +inline +gf_val_32_t +gf_w32_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t p, l, ind, a32; + + struct gf_w32_group_data *d44; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + d44 = (struct gf_w32_group_data *) h->private; + gf_w32_group_set_shift_tables(d44->shift, b, h); + + p = 0; + a32 = a; + ind = a32 >> 28; + a32 <<= 4; + p = d44->shift[ind]; + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + ind = a32 >> 28; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + return p; +} + +static +inline +gf_val_32_t +gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int i; + int leftover; + uint64_t p, l, r; + uint32_t a32, ind; + int g_s, g_r; + struct gf_w32_group_data *gd; + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + g_r = h->arg2; + gd = (struct gf_w32_group_data *) h->private; + 
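+  /* General GROUP g_s,g_r multiply: gd->shift holds b times every g_s-bit
+     value and is rebuilt on each call; a is consumed g_s bits at a time into
+     the 64-bit accumulator p, and the bits of p above position 31 are then
+     folded back g_r bits at a time through gd->reduce. */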
gf_w32_group_set_shift_tables(gd->shift, b, h); + + leftover = GF_FIELD_WIDTH % g_s; + if (leftover == 0) leftover = g_s; + + a32 = a; + ind = a32 >> (GF_FIELD_WIDTH - leftover); + p = gd->shift[ind]; + p <<= g_s; + a32 <<= leftover; + + i = (GF_FIELD_WIDTH - leftover); + while (i > g_s) { + ind = a32 >> (GF_FIELD_WIDTH-g_s); + p ^= gd->shift[ind]; + a32 <<= g_s; + p <<= g_s; + i -= g_s; + } + + ind = a32 >> (GF_FIELD_WIDTH-g_s); + p ^= gd->shift[ind]; + + for (i = gd->tshift ; i >= 0; i -= g_r) { + l = p & (gd->rmask << i); + r = gd->reduce[l >> (i+32)]; + r <<= (i); + p ^= r; + } + return p; +} + +static +inline +gf_val_32_t +gf_w32_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, bmask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + bmask = 0x80000000; + + while (1) { + if (a & 1) prod ^= b; + a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else { + b <<= 1; + } + } +} + +static +inline +gf_val_32_t +gf_w32_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + + prod = 0; + pmask = 0x80000000; + amask = 0x80000000; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + +static +void +gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, prod, amask; + gf_region_data rd; + struct gf_w32_bytwo_data *btd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x80000000; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x80000000; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 = prod; + d64++; + s64++; + } + } + gf_do_final_region_alignment(&rd); +} + +#define BYTWO_P_ONESTEP {\ + SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi32(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } + +#ifdef INTEL_SSE2 +static +void +gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + uint32_t vrev; + __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; + struct gf_w32_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + vrev = 0; + for (i = 0; i < 32; i++) { + vrev <<= 1; + if 
(!(val & (1 << i))) vrev |= 1; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi32(btd->prim_poly&0xffffffff); + m1 = _mm_set1_epi32((btd->mask1)&0xffffffff); + m2 = _mm_set1_epi32((btd->mask2)&0xffffffff); + one = _mm_set1_epi32(1); + + while (d8 < (uint8_t *) rd.d_top) { + prod = _mm_setzero_si128(); + v = _mm_set1_epi32(vrev); + ta = _mm_load_si128((__m128i *) s8); + tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); + d8 += 16; + s8 += 16; + } + gf_do_final_region_alignment(&rd); +} +#endif + +static +void +gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, tb, prod; + struct gf_w32_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + switch (val) { + case 2: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 3: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + case 4: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 5: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + default: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + prod = *d64 
; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + prod = 0 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } + break; + } + gf_do_final_region_alignment(&rd); +} + +#ifdef INTEL_SSE2 +static +void +gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi32(btd->prim_poly&0xffffffff); + m1 = _mm_set1_epi32((btd->mask1)&0xffffffff); + m2 = _mm_set1_epi32((btd->mask2)&0xffffffff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi32(btd->prim_poly&0xffffffff); + m1 = _mm_set1_epi32((btd->mask1)&0xffffffff); + m2 = _mm_set1_epi32((btd->mask2)&0xffffffff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + + +#ifdef INTEL_SSE2 +static +void +gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint32_t itb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + struct gf_w32_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + if (val == 2) { + if (xor) { + gf_w32_bytwo_b_sse_region_2_xor(&rd, btd); + } else { + gf_w32_bytwo_b_sse_region_2_noxor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi32(btd->prim_poly&0xffffffff); + m1 = _mm_set1_epi32((btd->mask1)&0xffffffff); + m2 = _mm_set1_epi32((btd->mask2)&0xffffffff); + + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = (!xor) ? 
_mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); + itb = val; + while (1) { + if (itb & 1) vb = _mm_xor_si128(vb, va); + itb >>= 1; + if (itb == 0) break; + SSE_AB2(pp, m1, m2, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + + gf_do_final_region_alignment(&rd); +} +#endif + +static +int gf_w32_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + uint64_t ip, m1, m2; + struct gf_w32_bytwo_data *btd; + + h = (gf_internal_t *) gf->scratch; + btd = (struct gf_w32_bytwo_data *) (h->private); + ip = h->prim_poly & 0xffffffff; + m1 = 0xfffffffe; + m2 = 0x80000000; + btd->prim_poly = 0; + btd->mask1 = 0; + btd->mask2 = 0; + + while (ip != 0) { + btd->prim_poly |= ip; + btd->mask1 |= m1; + btd->mask2 |= m2; + ip <<= GF_FIELD_WIDTH; + m1 <<= GF_FIELD_WIDTH; + m2 <<= GF_FIELD_WIDTH; + } + + if (h->mult_type == GF_MULT_BYTWO_p) { + gf->multiply.w32 = gf_w32_bytwo_p_multiply; + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; + #else + gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif + } else { + gf->multiply.w32 = gf_w32_bytwo_b_multiply; + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; + #else + gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif + } + + gf->inverse.w32 = gf_w32_euclid; + return 1; +} + +static +inline +uint32_t +gf_w32_split_8_8_multiply (gf_t *gf, uint32_t a32, uint32_t b32) +{ + uint32_t product, i, j, mask, tb; + gf_internal_t *h; + struct gf_w32_split_8_8_data *d8; + + h = (gf_internal_t *) gf->scratch; + d8 = (struct gf_w32_split_8_8_data *) h->private; + product = 0; + mask = 0xff; + + for (i = 0; i < 4; i++) { + tb = b32; + for (j = 0; j < 4; j++) { + product ^= d8->tables[i+j][a32&mask][tb&mask]; + tb >>= 8; + } + a32 >>= 8; + } + return product; +} + +static +inline +void +gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + uint32_t *s32, *d32, *top, p, a, v; + struct gf_split_8_32_lazy_data *d8; + struct gf_w32_split_8_8_data *d88; + uint32_t *t[4]; + int i, j, k, change; + uint32_t pp; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + if (h->arg1 == 32 || h->arg2 == 32 || h->mult_type == GF_MULT_DEFAULT) { + d8 = (struct gf_split_8_32_lazy_data *) h->private; + for (i = 0; i < 4; i++) t[i] = d8->tables[i]; + change = (val != d8->last_value); + if (change) d8->last_value = val; + } else { + d88 = (struct gf_w32_split_8_8_data *) h->private; + for (i = 0; i < 4; i++) t[i] = d88->region_tables[i]; + change = (val != d88->last_value); + if (change) d88->last_value = val; + } + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + if (change) { + v = val; + for (i = 0; i < 4; i++) { + t[i][0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) { + t[i][k^j] = (v ^ t[i][k]); + } + v = (v & GF_FIRST_BIT) ? 
((v << 1) ^ pp) : (v << 1); + } + } + } + + while (d32 < top) { + p = (xor) ? *d32 : 0; + a = *s32; + i = 0; + while (a != 0) { + v = (a & 0xff); + p ^= t[i][v]; + a >>= 8; + i++; + } + *d32 = p; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + +static +inline +void +gf_w32_split_16_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + uint32_t *s32, *d32, *top, p, a, v; + struct gf_split_16_32_lazy_data *d16; + uint32_t *t[2]; + int i, j, k, change; + uint32_t pp; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + d16 = (struct gf_split_16_32_lazy_data *) h->private; + for (i = 0; i < 2; i++) t[i] = d16->tables[i]; + change = (val != d16->last_value); + if (change) d16->last_value = val; + + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + if (change) { + v = val; + for (i = 0; i < 2; i++) { + t[i][0] = 0; + for (j = 1; j < (1 << 16); j <<= 1) { + for (k = 0; k < j; k++) { + t[i][k^j] = (v ^ t[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + } + } + + while (d32 < top) { + p = (xor) ? *d32 : 0; + a = *s32; + i = 0; + while (a != 0) { + v = (a & 0xffff); + p ^= t[i][v]; + a >>= 16; + i++; + } + *d32 = p; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + struct gf_split_2_32_lazy_data *ld; + int i; + uint32_t pp, v, v2, s, *s32, *d32, *top; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + ld = (struct gf_split_2_32_lazy_data *) h->private; + + if (ld->last_value != val) { + v = val; + for (i = 0; i < 16; i++) { + v2 = (v << 1); + if (v & GF_FIRST_BIT) v2 ^= pp; + ld->tables[i][0] = 0; + ld->tables[i][1] = v; + ld->tables[i][2] = v2; + ld->tables[i][3] = (v2 ^ v); + v = (v2 << 1); + if (v2 & GF_FIRST_BIT) v ^= pp; + } + } + ld->last_value = val; + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + while (d32 != top) { + v = (xor) ? 
*d32 : 0; + s = *s32; + i = 0; + while (s != 0) { + v ^= ld->tables[i][s&3]; + s >>= 2; + i++; + } + *d32 = v; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + +#ifdef INTEL_SSSE3 +static +void +gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + int i, tindex; + uint32_t pp, v, v2, *s32, *d32, *top; + __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + v = val; + for (i = 0; i < 16; i++) { + v2 = (v << 1); + if (v & GF_FIRST_BIT) v2 ^= pp; + tables[i] = _mm_set_epi32(v2 ^ v, v2, v, 0); + v = (v2 << 1); + if (v2 & GF_FIRST_BIT) v ^= pp; + } + + shuffler = _mm_set_epi8(0xc, 0xc, 0xc, 0xc, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); + adder = _mm_set_epi8(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + mask1 = _mm_set1_epi8(0x3); + mask2 = _mm_set1_epi8(0xc); + + while (d32 != top) { + pi = (xor) ? _mm_load_si128 ((__m128i *) d32) : _mm_setzero_si128(); + vi = _mm_load_si128((__m128i *) s32); + + tindex = 0; + for (i = 0; i < 4; i++) { + si = _mm_shuffle_epi8(vi, shuffler); + + xi = _mm_and_si128(si, mask1); + xi = _mm_slli_epi16(xi, 2); + xi = _mm_xor_si128(xi, adder); + pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); + tindex++; + + xi = _mm_and_si128(si, mask2); + xi = _mm_xor_si128(xi, adder); + pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); + si = _mm_srli_epi16(si, 2); + tindex++; + + xi = _mm_and_si128(si, mask2); + xi = _mm_xor_si128(xi, adder); + pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); + si = _mm_srli_epi16(si, 2); + tindex++; + + xi = _mm_and_si128(si, mask2); + xi = _mm_xor_si128(xi, adder); + pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); + si = _mm_srli_epi16(si, 2); + tindex++; + + vi = _mm_srli_epi32(vi, 8); + } + _mm_store_si128((__m128i *) d32, pi); + d32 += 4; + s32 += 4; + } + + gf_do_final_region_alignment(&rd); + +} +#endif + +static +void +gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + struct gf_split_4_32_lazy_data *ld; + int i, j, k; + uint32_t pp, v, s, *s32, *d32, *top; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + ld = (struct gf_split_4_32_lazy_data *) h->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + if (ld->last_value != val) { + v = val; + for (i = 0; i < 8; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + } + } + ld->last_value = val; + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + while (d32 != top) { + v = (xor) ? 
*d32 : 0; + s = *s32; + i = 0; + while (s != 0) { + v ^= ld->tables[i][s&0xf]; + s >>= 4; + i++; + } + *d32 = v; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ +#ifdef INTEL_SSSE3 + gf_internal_t *h; + int i, j, k; + uint32_t pp, v, *s32, *d32, *top; + __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3; + struct gf_split_4_32_lazy_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + ld = (struct gf_split_4_32_lazy_data *) h->private; + + v = val; + for (i = 0; i < 8; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 4; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); + } + } + + mask1 = _mm_set1_epi8(0xf); + + if (xor) { + while (d32 != top) { + p0 = _mm_load_si128 ((__m128i *) d32); + p1 = _mm_load_si128 ((__m128i *) (d32+4)); + p2 = _mm_load_si128 ((__m128i *) (d32+8)); + p3 = _mm_load_si128 ((__m128i *) (d32+12)); + + v0 = _mm_load_si128((__m128i *) s32); s32 += 4; + v1 = _mm_load_si128((__m128i *) s32); s32 += 4; + v2 = _mm_load_si128((__m128i *) s32); s32 += 4; + v3 = _mm_load_si128((__m128i *) s32); s32 += 4; + + si = _mm_and_si128(v0, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); + + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); + + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); + + v1 = _mm_srli_epi32(v1, 4); + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); + + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); + + v2 = _mm_srli_epi32(v2, 4); + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); + p2 = 
_mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); + + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si)); + + v3 = _mm_srli_epi32(v3, 4); + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); + + _mm_store_si128((__m128i *) d32, p0); + _mm_store_si128((__m128i *) (d32+4), p1); + _mm_store_si128((__m128i *) (d32+8), p2); + _mm_store_si128((__m128i *) (d32+12), p3); + d32 += 16; + } + } else { + while (d32 != top) { + + v0 = _mm_load_si128((__m128i *) s32); s32 += 4; + v1 = _mm_load_si128((__m128i *) s32); s32 += 4; + v2 = _mm_load_si128((__m128i *) s32); s32 += 4; + v3 = _mm_load_si128((__m128i *) s32); s32 += 4; + + si = _mm_and_si128(v0, mask1); + p0 = _mm_shuffle_epi8(tables[0][0], si); + p1 = _mm_shuffle_epi8(tables[0][1], si); + p2 = _mm_shuffle_epi8(tables[0][2], si); + p3 = _mm_shuffle_epi8(tables[0][3], si); + + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); + + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); + + v1 = _mm_srli_epi32(v1, 4); + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); + + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); + + v2 = _mm_srli_epi32(v2, 4); + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); + + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si)); + + v3 = _mm_srli_epi32(v3, 4); + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); + + _mm_store_si128((__m128i *) d32, p0); + _mm_store_si128((__m128i *) (d32+4), p1); + 
_mm_store_si128((__m128i *) (d32+8), p2); + _mm_store_si128((__m128i *) (d32+12), p3); + d32 += 16; + } + } + + gf_do_final_region_alignment(&rd); + +#endif +} + + +static +void +gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ +#ifdef INTEL_SSSE3 + gf_internal_t *h; + int i, j, k; + uint32_t pp, v, *s32, *d32, *top, tmp_table[16]; + __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8; + __m128i tv1, tv2, tv3, tv0; + uint8_t btable[16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + v = val; + for (i = 0; i < 8; i++) { + tmp_table[0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + tmp_table[k^j] = (v ^ tmp_table[k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 4; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) tmp_table[k]; + tmp_table[k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); + } + } + + mask1 = _mm_set1_epi8(0xf); + mask8 = _mm_set1_epi16(0xff); + + if (xor) { + while (d32 != top) { + v0 = _mm_load_si128((__m128i *) s32); s32 += 4; + v1 = _mm_load_si128((__m128i *) s32); s32 += 4; + v2 = _mm_load_si128((__m128i *) s32); s32 += 4; + v3 = _mm_load_si128((__m128i *) s32); s32 += 4; + + p0 = _mm_srli_epi16(v0, 8); + p1 = _mm_srli_epi16(v1, 8); + p2 = _mm_srli_epi16(v2, 8); + p3 = _mm_srli_epi16(v3, 8); + + tv0 = _mm_and_si128(v0, mask8); + tv1 = _mm_and_si128(v1, mask8); + tv2 = _mm_and_si128(v2, mask8); + tv3 = _mm_and_si128(v3, mask8); + + v0 = _mm_packus_epi16(p1, p0); + v1 = _mm_packus_epi16(tv1, tv0); + v2 = _mm_packus_epi16(p3, p2); + v3 = _mm_packus_epi16(tv3, tv2); + + p0 = _mm_srli_epi16(v0, 8); + p1 = _mm_srli_epi16(v1, 8); + p2 = _mm_srli_epi16(v2, 8); + p3 = _mm_srli_epi16(v3, 8); + + tv0 = _mm_and_si128(v0, mask8); + tv1 = _mm_and_si128(v1, mask8); + tv2 = _mm_and_si128(v2, mask8); + tv3 = _mm_and_si128(v3, mask8); + + v0 = _mm_packus_epi16(p2, p0); + v1 = _mm_packus_epi16(p3, p1); + v2 = _mm_packus_epi16(tv2, tv0); + v3 = _mm_packus_epi16(tv3, tv1); + + si = _mm_and_si128(v0, mask1); + p0 = _mm_shuffle_epi8(tables[6][0], si); + p1 = _mm_shuffle_epi8(tables[6][1], si); + p2 = _mm_shuffle_epi8(tables[6][2], si); + p3 = _mm_shuffle_epi8(tables[6][3], si); + + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); + + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); + + v1 = _mm_srli_epi32(v1, 4); + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], 
si)); + + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); + + v2 = _mm_srli_epi32(v2, 4); + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); + + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); + + v3 = _mm_srli_epi32(v3, 4); + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); + + tv0 = _mm_unpackhi_epi8(p1, p3); + tv1 = _mm_unpackhi_epi8(p0, p2); + tv2 = _mm_unpacklo_epi8(p1, p3); + tv3 = _mm_unpacklo_epi8(p0, p2); + + p0 = _mm_unpackhi_epi8(tv1, tv0); + p1 = _mm_unpacklo_epi8(tv1, tv0); + p2 = _mm_unpackhi_epi8(tv3, tv2); + p3 = _mm_unpacklo_epi8(tv3, tv2); + + v0 = _mm_load_si128 ((__m128i *) d32); + v1 = _mm_load_si128 ((__m128i *) (d32+4)); + v2 = _mm_load_si128 ((__m128i *) (d32+8)); + v3 = _mm_load_si128 ((__m128i *) (d32+12)); + + p0 = _mm_xor_si128(p0, v0); + p1 = _mm_xor_si128(p1, v1); + p2 = _mm_xor_si128(p2, v2); + p3 = _mm_xor_si128(p3, v3); + + _mm_store_si128((__m128i *) d32, p0); + _mm_store_si128((__m128i *) (d32+4), p1); + _mm_store_si128((__m128i *) (d32+8), p2); + _mm_store_si128((__m128i *) (d32+12), p3); + d32 += 16; + } + } else { + while (d32 != top) { + v0 = _mm_load_si128((__m128i *) s32); s32 += 4; + v1 = _mm_load_si128((__m128i *) s32); s32 += 4; + v2 = _mm_load_si128((__m128i *) s32); s32 += 4; + v3 = _mm_load_si128((__m128i *) s32); s32 += 4; + + p0 = _mm_srli_epi16(v0, 8); + p1 = _mm_srli_epi16(v1, 8); + p2 = _mm_srli_epi16(v2, 8); + p3 = _mm_srli_epi16(v3, 8); + + tv0 = _mm_and_si128(v0, mask8); + tv1 = _mm_and_si128(v1, mask8); + tv2 = _mm_and_si128(v2, mask8); + tv3 = _mm_and_si128(v3, mask8); + + v0 = _mm_packus_epi16(p1, p0); + v1 = _mm_packus_epi16(tv1, tv0); + v2 = _mm_packus_epi16(p3, p2); + v3 = _mm_packus_epi16(tv3, tv2); + + p0 = _mm_srli_epi16(v0, 8); + p1 = _mm_srli_epi16(v1, 8); + p2 = _mm_srli_epi16(v2, 8); + p3 = _mm_srli_epi16(v3, 8); + + tv0 = _mm_and_si128(v0, mask8); + tv1 = _mm_and_si128(v1, mask8); + tv2 = _mm_and_si128(v2, mask8); + tv3 = _mm_and_si128(v3, mask8); + + v0 = _mm_packus_epi16(p2, p0); + v1 = _mm_packus_epi16(p3, p1); + v2 = _mm_packus_epi16(tv2, tv0); + v3 = _mm_packus_epi16(tv3, tv1); + + p0 = v0; + p1 = v1; + p2 = v2; + p3 = v3; + + si = _mm_and_si128(v0, mask1); + p0 = _mm_shuffle_epi8(tables[6][0], si); + p1 = _mm_shuffle_epi8(tables[6][1], si); + p2 = _mm_shuffle_epi8(tables[6][2], si); + p3 = _mm_shuffle_epi8(tables[6][3], si); + + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); + p3 = _mm_xor_si128(p3, 
_mm_shuffle_epi8(tables[7][3], si)); + + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); + + v1 = _mm_srli_epi32(v1, 4); + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); + + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); + + v2 = _mm_srli_epi32(v2, 4); + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); + + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); + + v3 = _mm_srli_epi32(v3, 4); + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); + + tv0 = _mm_unpackhi_epi8(p1, p3); + tv1 = _mm_unpackhi_epi8(p0, p2); + tv2 = _mm_unpacklo_epi8(p1, p3); + tv3 = _mm_unpacklo_epi8(p0, p2); + + p0 = _mm_unpackhi_epi8(tv1, tv0); + p1 = _mm_unpacklo_epi8(tv1, tv0); + p2 = _mm_unpackhi_epi8(tv3, tv2); + p3 = _mm_unpacklo_epi8(tv3, tv2); + + _mm_store_si128((__m128i *) d32, p0); + _mm_store_si128((__m128i *) (d32+4), p1); + _mm_store_si128((__m128i *) (d32+8), p2); + _mm_store_si128((__m128i *) (d32+12), p3); + d32 += 16; + } + } + gf_do_final_region_alignment(&rd); + +#endif +} + +static +int gf_w32_split_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_split_2_32_lazy_data *ld2; + struct gf_split_4_32_lazy_data *ld4; + struct gf_w32_split_8_8_data *d8; + struct gf_split_8_32_lazy_data *d32; + struct gf_split_16_32_lazy_data *d16; + uint32_t p, basep; + int i, j, exp, ispclmul, issse3; + + ispclmul = 0; +#if defined(INTEL_SSE4_PCLMUL) + ispclmul = 1; +#endif + + issse3 = 0; +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif + + h = (gf_internal_t *) gf->scratch; + + /* Defaults */ + + gf->inverse.w32 = gf_w32_euclid; + + /* JSP: First handle single multiplication: + If args == 8, then we're doing split 8 8. + Otherwise, if PCLMUL, we use that. + Otherwise, we use bytwo_p. 
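+     When PCLMUL is available, the masks below choose among
+     gf_w32_clm_multiply_2/3/4 by how many high-order bits of prim_poly
+     are clear; a polynomial with fewer significant bits appears to need
+     fewer carryless reduction steps after the initial multiply.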
+ */ + + if (h->arg1 == 8 && h->arg2 == 8) { + gf->multiply.w32 = gf_w32_split_8_8_multiply; + } else if (ispclmul) { + if ((0xfffe0000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_2; + } else if ((0xffc00000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_3; + } else if ((0xfe000000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_4; + } + } else { + gf->multiply.w32 = gf_w32_bytwo_p_multiply; + } + + /* Easy cases: 16/32 and 2/32 */ + + if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) { + d16 = (struct gf_split_16_32_lazy_data *) h->private; + d16->last_value = 0; + gf->multiply_region.w32 = gf_w32_split_16_32_lazy_multiply_region; + return 1; + } + + if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) { + ld2 = (struct gf_split_2_32_lazy_data *) h->private; + ld2->last_value = 0; + #ifdef INTEL_SSSE3 + if (!(h->region_type & GF_REGION_NOSSE)) + gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; + else + gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; + #else + gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; + if(h->region_type & GF_REGION_SSE) return 0; + #endif + return 1; + } + + /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */ + + if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) || + (issse3 && h->mult_type == GF_REGION_DEFAULT)) { + ld4 = (struct gf_split_4_32_lazy_data *) h->private; + ld4->last_value = 0; + if ((h->region_type & GF_REGION_NOSSE) || !issse3) { + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; + } else if (h->region_type & GF_REGION_ALTMAP) { + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; + } else { + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; + } + return 1; + } + + /* 8/32 or Default + no SSE */ + + if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8) || + h->mult_type == GF_MULT_DEFAULT) { + d32 = (struct gf_split_8_32_lazy_data *) h->private; + d32->last_value = 0; + gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region; + return 1; + } + + /* Finally, if args == 8, then we have to set up the tables here. */ + + if (h->arg1 == 8 && h->arg2 == 8) { + d8 = (struct gf_w32_split_8_8_data *) h->private; + d8->last_value = 0; + gf->multiply.w32 = gf_w32_split_8_8_multiply; + gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region; + basep = 1; + for (exp = 0; exp < 7; exp++) { + for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; + for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0; + d8->tables[exp][1][1] = basep; + for (i = 2; i < 256; i++) { + if (i&1) { + p = d8->tables[exp][i^1][1]; + d8->tables[exp][i][1] = p ^ basep; + } else { + p = d8->tables[exp][i>>1][1]; + d8->tables[exp][i][1] = GF_MULTBY_TWO(p); + } + } + for (i = 1; i < 256; i++) { + p = d8->tables[exp][i][1]; + for (j = 1; j < 256; j++) { + if (j&1) { + d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p; + } else { + d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]); + } + } + } + for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); + } + return 1; + } + + /* If we get here, then the arguments were bad. 
*/ + + return 0; +} + +static +int gf_w32_group_init(gf_t *gf) +{ + uint32_t i, j, p, index; + struct gf_w32_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + int g_r, g_s; + + g_s = h->arg1; + g_r = h->arg2; + + gd = (struct gf_w32_group_data *) h->private; + gd->shift = (uint32_t *) (&(gd->memory)); + gd->reduce = gd->shift + (1 << g_s); + + gd->rmask = (1 << g_r) - 1; + gd->rmask <<= 32; + + gd->tshift = 32 % g_s; + if (gd->tshift == 0) gd->tshift = g_s; + gd->tshift = (32 - gd->tshift); + gd->tshift = ((gd->tshift-1)/g_r) * g_r; + + gd->reduce[0] = 0; + for (i = 0; i < (1 << g_r); i++) { + p = 0; + index = 0; + for (j = 0; j < g_r; j++) { + if (i & (1 << j)) { + p ^= (h->prim_poly << j); + index ^= (1 << j); + index ^= (h->prim_poly >> (32-j)); + } + } + gd->reduce[index] = p; + } + + if (g_s == g_r) { + gf->multiply.w32 = gf_w32_group_s_equals_r_multiply; + gf->multiply_region.w32 = gf_w32_group_s_equals_r_multiply_region; + } else { + gf->multiply.w32 = gf_w32_group_multiply; + gf->multiply_region.w32 = gf_w32_group_multiply_region; + } + gf->divide.w32 = NULL; + gf->inverse.w32 = gf_w32_euclid; + + return 1; +} + + +static +uint32_t +gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t b0 = b & 0x0000ffff; + uint32_t b1 = (b & 0xffff0000) >> 16; + uint32_t a0 = a & 0x0000ffff; + uint32_t a1 = (a & 0xffff0000) >> 16; + uint32_t a1b1; + uint32_t rv; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + rv = ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16) | (base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1); + return rv; +} + +/* JSP: This could be made faster. Someday, when I'm bored. 
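+   The "inline" variant below performs each 16-bit sub-multiply directly
+   through the base field's log/antilog tables (GF_W16_INLINE_MULT)
+   instead of calling base_gf->multiply.w32 as the recursive version does.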
*/ + +static +uint32_t +gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + uint32_t b0 = b & 0x0000ffff; + uint32_t b1 = b >> 16; + uint32_t a0 = a & 0x0000ffff; + uint32_t a1 = a >> 16; + uint32_t a1b1, prod; + uint16_t *log, *alog; + struct gf_w32_composite_data *cd; + + cd = (struct gf_w32_composite_data *) h->private; + log = cd->log; + alog = cd->alog; + + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + return prod; +} + +/* + * Composite field division trick (explained in 2007 tech report) + * + * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 + * + * let c = b^-1 + * + * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) + * + * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 + * + * let d = b1c1 and d+1 = b0c0 + * + * solve s*b1c1+b1c0+b0c1 = 0 + * + * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 + * + * c0 = (d+1)b0^-1 + * c1 = d*b1^-1 + * + * a / b = a * c + */ + +static +uint32_t +gf_w32_composite_inverse(gf_t *gf, uint32_t a) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint16_t a0 = a & 0x0000ffff; + uint16_t a1 = (a & 0xffff0000) >> 16; + uint16_t c0, c1, d, tmp; + uint32_t c; + uint16_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w32(base_gf, a1); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w32(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w32(base_gf, a1); + a0inv = base_gf->inverse.w32(base_gf, a0); + + d = base_gf->multiply.w32(base_gf, a1, a0inv); + + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); + tmp = base_gf->inverse.w32(base_gf, tmp); + + d = base_gf->multiply.w32(base_gf, d, tmp); + + c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv); + c1 = base_gf->multiply.w32(base_gf, d, a1inv); + } + + c = c0 | (c1 << 16); + + return c; +} + +static +void +gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t b0 = val & 0x0000ffff; + uint32_t b1 = (val & 0xffff0000) >> 16; + uint32_t *s32, *d32, *top; + uint16_t a0, a1, a1b1, *log, *alog; + uint32_t prod; + gf_region_data rd; + struct gf_w32_composite_data *cd; + + cd = (struct gf_w32_composite_data *) h->private; + log = cd->log; + alog = cd->alog; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + + s32 = rd.s_start; + d32 = rd.d_start; + top = rd.d_top; + + if (log == NULL) { + if (xor) { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); + s32++; + d32++; + } + } else { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ 
base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); + s32++; + d32++; + } + } + } else { + if (xor) { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); + + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + *d32 ^= prod; + s32++; + d32++; + } + } else { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); + + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + + *d32 = prod; + s32++; + d32++; + } + } + } +} + +static +void +gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint16_t val0 = val & 0x0000ffff; + uint16_t val1 = (val & 0xffff0000) >> 16; + gf_region_data rd; + int sub_reg_size; + uint8_t *slow, *shigh; + uint8_t *dlow, *dhigh, *top; + + /* JSP: I want the two pointers aligned wrt each other on 16 byte + boundaries. So I'm going to make sure that the area on + which the two operate is a multiple of 32. Of course, that + junks up the mapping, but so be it -- that's why we have extract_word.... */ + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + slow = (uint8_t *) rd.s_start; + dlow = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_reg_size = (top - dlow)/2; + shigh = slow + sub_reg_size; + dhigh = dlow + sub_reg_size; + + base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + + gf_do_final_region_alignment(&rd); +} + +static +int gf_w32_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + struct gf_w32_composite_data *cd; + + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w32_composite_data *) h->private; + cd->log = gf_w16_get_log_table(h->base_gf); + cd->alog = gf_w16_get_mult_alog_table(h->base_gf); + + if (h->region_type & GF_REGION_ALTMAP) { + gf->multiply_region.w32 = gf_w32_composite_multiply_region_alt; + } else { + gf->multiply_region.w32 = gf_w32_composite_multiply_region; + } + + if (cd->log == NULL) { + gf->multiply.w32 = gf_w32_composite_multiply_recursive; + } else { + gf->multiply.w32 = gf_w32_composite_multiply_inline; + } + gf->divide.w32 = NULL; + gf->inverse.w32 = gf_w32_composite_inverse; + + return 1; +} + + + +int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + int issse3 = 0; + +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif + + switch(mult_type) + { + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data) + 64; + break; + case GF_MULT_GROUP: + return sizeof(gf_internal_t) + 
sizeof(struct gf_w32_group_data) + + sizeof(uint32_t) * (1 << arg1) + + sizeof(uint32_t) * (1 << arg2) + 64; + break; + case GF_MULT_DEFAULT: + + case GF_MULT_SPLIT_TABLE: + if (arg1 == 8 && arg2 == 8){ + return sizeof(gf_internal_t) + sizeof(struct gf_w32_split_8_8_data) + 64; + } + if ((arg1 == 16 && arg2 == 32) || (arg2 == 16 && arg1 == 32)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_16_32_lazy_data) + 64; + } + if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; + } + if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || + (mult_type == GF_MULT_DEFAULT && !issse3)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; + } + if ((arg1 == 4 && arg2 == 32) || + (arg2 == 4 && arg1 == 32) || + mult_type == GF_MULT_DEFAULT) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; + } + return 0; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_SHIFT: + return sizeof(gf_internal_t); + break; + case GF_MULT_COMPOSITE: + return sizeof(gf_internal_t) + sizeof(struct gf_w32_composite_data) + 64; + break; + + default: + return 0; + } + return 0; +} + +int gf_w32_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + + /* Allen: use the following primitive polynomial to make carryless multiply work more efficiently for GF(2^32).*/ + + /* h->prim_poly = 0xc5; */ + + /* Allen: The following is the traditional primitive polynomial for GF(2^32) */ + + h->prim_poly = 0x400007; + } + } + + /* No leading one */ + + if(h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff; + + gf->multiply.w32 = NULL; + gf->divide.w32 = NULL; + gf->inverse.w32 = NULL; + gf->multiply_region.w32 = NULL; + + switch(h->mult_type) { + case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_w32_group_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if (gf_w32_bytwo_init(gf) == 0) return 0; break; + default: return 0; + } + if (h->divide_type == GF_DIVIDE_EUCLID) { + gf->divide.w32 = gf_w32_divide_from_inverse; + gf->inverse.w32 = gf_w32_euclid; + } else if (h->divide_type == GF_DIVIDE_MATRIX) { + gf->divide.w32 = gf_w32_divide_from_inverse; + gf->inverse.w32 = gf_w32_matrix; + } + + if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { + gf->divide.w32 = gf_w32_divide_from_inverse; + } + if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { + gf->inverse.w32 = gf_w32_inverse_from_divide; + } + if (h->region_type == GF_REGION_CAUCHY) { + gf->extract_word.w32 = gf_wgen_extract_word; + gf->multiply_region.w32 = gf_wgen_cauchy_region; + } else if (h->region_type & GF_REGION_ALTMAP) { + if (h->mult_type == GF_MULT_COMPOSITE) { + gf->extract_word.w32 = gf_w32_composite_extract_word; + } else { + gf->extract_word.w32 = gf_w32_split_extract_word; + } + } else { + gf->extract_word.w32 = 
gf_w32_extract_word; + } + return 1; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w4.c b/src/erasure-code/jerasure/gf-complete/src/gf_w4.c new file mode 100644 index 000000000000..65cbf23a25d3 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w4.c @@ -0,0 +1,2081 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w4.c + * + * Routines for 4-bit Galois fields + */ + +#include "gf_int.h" +#include +#include + +#define GF_FIELD_WIDTH 4 +#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2) +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) +#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1) + +/* ------------------------------------------------------------ + JSP: Each implementation has its own data, which is allocated + at one time as part of the handle. For that reason, it + shouldn't be hierarchical -- i.e. one should be able to + allocate it with one call to malloc. */ + +struct gf_logtable_data { + uint8_t log_tbl[GF_FIELD_SIZE]; + uint8_t antilog_tbl[GF_FIELD_SIZE * 2]; + uint8_t *antilog_tbl_div; +}; + +struct gf_single_table_data { + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; +}; + +struct gf_double_table_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE]; +}; +struct gf_quad_table_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint16_t mult[GF_FIELD_SIZE][(1<<16)]; +}; + +struct gf_quad_table_lazy_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint16_t mult[(1 << 16)]; +}; + +struct gf_bytwo_data { + uint64_t prim_poly; + uint64_t mask1; + uint64_t mask2; +}; + +#define AB2(ip, am1 ,am2, b, t1, t2) {\ + t1 = (b << 1) & am1;\ + t2 = b & am2; \ + t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ + b = (t1 ^ (t2 & ip));} + +#define SSE_AB2(pp, m1, va, t1, t2) {\ + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ + t2 = _mm_and_si128(va, _mm_set1_epi8(0x88)); \ + t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } + +/* ------------------------------------------------------------ + JSP: These are basic and work from multiple implementations. 
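+   They cover inverse-from-divide, divide-from-inverse, Euclidean and
+   bit-matrix inversion, word extraction, and a plain shift/reduce multiply
+   that several of the table-building initializers below use to fill
+   their tables.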
+ */ + +static +inline +gf_val_32_t gf_w4_inverse_from_divide (gf_t *gf, gf_val_32_t a) +{ + return gf->divide.w32(gf, 1, a); +} + +static +inline +gf_val_32_t gf_w4_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + b = gf->inverse.w32(gf, b); + return gf->multiply.w32(gf, a, b); +} + +static +inline +gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b) +{ + gf_val_32_t e_i, e_im1, e_ip1; + gf_val_32_t d_i, d_im1, d_ip1; + gf_val_32_t y_i, y_im1, y_ip1; + gf_val_32_t c_i; + + if (b == 0) return -1; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i = b; + d_im1 = 4; + for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ; + y_i = 1; + y_im1 = 0; + + while (e_i != 1) { + e_ip1 = e_im1; + d_ip1 = d_im1; + c_i = 0; + + while (d_ip1 >= d_i) { + c_i ^= (1 << (d_ip1 - d_i)); + e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; + while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; + } + + y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); + y_im1 = y_i; + y_i = y_ip1; + + e_im1 = e_i; + d_im1 = d_i; + e_i = e_ip1; + d_i = d_ip1; + } + + return y_i; +} + +static +gf_val_32_t gf_w4_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint8_t *r8, v; + + r8 = (uint8_t *) start; + v = r8[index/2]; + if (index%2) { + return v >> 4; + } else { + return v&0xf; + } +} + + +static +inline +gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b) +{ + return gf_bitmatrix_inverse(b, 4, ((gf_internal_t *) (gf->scratch))->prim_poly); +} + + +static +inline +gf_val_32_t +gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint8_t product, i, pp; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + product = 0; + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (1 << i)) product ^= (b << i); + } + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { + if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); + } + return product; +} + +/* Ben: This function works, but it is 33% slower than the normal shift mult */ + +static +inline +gf_val_32_t +gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4) +{ + gf_val_32_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a4, 0); + b = _mm_insert_epi32 (a, b4, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1fULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben/JSP: Do prim_poly reduction once. We are guaranteed that we will only + have to do the reduction only once, because (w-2)/z == 1. Where + z is equal to the number of zeros after the leading 1. + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_epi64 shifts the result to the right by 4 bits. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result. */ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_epi64 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
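+     As a worked check of the reduction above, with the polynomial
+     x^4+x+1 (0x13): 0xC * 0x6 is x^5 + x^3 carry-free, x^5 reduces to
+     x^2 + x, and the routine returns x^3 + x^2 + x = 0xE.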
*/ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} + +static +void +gf_w4_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 ^= (gf->multiply.w32(gf, val, (*s8 & 0xf)) | + ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 = (gf->multiply.w32(gf, val, (*s8 & 0xf)) | + ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} + +/* ------------------------------------------------------------ + IMPLEMENTATION: LOG_TABLE: + + JSP: This is a basic log-antilog implementation. + I'm not going to spend any time optimizing it because the + other techniques are faster for both single and region + operations. + */ + +static +inline +gf_val_32_t +gf_w4_log_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_logtable_data *ltd; + + ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private; + return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])]; +} + +static +inline +gf_val_32_t +gf_w4_log_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int log_sum = 0; + struct gf_logtable_data *ltd; + + if (a == 0 || b == 0) return 0; + ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private; + + log_sum = ltd->log_tbl[a] - ltd->log_tbl[b]; + return (ltd->antilog_tbl_div[log_sum]); +} + +static +void +gf_w4_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t lv, b, c; + uint8_t *s8, *d8; + + struct gf_logtable_data *ltd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + lv = ltd->log_tbl[val]; + + for (i = 0; i < bytes; i++) { + c = (xor) ? d8[i] : 0; + b = (s8[i] >> GF_FIELD_WIDTH); + c ^= (b == 0) ? 0 : (ltd->antilog_tbl[lv + ltd->log_tbl[b]] << GF_FIELD_WIDTH); + b = (s8[i] & 0xf); + c ^= (b == 0) ? 
0 : ltd->antilog_tbl[lv + ltd->log_tbl[b]]; + d8[i] = c; + } +} + +static +int gf_w4_log_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_logtable_data *ltd; + int i, b; + + h = (gf_internal_t *) gf->scratch; + ltd = h->private; + + for (i = 0; i < GF_FIELD_SIZE; i++) + ltd->log_tbl[i]=0; + + ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1); + b = 1; + i = 0; + do { + if (ltd->log_tbl[b] != 0 && i != 0) { + fprintf(stderr, "Cannot construct log table: Polynomial is not primitive.\n\n"); + return 0; + } + ltd->log_tbl[b] = i; + ltd->antilog_tbl[i] = b; + ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b; + b <<= 1; + i++; + if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly; + } while (b != 1); + + if (i != GF_FIELD_SIZE - 1) { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + + gf->inverse.w32 = gf_w4_inverse_from_divide; + gf->divide.w32 = gf_w4_log_divide; + gf->multiply.w32 = gf_w4_log_multiply; + gf->multiply_region.w32 = gf_w4_log_multiply_region; + return 1; +} + +/* ------------------------------------------------------------ + IMPLEMENTATION: SINGLE TABLE: JSP. + */ + +static +inline +gf_val_32_t +gf_w4_single_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_single_table_data *std; + + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->mult[a][b]; +} + +static +inline +gf_val_32_t +gf_w4_single_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_single_table_data *std; + + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->div[a][b]; +} + +static +void +gf_w4_single_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t b, c; + uint8_t *s8, *d8; + + struct gf_single_table_data *std; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + for (i = 0; i < bytes; i++) { + c = (xor) ? 
d8[i] : 0; + b = (s8[i] >> GF_FIELD_WIDTH); + c ^= (std->mult[val][b] << GF_FIELD_WIDTH); + b = (s8[i] & 0xf); + c ^= (std->mult[val][b]); + d8[i] = c; + } +} + +#define MM_PRINT(s, r) { uint8_t blah[16]; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (i = 0; i < 16; i++) printf(" %02x", blah[i]); printf("\n"); } + +#ifdef INTEL_SSSE3 +static +void +gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint8_t *base, *sptr, *dptr, *top; + __m128i tl, loset, r, va, th; + + struct gf_single_table_data *std; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; + base = (uint8_t *) std->mult; + base += (val << GF_FIELD_WIDTH); + + gf_do_initial_region_alignment(&rd); + + tl = _mm_loadu_si128((__m128i *)base); + th = _mm_slli_epi64(tl, 4); + loset = _mm_set1_epi8 (0x0f); + + sptr = rd.s_start; + dptr = rd.d_start; + top = rd.s_top; + + while (sptr < (uint8_t *) top) { + va = _mm_load_si128 ((__m128i *)(sptr)); + r = _mm_and_si128 (loset, va); + r = _mm_shuffle_epi8 (tl, r); + va = _mm_srli_epi64 (va, 4); + va = _mm_and_si128 (loset, va); + va = _mm_shuffle_epi8 (th, va); + r = _mm_xor_si128 (r, va); + va = (xor) ? _mm_load_si128 ((__m128i *)(dptr)) : _mm_setzero_si128(); + r = _mm_xor_si128 (r, va); + _mm_store_si128 ((__m128i *)(dptr), r); + dptr += 16; + sptr += 16; + } + gf_do_final_region_alignment(&rd); + +} +#endif + +static +int gf_w4_single_table_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_single_table_data *std; + int a, b, prod; + + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_single_table_data *)h->private; + + bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_FIELD_SIZE; b++) { + prod = gf_w4_shift_multiply(gf, a, b); + std->mult[a][b] = prod; + std->div[prod][b] = a; + } + } + + gf->inverse.w32 = NULL; + gf->divide.w32 = gf_w4_single_table_divide; + gf->multiply.w32 = gf_w4_single_table_multiply; + #ifdef INTEL_SSSE3 + if(h->region_type & (GF_REGION_NOSSE | GF_REGION_CAUCHY)) + gf->multiply_region.w32 = gf_w4_single_table_multiply_region; + else + gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; + #else + gf->multiply_region.w32 = gf_w4_single_table_multiply_region; + if (h->region_type & GF_REGION_SSE) return 0; + #endif + + return 1; +} + +/* ------------------------------------------------------------ + IMPLEMENTATION: DOUBLE TABLE: JSP. 
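+   The mult table here is indexed by a whole byte, i.e. two 4-bit source
+   values at once: mult[a][(b << 4) | c] holds ((a*b) << 4) | (a*c), so the
+   region routine can translate each byte with a single lookup
+   (d8[i] = base[s8[i]]).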
+ */ + +static +inline +gf_val_32_t +gf_w4_double_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_double_table_data *std; + + std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->mult[a][b]; +} + +static +inline +gf_val_32_t +gf_w4_double_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_double_table_data *std; + + std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->div[a][b]; +} + +static +void +gf_w4_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8, *base; + gf_region_data rd; + struct gf_double_table_data *std; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + + std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + base = (uint8_t *) std->mult; + base += (val << GF_DOUBLE_WIDTH); + + if (xor) { + for (i = 0; i < bytes; i++) d8[i] ^= base[s8[i]]; + } else { + for (i = 0; i < bytes; i++) d8[i] = base[s8[i]]; + } +} + +static +int gf_w4_double_table_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_double_table_data *std; + int a, b, c, prod, ab; + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_double_table_data *)h->private; + + bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_FIELD_SIZE; b++) { + prod = gf_w4_shift_multiply(gf, a, b); + mult[a][b] = prod; + std->div[prod][b] = a; + } + } + bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE); + for (a = 0; a < GF_FIELD_SIZE; a++) { + for (b = 0; b < GF_FIELD_SIZE; b++) { + ab = mult[a][b]; + for (c = 0; c < GF_FIELD_SIZE; c++) { + std->mult[a][(b << 4) | c] = ((ab << 4) | mult[a][c]); + } + } + } + + gf->inverse.w32 = NULL; + gf->divide.w32 = gf_w4_double_table_divide; + gf->multiply.w32 = gf_w4_double_table_multiply; + gf->multiply_region.w32 = gf_w4_double_table_multiply_region; + return 1; +} + + +static +inline +gf_val_32_t +gf_w4_quad_table_lazy_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_quad_table_lazy_data *std; + + std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->div[a][b]; +} + +static +inline +gf_val_32_t +gf_w4_quad_table_lazy_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_quad_table_lazy_data *std; + + std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->smult[a][b]; +} + +static +inline +gf_val_32_t +gf_w4_quad_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_quad_table_data *std; + + std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->div[a][b]; +} + +static +inline +gf_val_32_t +gf_w4_quad_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_quad_table_data *std; + uint16_t v; + + std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private; + v = std->mult[a][b]; + return v; +} + +static +void +gf_w4_quad_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint16_t *base; + gf_region_data rd; + 
struct gf_quad_table_data *std; + struct gf_quad_table_lazy_data *ltd; + gf_internal_t *h; + int a, b, c, d, va, vb, vc, vd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) (gf->scratch); + if (h->region_type & GF_REGION_LAZY) { + ltd = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private; + base = ltd->mult; + for (a = 0; a < 16; a++) { + va = (ltd->smult[val][a] << 12); + for (b = 0; b < 16; b++) { + vb = (ltd->smult[val][b] << 8); + for (c = 0; c < 16; c++) { + vc = (ltd->smult[val][c] << 4); + for (d = 0; d < 16; d++) { + vd = ltd->smult[val][d]; + base[(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd); + } + } + } + } + } else { + std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private; + base = &(std->mult[val][0]); + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + gf_two_byte_region_table_multiply(&rd, base); + gf_do_final_region_alignment(&rd); +} + +static +int gf_w4_quad_table_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_quad_table_data *std; + int prod, val, a, b, c, d, va, vb, vc, vd; + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_quad_table_data *)h->private; + + bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_FIELD_SIZE; b++) { + prod = gf_w4_shift_multiply(gf, a, b); + mult[a][b] = prod; + std->div[prod][b] = a; + } + } + + for (val = 0; val < 16; val++) { + for (a = 0; a < 16; a++) { + va = (mult[val][a] << 12); + for (b = 0; b < 16; b++) { + vb = (mult[val][b] << 8); + for (c = 0; c < 16; c++) { + vc = (mult[val][c] << 4); + for (d = 0; d < 16; d++) { + vd = mult[val][d]; + std->mult[val][(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd); + } + } + } + } + } + + gf->inverse.w32 = NULL; + gf->divide.w32 = gf_w4_quad_table_divide; + gf->multiply.w32 = gf_w4_quad_table_multiply; + gf->multiply_region.w32 = gf_w4_quad_table_multiply_region; + return 1; +} +static +int gf_w4_quad_table_lazy_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_quad_table_lazy_data *std; + int a, b, prod, loga, logb; + uint8_t log_tbl[GF_FIELD_SIZE]; + uint8_t antilog_tbl[GF_FIELD_SIZE*2]; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_quad_table_lazy_data *)h->private; + + b = 1; + for (a = 0; a < GF_MULT_GROUP_SIZE; a++) { + log_tbl[b] = a; + antilog_tbl[a] = b; + antilog_tbl[a+GF_MULT_GROUP_SIZE] = b; + b <<= 1; + if (b & GF_FIELD_SIZE) { + b = b ^ h->prim_poly; + } + } + + bzero(std->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + loga = log_tbl[a]; + for (b = 1; b < GF_FIELD_SIZE; b++) { + logb = log_tbl[b]; + prod = antilog_tbl[loga+logb]; + std->smult[a][b] = prod; + std->div[prod][b] = a; + } + } + + gf->inverse.w32 = NULL; + gf->divide.w32 = gf_w4_quad_table_lazy_divide; + gf->multiply.w32 = gf_w4_quad_table_lazy_multiply; + gf->multiply_region.w32 = gf_w4_quad_table_multiply_region; + return 1; +} + +static +int gf_w4_table_init(gf_t *gf) +{ + int rt; + gf_internal_t *h; + int issse3 = 0; + +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif + + h = (gf_internal_t *) gf->scratch; + rt = (h->region_type); + + if (h->mult_type == 
GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE; + + if (rt & GF_REGION_DOUBLE_TABLE) { + return gf_w4_double_table_init(gf); + } else if (rt & GF_REGION_QUAD_TABLE) { + if (rt & GF_REGION_LAZY) { + return gf_w4_quad_table_lazy_init(gf); + } else { + return gf_w4_quad_table_init(gf); + } + return gf_w4_double_table_init(gf); + } else { + return gf_w4_single_table_init(gf); + } + return 0; +} + +/* ------------------------------------------------------------ + JSP: GF_MULT_BYTWO_p and _b: See the paper. +*/ + +static +inline +gf_val_32_t +gf_w4_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + + prod = 0; + pmask = 0x8; + amask = 0x8; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + +static +inline +gf_val_32_t +gf_w4_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, bmask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + bmask = 0x8; + + while (1) { + if (a & 1) prod ^= b; + a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else { + b <<= 1; + } + } +} + +static +void +gf_w4_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, prod, amask; + gf_region_data rd; + struct gf_bytwo_data *btd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x8; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x8; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 = prod; + d64++; + s64++; + } + } + gf_do_final_region_alignment(&rd); +} + +#define BYTWO_P_ONESTEP {\ + SSE_AB2(pp, m1, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi8(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + uint8_t vrev; + __m128i pp, m1, ta, prod, t1, t2, tp, one, v; + struct gf_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + vrev = 0; + for (i = 0; i < 4; i++) { + vrev <<= 1; + if (!(val & (1 << i))) vrev |= 1; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = 
_mm_set1_epi8((btd->mask1)&0xff); + one = _mm_set1_epi8(1); + + while (d8 < (uint8_t *) rd.d_top) { + prod = _mm_setzero_si128(); + v = _mm_set1_epi8(vrev); + ta = _mm_load_si128((__m128i *) s8); + tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); + d8 += 16; + s8 += 16; + } + gf_do_final_region_alignment(&rd); +} +#endif + +/* +static +void +gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ +#ifdef INTEL_SSE2 + uint8_t *d8, *s8, tb; + __m128i pp, m1, m2, t1, t2, va, vb; + struct gf_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_load_si128 ((__m128i *)(d8)); + tb = val; + while (1) { + if (tb & 1) vb = _mm_xor_si128(vb, va); + tb >>= 1; + if (tb == 0) break; + SSE_AB2(pp, m1, m2, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + } else { + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_setzero_si128 (); + tb = val; + while (1) { + if (tb & 1) vb = _mm_xor_si128(vb, va); + tb >>= 1; + if (tb == 0) break; + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); + t2 = _mm_and_si128(va, m2); + t2 = _mm_sub_epi64 ( + _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + } + gf_do_final_region_alignment(&rd); +#endif +} +*/ + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = 
_mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = va; + SSE_AB2(pp, m1, va, t1, t2); + va = _mm_xor_si128(va, vb); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = va; + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + va = _mm_xor_si128(va, vb); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = 
_mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = va; + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(va, vb); + SSE_AB2(pp, m1, va, t1, t2); + va = _mm_xor_si128(va, vb); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(vb, va); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + vb = va; + SSE_AB2(pp, m1, va, t1, t2); + va = _mm_xor_si128(va, vb); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint8_t *d8, *s8, tb; + __m128i pp, m1, m2, t1, t2, va, vb; + struct gf_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + switch (val) { + case 2: + if (!xor) { + gf_w4_bytwo_b_sse_region_2_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_2_xor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + case 3: + if (!xor) { + gf_w4_bytwo_b_sse_region_3_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_3_xor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + case 4: + if (!xor) { + gf_w4_bytwo_b_sse_region_4_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_4_xor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + case 5: + if (!xor) { + gf_w4_bytwo_b_sse_region_5_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_5_xor(&rd, btd); + } + 
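+      /* (Constants 2..7 go through the unrolled helpers above: each SSE_AB2 doubles every
+         4-bit element of the 128-bit register at once, so e.g. val = 5 = 4+1 is two doublings
+         XORed with the original word, and val = 7 = 4+2+1 keeps a running XOR after every
+         doubling.) */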
gf_do_final_region_alignment(&rd); + return; + case 6: + if (!xor) { + gf_w4_bytwo_b_sse_region_6_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_6_xor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + case 7: + if (!xor) { + gf_w4_bytwo_b_sse_region_7_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_7_xor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + } + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_load_si128 ((__m128i *)(d8)); + tb = val; + while (1) { + if (tb & 1) vb = _mm_xor_si128(vb, va); + tb >>= 1; + if (tb == 0) break; + SSE_AB2(pp, m1, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + } else { + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_setzero_si128 (); + tb = val; + while (1) { + if (tb & 1) vb = _mm_xor_si128(vb, va); + tb >>= 1; + if (tb == 0) break; + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); + t2 = _mm_and_si128(va, m2); + t2 = _mm_sub_epi64 ( + _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +static +void +gf_w4_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, tb, prod; + struct gf_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + switch (val) { + case 1: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + *d64 ^= *s64; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + *d64 = *s64; + d64++; + s64++; + } + } + break; + case 2: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 3: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 4: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 5: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, 
btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + case 6: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + case 7: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + case 8: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 9: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 10: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 11: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } 
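+        /* (val = 11 = 8+2+1: prod collects a and 2a, two further AB2 doublings leave
+           ta = 8a, so ta ^ prod is 11*a in every nibble of the 64-bit word.) */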
+ } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 12: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 13: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 14: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 15: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + default: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + prod = *d64 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + prod = 0 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } + break; + } + gf_do_final_region_alignment(&rd); +} + +static +int 
gf_w4_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + uint64_t ip, m1, m2; + struct gf_bytwo_data *btd; + + h = (gf_internal_t *) gf->scratch; + btd = (struct gf_bytwo_data *) (h->private); + ip = h->prim_poly & 0xf; + m1 = 0xe; + m2 = 0x8; + btd->prim_poly = 0; + btd->mask1 = 0; + btd->mask2 = 0; + + while (ip != 0) { + btd->prim_poly |= ip; + btd->mask1 |= m1; + btd->mask2 |= m2; + ip <<= GF_FIELD_WIDTH; + m1 <<= GF_FIELD_WIDTH; + m2 <<= GF_FIELD_WIDTH; + } + + if (h->mult_type == GF_MULT_BYTWO_p) { + gf->multiply.w32 = gf_w4_bytwo_p_multiply; + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region; + #else + gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; + if (h->region_type & GF_REGION_SSE) + return 0; + #endif + } else { + gf->multiply.w32 = gf_w4_bytwo_b_multiply; + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region; + #else + gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; + if (h->region_type & GF_REGION_SSE) + return 0; + #endif + } + return 1; +} + + +static +int gf_w4_cfm_init(gf_t *gf) +{ +#if defined(INTEL_SSE4_PCLMUL) + gf->multiply.w32 = gf_w4_clm_multiply; + return 1; +#endif + return 0; +} + +static +int gf_w4_shift_init(gf_t *gf) +{ + gf->multiply.w32 = gf_w4_shift_multiply; + return 1; +} + +/* JSP: I'm putting all error-checking into gf_error_check(), so you don't + have to do error checking in scratch_size or in init */ + +int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + int issse3 = 0; + +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif + + switch(mult_type) + { + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t) + sizeof(struct gf_bytwo_data); + break; + case GF_MULT_DEFAULT: + case GF_MULT_TABLE: + if (region_type == GF_REGION_CAUCHY) { + return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; + } + + if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE; + + if (region_type & GF_REGION_DOUBLE_TABLE) { + return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64; + } else if (region_type & GF_REGION_QUAD_TABLE) { + if ((region_type & GF_REGION_LAZY) == 0) { + return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_data) + 64; + } else { + return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_lazy_data) + 64; + } + } else { + return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; + } + break; + + case GF_MULT_LOG_TABLE: + return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; + break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_SHIFT: + return sizeof(gf_internal_t); + break; + default: + return 0; + } + return 0; +} + +int +gf_w4_init (gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + if (h->prim_poly == 0) h->prim_poly = 0x13; + h->prim_poly |= 0x10; + gf->multiply.w32 = NULL; + gf->divide.w32 = NULL; + gf->inverse.w32 = NULL; + gf->multiply_region.w32 = NULL; + gf->extract_word.w32 = gf_w4_extract_word; + + switch(h->mult_type) { + case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if 
(gf_w4_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_TABLE: if (gf_w4_table_init(gf) == 0) return 0; break; + default: return 0; + } + + if (h->divide_type == GF_DIVIDE_EUCLID) { + gf->divide.w32 = gf_w4_divide_from_inverse; + gf->inverse.w32 = gf_w4_euclid; + } else if (h->divide_type == GF_DIVIDE_MATRIX) { + gf->divide.w32 = gf_w4_divide_from_inverse; + gf->inverse.w32 = gf_w4_matrix; + } + + if (gf->divide.w32 == NULL) { + gf->divide.w32 = gf_w4_divide_from_inverse; + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_euclid; + } + + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_inverse_from_divide; + + if (h->region_type == GF_REGION_CAUCHY) { + gf->multiply_region.w32 = gf_wgen_cauchy_region; + gf->extract_word.w32 = gf_wgen_extract_word; + } + + if (gf->multiply_region.w32 == NULL) { + gf->multiply_region.w32 = gf_w4_multiply_region_from_single; + } + + return 1; +} + +/* Inline setup functions */ + +uint8_t *gf_w4_get_mult_table(gf_t *gf) +{ + gf_internal_t *h; + struct gf_single_table_data *std; + + h = (gf_internal_t *) gf->scratch; + if (gf->multiply.w32 == gf_w4_single_table_multiply) { + std = (struct gf_single_table_data *) h->private; + return (uint8_t *) std->mult; + } + return NULL; +} + +uint8_t *gf_w4_get_div_table(gf_t *gf) +{ + gf_internal_t *h; + struct gf_single_table_data *std; + + h = (gf_internal_t *) gf->scratch; + if (gf->multiply.w32 == gf_w4_single_table_multiply) { + std = (struct gf_single_table_data *) h->private; + return (uint8_t *) std->div; + } + return NULL; +} + diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w64.c b/src/erasure-code/jerasure/gf-complete/src/gf_w64.c new file mode 100644 index 000000000000..f04daf05df67 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w64.c @@ -0,0 +1,2244 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w64.c + * + * Routines for 64-bit Galois fields + */ + +#include "gf_int.h" +#include +#include + +#define GF_FIELD_WIDTH (64) +#define GF_FIRST_BIT (1ULL << 63) + +#define GF_BASE_FIELD_WIDTH (32) +#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH) +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 + +struct gf_w64_group_data { + uint64_t *reduce; + uint64_t *shift; + uint64_t *memory; +}; + +struct gf_split_4_64_lazy_data { + uint64_t tables[16][16]; + uint64_t last_value; +}; + +struct gf_split_8_64_lazy_data { + uint64_t tables[8][(1<<8)]; + uint64_t last_value; +}; + +struct gf_split_16_64_lazy_data { + uint64_t tables[4][(1<<16)]; + uint64_t last_value; +}; + +struct gf_split_8_8_data { + uint64_t tables[15][256][256]; +}; + +static +inline +gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a) +{ + return gf->divide.w64(gf, 1, a); +} + +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? 
" " : " ", blah[15-ii]); printf("\n"); } + +static +inline +gf_val_64_t gf_w64_divide_from_inverse (gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + b = gf->inverse.w64(gf, b); + return gf->multiply.w64(gf, a, b); +} + +static +void +gf_w64_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +xor) +{ + int i; + gf_val_64_t *s64; + gf_val_64_t *d64; + + s64 = (gf_val_64_t *) src; + d64 = (gf_val_64_t *) dest; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + if (xor) { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) { + d64[i] ^= gf->multiply.w64(gf, val, s64[i]); + } + } else { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) { + d64[i] = gf->multiply.w64(gf, val, s64[i]); + } + } +} + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +xor) +{ + gf_val_64_t *s64, *d64, *top; + gf_region_data rd; + + __m128i a, b; + __m128i result, r1; + __m128i prim_poly; + __m128i w; + __m128i m1, m2, m3, m4; + gf_internal_t * h = gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0); + m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff); + m2 = _mm_slli_si128(m1, 4); + m2 = _mm_or_si128(m1, m2); + m3 = _mm_slli_si128(m1, 8); + m4 = _mm_slli_si128(m3, 4); + + s64 = (gf_val_64_t *) rd.s_start; + d64 = (gf_val_64_t *) rd.d_start; + top = (gf_val_64_t *) rd.d_top; + + if (xor) { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + r1 = _mm_load_si128((__m128i *) d64); + result = _mm_xor_si128(r1, result); + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } else { + while (d64 != top) { + + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +xor) +{ + 
gf_val_64_t *s64, *d64, *top; + gf_region_data rd; + + __m128i a, b; + __m128i result, r1; + __m128i prim_poly; + __m128i w; + __m128i m1, m3, m4; + gf_internal_t * h = gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0); + m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff); + m3 = _mm_slli_si128(m1, 8); + m4 = _mm_slli_si128(m3, 4); + + s64 = (gf_val_64_t *) rd.s_start; + d64 = (gf_val_64_t *) rd.d_start; + top = (gf_val_64_t *) rd.d_top; + + if (xor) { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + r1 = _mm_load_si128((__m128i *) d64); + result = _mm_xor_si128(r1, result); + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } else { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +static + inline +gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b) +{ + gf_val_64_t e_i, e_im1, e_ip1; + gf_val_64_t d_i, d_im1, d_ip1; + gf_val_64_t y_i, y_im1, y_ip1; + gf_val_64_t c_i; + gf_val_64_t one = 1; + + if (b == 0) return -1; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i = b; + d_im1 = 64; + for (d_i = d_im1-1; ((one << d_i) & e_i) == 0; d_i--) ; + y_i = 1; + y_im1 = 0; + + while (e_i != 1) { + + e_ip1 = e_im1; + d_ip1 = d_im1; + c_i = 0; + + while (d_ip1 >= d_i) { + c_i ^= (one << (d_ip1 - d_i)); + e_ip1 ^= (e_i << (d_ip1 - d_i)); + d_ip1--; + if (e_ip1 == 0) return 0; + while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--; + } + + y_ip1 = y_im1 ^ gf->multiply.w64(gf, c_i, y_i); + y_im1 = y_i; + y_i = y_ip1; + + e_im1 = e_i; + d_im1 = d_i; + e_i = e_ip1; + d_i = d_ip1; + } + + return y_i; +} + +/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only + include it for completeness. It does have the feature that it requires no + extra memory. 
+*/ + +static +inline +gf_val_64_t +gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) +{ + uint64_t pl, pr, ppl, ppr, i, a, bl, br, one, lbit; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + ppr = h->prim_poly; + + /* Allen: set leading one of primitive polynomial */ + + ppl = 1; + + a = a64; + bl = 0; + br = b64; + one = 1; + lbit = (one << 63); + + pl = 0; /* Allen: left side of product */ + pr = 0; /* Allen: right side of product */ + + /* Allen: unlike the corresponding functions for smaller word sizes, + * this loop carries out the initial carryless multiply by + * shifting b itself rather than simply looking at successively + * higher shifts of b */ + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (one << i)) { + pl ^= bl; + pr ^= br; + } + + bl <<= 1; + if (br & lbit) bl ^= 1; + br <<= 1; + } + + /* Allen: the name of the variable "one" is no longer descriptive at this point */ + + one = lbit >> 1; + ppl = (h->prim_poly >> 2) | one; + ppr = (h->prim_poly << (GF_FIELD_WIDTH-2)); + while (one != 0) { + if (pl & one) { + pl ^= ppl; + pr ^= ppr; + } + one >>= 1; + ppr >>= 1; + if (ppl & 1) ppr ^= lbit; + ppl >>= 1; + } + return pr; +} + +/* + * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply. + */ + +static +inline +gf_val_64_t +gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) +{ + gf_val_64_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0); + b = _mm_insert_epi64 (a, b64, 0); + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Mask off the high order 32 bits using subtraction of the polynomial. + * NOTE: this part requires that the polynomial have at least 32 leading 0 bits. 
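+   *       (Why folding works: with the implicit x^64 term dropped, x^64 is congruent to the
+   *       low bits of the primitive polynomial, so each 32-bit chunk of the 128-bit product
+   *       that sits above bit 63 can be carry-less multiplied by prim_poly and XORed back in;
+   *       two such folds bring the product below 64 bits.)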
+ */ + + /* Adam: We cant include the leading one in the 64 bit pclmul, + so we need to split up the high 8 bytes of the result into two + parts before we multiply them with the prim_poly.*/ + + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + + rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); +#endif + return rv; +} + +static +inline +gf_val_64_t +gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) +{ + gf_val_64_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0); + b = _mm_insert_epi64 (a, b64, 0); + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + + rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); +#endif + return rv; +} + + + void +gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ +#if defined(INTEL_SSE4_PCLMUL) + gf_internal_t *h; + uint8_t *s8, *d8, *dtop; + gf_region_data rd; + __m128i v, b, m, prim_poly, c, fr, w, result; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + dtop = (uint8_t *) rd.d_top; + + v = _mm_insert_epi64(_mm_setzero_si128(), val, 0); + m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (xor) { + while (d8 != dtop) { + b = _mm_load_si128((__m128i *) s8); + result = _mm_clmulepi64_si128 (b, v, 0); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + fr = _mm_xor_si128 (result, w); + fr = _mm_and_si128 (fr, m); + + result = _mm_clmulepi64_si128 (b, v, 1); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + result = _mm_slli_si128 (result, 8); + fr = _mm_xor_si128 (result, fr); + result = _mm_load_si128((__m128i *) d8); + fr = _mm_xor_si128 (result, fr); + + _mm_store_si128((__m128i *) d8, fr); + d8 += 16; + s8 += 16; + } + } else { + while (d8 < dtop) { 
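+      /* As in the xor branch above, minus the final load/XOR of the destination: each
+         128-bit load holds two 64-bit words, the imm8 selector (0 then 1) of
+         _mm_clmulepi64_si128 multiplies the low and then the high word by val, each
+         product is folded below 64 bits with two pclmul-by-prim_poly steps, and the two
+         reduced words are packed into fr before the store. */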
+ b = _mm_load_si128((__m128i *) s8); + result = _mm_clmulepi64_si128 (b, v, 0); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + fr = _mm_xor_si128 (result, w); + fr = _mm_and_si128 (fr, m); + + result = _mm_clmulepi64_si128 (b, v, 1); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + result = _mm_slli_si128 (result, 8); + fr = _mm_xor_si128 (result, fr); + + _mm_store_si128((__m128i *) d8, fr); + d8 += 16; + s8 += 16; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +void +gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ + gf_internal_t *h; + struct gf_split_4_64_lazy_data *ld; + int i, j, k; + uint64_t pp, v, s, *s64, *d64, *top; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + ld = (struct gf_split_4_64_lazy_data *) h->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + if (ld->last_value != val) { + v = val; + for (i = 0; i < 16; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + } + } + ld->last_value = val; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + while (d64 != top) { + v = (xor) ? *d64 : 0; + s = *s64; + i = 0; + while (s != 0) { + v ^= ld->tables[i][s&0xf]; + s >>= 4; + i++; + } + *d64 = v; + d64++; + s64++; + } + gf_do_final_region_alignment(&rd); +} + +static +inline +uint64_t +gf_w64_split_8_8_multiply (gf_t *gf, uint64_t a64, uint64_t b64) +{ + uint64_t product, i, j, mask, tb; + gf_internal_t *h; + struct gf_split_8_8_data *d8; + + h = (gf_internal_t *) gf->scratch; + d8 = (struct gf_split_8_8_data *) h->private; + product = 0; + mask = 0xff; + + for (i = 0; a64 != 0; i++) { + tb = b64; + for (j = 0; tb != 0; j++) { + product ^= d8->tables[i+j][a64&mask][tb&mask]; + tb >>= 8; + } + a64 >>= 8; + } + return product; +} + +void +gf_w64_split_8_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ + gf_internal_t *h; + struct gf_split_8_64_lazy_data *ld; + int i, j, k; + uint64_t pp, v, s, *s64, *d64, *top; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + ld = (struct gf_split_8_64_lazy_data *) h->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + if (ld->last_value != val) { + v = val; + for (i = 0; i < 8; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? 
((v << 1) ^ pp) : (v << 1); + } + } + } + ld->last_value = val; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + while (d64 != top) { + v = (xor) ? *d64 : 0; + s = *s64; + i = 0; + while (s != 0) { + v ^= ld->tables[i][s&0xff]; + s >>= 8; + i++; + } + *d64 = v; + d64++; + s64++; + } + gf_do_final_region_alignment(&rd); +} + +void +gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ + gf_internal_t *h; + struct gf_split_16_64_lazy_data *ld; + int i, j, k; + uint64_t pp, v, s, *s64, *d64, *top; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + ld = (struct gf_split_16_64_lazy_data *) h->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + if (ld->last_value != val) { + v = val; + for (i = 0; i < 4; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < (1<<16); j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + } + } + ld->last_value = val; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + while (d64 != top) { + v = (xor) ? *d64 : 0; + s = *s64; + i = 0; + while (s != 0) { + v ^= ld->tables[i][s&0xffff]; + s >>= 16; + i++; + } + *d64 = v; + d64++; + s64++; + } + gf_do_final_region_alignment(&rd); +} + +static +int gf_w64_shift_init(gf_t *gf) +{ + gf->multiply.w64 = gf_w64_shift_multiply; + gf->inverse.w64 = gf_w64_euclid; + gf->multiply_region.w64 = gf_w64_multiply_region_from_single; + return 1; +} + +static +int gf_w64_cfm_init(gf_t *gf) +{ + gf->inverse.w64 = gf_w64_euclid; + gf->multiply_region.w64 = gf_w64_multiply_region_from_single; + +#if defined(INTEL_SSE4_PCLMUL) + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_2; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_4; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; + } else { + return 0; + } + return 1; +#endif + + return 0; +} + +static +void +gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h) +{ + int i; + uint64_t j; + uint64_t one = 1; + int g_s; + + g_s = h->arg1; + shift[0] = 0; + + for (i = 1; i < (1 << g_s); i <<= 1) { + for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; + if (val & (one << 63)) { + val <<= 1; + val ^= h->prim_poly; + } else { + val <<= 1; + } + } +} + +static +inline +gf_val_64_t +gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + uint64_t top, bot, mask, tp; + int g_s, g_r, lshift, rshift; + struct gf_w64_group_data *gd; + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + g_r = h->arg2; + gd = (struct gf_w64_group_data *) h->private; + gf_w64_group_set_shift_tables(gd->shift, b, h); + + mask = ((1 << g_s) - 1); + top = 0; + bot = gd->shift[a&mask]; + a >>= g_s; + + if (a == 0) return bot; + lshift = 0; + rshift = 64; + + do { /* Shifting out is straightfoward */ + lshift += g_s; + rshift -= g_s; + tp = gd->shift[a&mask]; + top ^= (tp >> rshift); + bot ^= (tp << lshift); + a >>= g_s; + } while (a != 0); + + /* Reducing is a 
bit gross, because I don't zero out the index bits of top. + The reason is that we throw top away. Even better, that last (tp >> rshift) + is going to be ignored, so it doesn't matter how (tp >> 64) is implemented. */ + + lshift = ((lshift-1) / g_r) * g_r; + rshift = 64 - lshift; + mask = (1 << g_r) - 1; + while (lshift >= 0) { + tp = gd->reduce[(top >> lshift) & mask]; + top ^= (tp >> rshift); + bot ^= (tp << lshift); + lshift -= g_r; + rshift += g_r; + } + + return bot; +} + +static +void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + int i, fzb; + uint64_t a64, smask, rmask, top, bot, tp; + int lshift, rshift, g_s, g_r; + gf_region_data rd; + uint64_t *s64, *d64, *dtop; + struct gf_w64_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gd = (struct gf_w64_group_data *) h->private; + g_s = h->arg1; + g_r = h->arg2; + gf_w64_group_set_shift_tables(gd->shift, val, h); + + for (i = 63; !(val & (1ULL << i)); i--) ; + i += g_s; + + /* i is the bit position of the first zero bit in any element of + gd->shift[] */ + + if (i > 64) i = 64; + + fzb = i; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + dtop = (uint64_t *) rd.d_top; + + smask = (1 << g_s) - 1; + rmask = (1 << g_r) - 1; + + while (d64 < dtop) { + a64 = *s64; + + top = 0; + bot = gd->shift[a64&smask]; + a64 >>= g_s; + i = fzb; + + if (a64 != 0) { + lshift = 0; + rshift = 64; + + do { + lshift += g_s; + rshift -= g_s; + tp = gd->shift[a64&smask]; + top ^= (tp >> rshift); + bot ^= (tp << lshift); + a64 >>= g_s; + } while (a64 != 0); + i += lshift; + + lshift = ((i-64-1) / g_r) * g_r; + rshift = 64 - lshift; + while (lshift >= 0) { + tp = gd->reduce[(top >> lshift) & rmask]; + top ^= (tp >> rshift); + bot ^= (tp << lshift); + lshift -= g_r; + rshift += g_r; + } + } + + if (xor) bot ^= *d64; + *d64 = bot; + d64++; + s64++; + } + gf_do_final_region_alignment(&rd); +} + +static +inline +gf_val_64_t +gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + int leftover, rs; + uint64_t p, l, ind, a64; + int bits_left; + int g_s; + + struct gf_w64_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + + gd = (struct gf_w64_group_data *) h->private; + gf_w64_group_set_shift_tables(gd->shift, b, h); + + leftover = 64 % g_s; + if (leftover == 0) leftover = g_s; + + rs = 64 - leftover; + a64 = a; + ind = a64 >> rs; + a64 <<= leftover; + p = gd->shift[ind]; + + bits_left = rs; + rs = 64 - g_s; + + while (bits_left > 0) { + bits_left -= g_s; + ind = a64 >> rs; + a64 <<= g_s; + l = p >> rs; + p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); + } + return p; +} + +static +void gf_w64_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + int leftover, rs; + uint64_t p, l, ind, a64; + int bits_left; + int g_s; + gf_region_data rd; + uint64_t *s64, *d64, *top; + struct gf_w64_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gd = (struct gf_w64_group_data *) h->private; + g_s = h->arg1; + gf_w64_group_set_shift_tables(gd->shift, val, h); + + gf_set_region_data(&rd, gf, 
src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + leftover = 64 % g_s; + if (leftover == 0) leftover = g_s; + + while (d64 < top) { + rs = 64 - leftover; + a64 = *s64; + ind = a64 >> rs; + a64 <<= leftover; + p = gd->shift[ind]; + + bits_left = rs; + rs = 64 - g_s; + + while (bits_left > 0) { + bits_left -= g_s; + ind = a64 >> rs; + a64 <<= g_s; + l = p >> rs; + p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); + } + if (xor) p ^= *d64; + *d64 = p; + d64++; + s64++; + } + gf_do_final_region_alignment(&rd); +} + + +static +int gf_w64_group_init(gf_t *gf) +{ + uint64_t i, j, p, index; + struct gf_w64_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + int g_r, g_s; + + g_s = h->arg1; + g_r = h->arg2; + + gd = (struct gf_w64_group_data *) h->private; + gd->shift = (uint64_t *) (&(gd->memory)); + gd->reduce = gd->shift + (1 << g_s); + + gd->reduce[0] = 0; + for (i = 0; i < (1 << g_r); i++) { + p = 0; + index = 0; + for (j = 0; j < g_r; j++) { + if (i & (1 << j)) { + p ^= (h->prim_poly << j); + index ^= (1 << j); + if (j > 0) index ^= (h->prim_poly >> (64-j)); + } + } + gd->reduce[index] = p; + } + + if (g_s == g_r) { + gf->multiply.w64 = gf_w64_group_s_equals_r_multiply; + gf->multiply_region.w64 = gf_w64_group_s_equals_r_multiply_region; + } else { + gf->multiply.w64 = gf_w64_group_multiply; + gf->multiply_region.w64 = gf_w64_group_multiply_region; + } + gf->divide.w64 = NULL; + gf->inverse.w64 = gf_w64_euclid; + + return 1; +} + +static +gf_val_64_t gf_w64_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint64_t *r64, rv; + + r64 = (uint64_t *) start; + rv = r64[index]; + return rv; +} + +static +gf_val_64_t gf_w64_composite_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint64_t a, b, *r64; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r64 = (uint64_t *) start; + if (r64 + index < (uint64_t *) rd.d_start) return r64[index]; + if (r64 + index >= (uint64_t *) rd.d_top) return r64[index]; + index -= (((uint64_t *) rd.d_start) - r64); + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); + b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); + return (a | ((uint64_t)b << 32)); +} + +static +gf_val_64_t gf_w64_split_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int i; + uint64_t *r64, rv; + uint8_t *r8; + gf_region_data rd; + + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 128); + r64 = (uint64_t *) start; + if (r64 + index < (uint64_t *) rd.d_start) return r64[index]; + if (r64 + index >= (uint64_t *) rd.d_top) return r64[index]; + index -= (((uint64_t *) rd.d_start) - r64); + r8 = (uint8_t *) rd.d_start; + r8 += ((index & 0xfffffff0)*8); + r8 += (index & 0xf); + r8 += 112; + rv =0; + for (i = 0; i < 8; i++) { + rv <<= 8; + rv |= *r8; + r8 -= 16; + } + return rv; +} + +static +inline +gf_val_64_t +gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + uint64_t prod, pp, bmask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + bmask = 0x8000000000000000ULL; + + while (1) { + if (a & 1) prod ^= b; + a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else 
{ + b <<= 1; + } + } +} + +static +inline +gf_val_64_t +gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + uint64_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + + /* changed from declare then shift to just declare.*/ + + pmask = 0x8000000000000000ULL; + amask = 0x8000000000000000ULL; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + +static +void +gf_w64_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, ta, prod, amask, pmask, pp; + gf_region_data rd; + gf_internal_t *h; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + pmask = 0x80000000; + pmask <<= 32; + pp = h->prim_poly; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = pmask; + ta = *s64; + while (amask != 0) { + prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = pmask; + ta = *s64; + while (amask != 0) { + prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 = prod; + d64++; + s64++; + } + } + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, ta, tb, prod, bmask, pp; + gf_region_data rd; + gf_internal_t *h; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + bmask = 0x80000000; + bmask <<= 32; + pp = h->prim_poly; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + tb = val; + ta = *s64; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1); + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + tb = val; + ta = *s64; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + ta = (ta & bmask) ? 
((ta << 1) ^ pp) : (ta << 1); + } + *d64 = prod; + d64++; + s64++; + } + } + gf_do_final_region_alignment(&rd); +} + +#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ + t2 = _mm_and_si128(va, m2); \ + t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } + +#define BYTWO_P_ONESTEP {\ + SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi64(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } + + +void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ +#ifdef INTEL_SSE2 + int i; + uint8_t *s8, *d8; + uint64_t vrev, one64; + uint64_t amask; + __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; + gf_region_data rd; + gf_internal_t *h; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + one64 = 1; + vrev = 0; + for (i = 0; i < 64; i++) { + vrev <<= 1; + if (!(val & (one64 << i))) vrev |= 1; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + amask = -1; + amask ^= 1; + pp = _mm_set1_epi64x(h->prim_poly); + m1 = _mm_set1_epi64x(amask); + m2 = _mm_set1_epi64x(one64 << 63); + one = _mm_set1_epi64x(1); + + while (d8 < (uint8_t *) rd.d_top) { + prod = _mm_setzero_si128(); + v = _mm_set1_epi64x(vrev); + ta = _mm_load_si128((__m128i *) s8); + tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); + d8 += 16; + s8 += 16; + } + gf_do_final_region_alignment(&rd); +#endif +} + +#ifdef INTEL_SSE2 +static +void +gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) +{ + uint64_t one64, amask; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + gf_internal_t *h; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + h = (gf_internal_t *) rd->gf->scratch; + one64 = 1; + amask = -1; + amask ^= 1; + pp = _mm_set1_epi64x(h->prim_poly); + m1 = _mm_set1_epi64x(amask); + m2 = _mm_set1_epi64x(one64 << 63); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, 
t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) +{ + uint64_t one64, amask; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; + gf_internal_t *h; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + h = (gf_internal_t *) rd->gf->scratch; + one64 = 1; + amask = -1; + amask ^= 1; + pp = _mm_set1_epi64x(h->prim_poly); + m1 = _mm_set1_epi64x(amask); + m2 = _mm_set1_epi64x(one64 << 63); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + uint64_t itb, amask, one64; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + gf_region_data rd; + gf_internal_t *h; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + if (val == 2) { + if (xor) { + gf_w64_bytwo_b_sse_region_2_xor(&rd); + } else { + gf_w64_bytwo_b_sse_region_2_noxor(&rd); + } + gf_do_final_region_alignment(&rd); + return; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + h = (gf_internal_t *) gf->scratch; + + one64 = 1; + amask = -1; + amask ^= 1; + pp = _mm_set1_epi64x(h->prim_poly); + m1 = _mm_set1_epi64x(amask); + m2 = _mm_set1_epi64x(one64 << 63); + + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = (!xor) ? 
_mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); + itb = val; + while (1) { + if (itb & 1) vb = _mm_xor_si128(vb, va); + itb >>= 1; + if (itb == 0) break; + SSE_AB2(pp, m1, m2, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + + gf_do_final_region_alignment(&rd); +} +#endif + + +static +int gf_w64_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if (h->mult_type == GF_MULT_BYTWO_p) { + gf->multiply.w64 = gf_w64_bytwo_p_multiply; + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; + #else + gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif + } else { + gf->multiply.w64 = gf_w64_bytwo_b_multiply; + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; + #else + gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif + } + gf->inverse.w64 = gf_w64_euclid; + return 1; +} + + +static +gf_val_64_t +gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t b0 = b & 0x00000000ffffffff; + uint32_t b1 = (b & 0xffffffff00000000) >> 32; + uint32_t a0 = a & 0x00000000ffffffff; + uint32_t a1 = (a & 0xffffffff00000000) >> 32; + uint32_t a1b1; + + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + return ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); +} + +/* + * Composite field division trick (explained in 2007 tech report) + * + * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 + * + * let c = b^-1 + * + * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) + * + * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 + * + * let d = b1c1 and d+1 = b0c0 + * + * solve s*b1c1+b1c0+b0c1 = 0 + * + * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 + * + * c0 = (d+1)b0^-1 + * c1 = d*b1^-1 + * + * a / b = a * c + */ + +static +gf_val_64_t +gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t a0 = a & 0x00000000ffffffff; + uint32_t a1 = (a & 0xffffffff00000000) >> 32; + uint32_t c0, c1, d, tmp; + uint64_t c; + uint32_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w32(base_gf, a1); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w32(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w32(base_gf, a1); + a0inv = base_gf->inverse.w32(base_gf, a0); + + d = base_gf->multiply.w32(base_gf, a1, a0inv); + + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); + tmp = base_gf->inverse.w32(base_gf, tmp); + + d = base_gf->multiply.w32(base_gf, d, tmp); + + c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv); + c1 = base_gf->multiply.w32(base_gf, d, a1inv); + } + + c = c0 | ((uint64_t)c1 << 32); + + return c; +} + +static +void +gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + 
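The composite routines above (gf_w64_composite_multiply and gf_w64_composite_inverse, together with the division-trick comment) treat a 64-bit element as a degree-one polynomial a1*x + a0 over GF(2^32) and reduce products modulo x^2 + s*x + 1, where s is the composite prim_poly: the a1*b1*x^2 term reduces to s*a1*b1*x + a1*b1, which is exactly why a1b1 lands in the low word and multiply.w32(a1b1, h->prim_poly) in the high word. Below is a standalone sketch of the same formula at the smaller width GF((2^4)^2), so the base-field multiply fits in a few lines; gf4_mul, composite_mul, the base polynomial 0x13 and s = 3 are illustrative assumptions, not the values the library actually selects.

#include <stdint.h>
#include <stdio.h>

/* GF(2^4) multiply by shift-and-reduce; 0x13 = x^4+x+1 is an assumed
 * irreducible polynomial for the base field (illustration only). */
static uint8_t gf4_mul(uint8_t a, uint8_t b)
{
    uint8_t p = 0;
    int i;
    for (i = 0; i < 4; i++) {
        if (b & 1) p ^= a;
        b >>= 1;
        a <<= 1;
        if (a & 0x10) a ^= 0x13;
    }
    return p & 0xf;
}

/* Composite GF((2^4)^2) multiply mirroring the w64 formula:
 *   low  = a0*b0 ^ a1*b1
 *   high = a1*b0 ^ a0*b1 ^ s*(a1*b1)
 * s plays the role of the composite prim_poly; s = 3 is assumed here. */
static uint8_t composite_mul(uint8_t a, uint8_t b, uint8_t s)
{
    uint8_t a0 = a & 0xf, a1 = a >> 4;
    uint8_t b0 = b & 0xf, b1 = b >> 4;
    uint8_t a1b1 = gf4_mul(a1, b1);

    return (uint8_t)((gf4_mul(a0, b0) ^ a1b1) |
                     ((gf4_mul(a1, b0) ^ gf4_mul(a0, b1) ^ gf4_mul(s, a1b1)) << 4));
}

int main(void)
{
    uint8_t a, b, s = 3;

    /* Sanity checks: 1 is the multiplicative identity, and the formula is
     * GF(2)-linear in each argument, so it distributes over XOR. */
    for (a = 1; a != 0; a++) {
        if (composite_mul(a, 1, s) != a) { printf("identity failed\n"); return 1; }
        for (b = 1; b != 0; b++) {
            if ((composite_mul(a, 7, s) ^ composite_mul(b, 7, s)) !=
                composite_mul((uint8_t)(a ^ b), 7, s)) {
                printf("distributivity failed\n");
                return 1;
            }
        }
    }
    printf("composite formula behaves like the w64 code path\n");
    return 0;
}

These checks hold whether or not x^2 + s*x + 1 happens to be irreducible for the chosen s; picking an s that makes it irreducible is what the init code above delegates to gf_composite_get_default_poly in the real code paths.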
gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t b0 = val & 0x00000000ffffffff; + uint32_t b1 = (val & 0xffffffff00000000) >> 32; + uint64_t *s64, *d64; + uint64_t *top; + uint64_t a0, a1, a1b1; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + + s64 = rd.s_start; + d64 = rd.d_start; + top = rd.d_top; + + if (xor) { + while (d64 < top) { + a0 = *s64 & 0x00000000ffffffff; + a1 = (*s64 & 0xffffffff00000000) >> 32; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d64 ^= ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); + s64++; + d64++; + } + } else { + while (d64 < top) { + a0 = *s64 & 0x00000000ffffffff; + a1 = (*s64 & 0xffffffff00000000) >> 32; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d64 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); + s64++; + d64++; + } + } +} + +static +void +gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + gf_val_32_t val0 = val & 0x00000000ffffffff; + gf_val_32_t val1 = (val & 0xffffffff00000000) >> 32; + uint8_t *slow, *shigh; + uint8_t *dlow, *dhigh, *top; + int sub_reg_size; + gf_region_data rd; + + if (!xor) { + memset(dest, 0, bytes); + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + slow = (uint8_t *) rd.s_start; + dlow = (uint8_t *) rd.d_start; + top = (uint8_t*) rd.d_top; + sub_reg_size = (top - dlow)/2; + shigh = slow + sub_reg_size; + dhigh = dlow + sub_reg_size; + + base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + + gf_do_final_region_alignment(&rd); +} + + + +static +int gf_w64_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) { + gf->multiply_region.w64 = gf_w64_composite_multiply_region_alt; + } else { + gf->multiply_region.w64 = gf_w64_composite_multiply_region; + } + + gf->multiply.w64 = gf_w64_composite_multiply; + gf->divide.w64 = NULL; + gf->inverse.w64 = gf_w64_composite_inverse; + + return 1; +} + +#ifdef INTEL_SSSE3 +static + void +gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ + gf_internal_t *h; + int i, j, k; + uint64_t pp, v, *s64, *d64, *top; + __m128i si, tables[16][8], p[8], v0, mask1; + struct gf_split_4_64_lazy_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); + 
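The SSE routine that begins just above (gf_w64_split_4_64_lazy_sse_altmap_multiply_region), whose table construction continues below, implements GF_MULT_SPLIT_TABLE with a 4/64 split: for one constant val, sixteen 16-entry tables are filled so that tables[i][v] holds val * (v << 4i), and because multiplication by val is linear over GF(2) the product for a whole word is just the XOR of one lookup per nibble. Here is a minimal scalar sketch of that decomposition, checked against a plain shift-and-reduce multiply; 0x1b is the w=64 default polynomial set later in this file, while gf64_shift_mul, split64_build and split64_mul are illustrative helpers rather than library entry points.

#include <stdint.h>
#include <stdio.h>

/* Reference shift-and-reduce multiply in GF(2^64); pp is the low part of
 * the reduction polynomial (the x^64 term is implicit). */
static uint64_t gf64_shift_mul(uint64_t a, uint64_t b, uint64_t pp)
{
    uint64_t p = 0;
    int i;
    for (i = 0; i < 64; i++) {
        if (b & 1) p ^= a;
        b >>= 1;
        if (a & 0x8000000000000000ULL) a = (a << 1) ^ pp;
        else a <<= 1;
    }
    return p;
}

/* SPLIT(4,64): tables[i][v] = val * (v << 4i), so multiplying any word by
 * val is one table lookup per nibble, XORed together. */
static void split64_build(uint64_t tables[16][16], uint64_t val, uint64_t pp)
{
    int i, v;
    for (i = 0; i < 16; i++)
        for (v = 0; v < 16; v++)
            tables[i][v] = gf64_shift_mul((uint64_t)v << (4 * i), val, pp);
}

static uint64_t split64_mul(uint64_t tables[16][16], uint64_t x)
{
    uint64_t p = 0;
    int i;
    for (i = 0; i < 16; i++)
        p ^= tables[i][(x >> (4 * i)) & 0xf];
    return p;
}

int main(void)
{
    uint64_t tables[16][16];
    uint64_t pp = 0x1b, val = 0x1234567890abcdefULL, x = 0xfedcba0987654321ULL;
    uint64_t a, b;

    split64_build(tables, val, pp);
    a = split64_mul(tables, x);
    b = gf64_shift_mul(x, val, pp);
    printf("split: %016llx  shift: %016llx  (%s)\n",
           (unsigned long long) a, (unsigned long long) b,
           a == b ? "match" : "MISMATCH");
    return 0;
}

The library builds the same tables incrementally (doubling v once per table bit) and then performs the per-nibble lookups sixteen bytes at a time with PSHUFB; the scalar version only shows why the decomposition is exact.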
gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_split_4_64_lazy_data *) h->private; + + v = val; + for (i = 0; i < 16; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 8; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); + } + } + + mask1 = _mm_set1_epi8(0xf); + + while (d64 != top) { + + if (xor) { + for (i = 0; i < 8; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2)); + } else { + for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128(); + } + i = 0; + for (k = 0; k < 8; k++) { + v0 = _mm_load_si128((__m128i *) s64); + /* MM_PRINT8("v", v0); */ + s64 += 2; + + si = _mm_and_si128(v0, mask1); + + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + } + for (i = 0; i < 8; i++) { + /* MM_PRINT8("v", p[i]); */ + _mm_store_si128((__m128i *) d64, p[i]); + d64 += 2; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#ifdef INTEL_SSE4 +static + void +gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ + gf_internal_t *h; + int i, j, k; + uint64_t pp, v, *s64, *d64, *top; + __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1; + struct gf_split_4_64_lazy_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_split_4_64_lazy_data *) h->private; + + v = val; + for (i = 0; i < 16; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? 
((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 8; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); + } + } + + mask1 = _mm_set1_epi8(0xf); + mask8 = _mm_set1_epi16(0xff); + mask16 = _mm_set1_epi32(0xffff); + + while (d64 != top) { + + for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128(); + + for (k = 0; k < 8; k++) { + st[k] = _mm_load_si128((__m128i *) s64); + s64 += 2; + } + + for (k = 0; k < 4; k ++) { + st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0)); + st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1)); + t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0); + st[k] = _mm_srli_si128(st[k], 8); + st[k+4] = _mm_slli_si128(st[k+4], 8); + st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0); + st[k] = t1; + } + +/* + printf("After pack pass 1\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + printf("\n"); + */ + + t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16)); + st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16)); + st[0] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16)); + st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16)); + st[1] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16)); + st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16)); + st[4] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16)); + st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16)); + st[5] = t1; + +/* + printf("After pack pass 2\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + printf("\n"); + */ + t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8)); + st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8)); + st[0] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8)); + st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8)); + st[2] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8)); + st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8)); + st[4] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8)); + st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8)); + st[6] = t1; + +/* + printf("After final pack pass 2\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + */ + i = 0; + for (k = 0; k < 8; k++) { + si = _mm_and_si128(st[k], mask1); + + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + st[k] = _mm_srli_epi32(st[k], 4); + si = _mm_and_si128(st[k], mask1); + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + } + + t1 = _mm_unpacklo_epi8(p[0], p[1]); + p[1] = _mm_unpackhi_epi8(p[0], p[1]); + p[0] = t1; + t1 = _mm_unpacklo_epi8(p[2], p[3]); + p[3] = _mm_unpackhi_epi8(p[2], p[3]); + p[2] = t1; + t1 = _mm_unpacklo_epi8(p[4], p[5]); + p[5] = _mm_unpackhi_epi8(p[4], p[5]); + p[4] = t1; + t1 = _mm_unpacklo_epi8(p[6], p[7]); + p[7] = _mm_unpackhi_epi8(p[6], p[7]); + p[6] = t1; + +/* + printf("After unpack pass 1:\n"); + for (i = 0; i < 8; i++) { + MM_PRINT8("v", p[i]); + } + */ + + t1 = _mm_unpacklo_epi16(p[0], p[2]); + p[2] = _mm_unpackhi_epi16(p[0], p[2]); + p[0] = t1; + t1 = 
_mm_unpacklo_epi16(p[1], p[3]); + p[3] = _mm_unpackhi_epi16(p[1], p[3]); + p[1] = t1; + t1 = _mm_unpacklo_epi16(p[4], p[6]); + p[6] = _mm_unpackhi_epi16(p[4], p[6]); + p[4] = t1; + t1 = _mm_unpacklo_epi16(p[5], p[7]); + p[7] = _mm_unpackhi_epi16(p[5], p[7]); + p[5] = t1; + +/* + printf("After unpack pass 2:\n"); + for (i = 0; i < 8; i++) { + MM_PRINT8("v", p[i]); + } + */ + + t1 = _mm_unpacklo_epi32(p[0], p[4]); + p[4] = _mm_unpackhi_epi32(p[0], p[4]); + p[0] = t1; + t1 = _mm_unpacklo_epi32(p[1], p[5]); + p[5] = _mm_unpackhi_epi32(p[1], p[5]); + p[1] = t1; + t1 = _mm_unpacklo_epi32(p[2], p[6]); + p[6] = _mm_unpackhi_epi32(p[2], p[6]); + p[2] = t1; + t1 = _mm_unpacklo_epi32(p[3], p[7]); + p[7] = _mm_unpackhi_epi32(p[3], p[7]); + p[3] = t1; + + if (xor) { + for (i = 0; i < 8; i++) { + t1 = _mm_load_si128((__m128i *) d64); + _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1)); + d64 += 2; + } + } else { + for (i = 0; i < 8; i++) { + _mm_store_si128((__m128i *) d64, p[i]); + d64 += 2; + } + } + + } + + gf_do_final_region_alignment(&rd); +} +#endif + +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); + +static +int gf_w64_split_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_split_4_64_lazy_data *d4; + struct gf_split_8_64_lazy_data *d8; + struct gf_split_8_8_data *d88; + struct gf_split_16_64_lazy_data *d16; + uint64_t p, basep; + int exp, i, j; + + h = (gf_internal_t *) gf->scratch; + + /* Defaults */ + + gf->multiply_region.w64 = gf_w64_multiply_region_from_single; + + gf->multiply.w64 = gf_w64_bytwo_p_multiply; + +#if defined(INTEL_SSE4_PCLMUL) + if ((!(h->region_type & GF_REGION_NOSSE) && + (h->arg1 == 64 || h->arg2 == 64)) || + h->mult_type == GF_MULT_DEFAULT){ + + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_2; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_4; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; + }else{ + return 0; + } + } +#endif + + gf->inverse.w64 = gf_w64_euclid; + + /* Allen: set region pointers for default mult type. Single pointers are + * taken care of above (explicitly for sse, implicitly for no sse). 
*/ + +#ifdef INTEL_SSE4 + if (h->mult_type == GF_MULT_DEFAULT) { + d4 = (struct gf_split_4_64_lazy_data *) h->private; + d4->last_value = 0; + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; + } +#else + if (h->mult_type == GF_MULT_DEFAULT) { + d8 = (struct gf_split_8_64_lazy_data *) h->private; + d8->last_value = 0; + gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region; + } +#endif + + if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) { + d4 = (struct gf_split_4_64_lazy_data *) h->private; + d4->last_value = 0; + + if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSSE)) return 0; + if(h->region_type & GF_REGION_ALTMAP) + { + #ifdef INTEL_SSSE3 + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region; + #else + return 0; + #endif + } + else //no altmap + { + #ifdef INTEL_SSE4 + if(h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; + else + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; + #else + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif + } + } + if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) { + d8 = (struct gf_split_8_64_lazy_data *) h->private; + d8->last_value = 0; + gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region; + } + if ((h->arg1 == 16 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 16)) { + d16 = (struct gf_split_16_64_lazy_data *) h->private; + d16->last_value = 0; + gf->multiply_region.w64 = gf_w64_split_16_64_lazy_multiply_region; + } + if ((h->arg1 == 8 && h->arg2 == 8)) { + d88 = (struct gf_split_8_8_data *) h->private; + gf->multiply.w64 = gf_w64_split_8_8_multiply; + + /* The performance of this guy sucks, so don't bother with a region op */ + + basep = 1; + for (exp = 0; exp < 15; exp++) { + for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0; + for (i = 0; i < 256; i++) d88->tables[exp][i][0] = 0; + d88->tables[exp][1][1] = basep; + for (i = 2; i < 256; i++) { + if (i&1) { + p = d88->tables[exp][i^1][1]; + d88->tables[exp][i][1] = p ^ basep; + } else { + p = d88->tables[exp][i>>1][1]; + d88->tables[exp][i][1] = GF_MULTBY_TWO(p); + } + } + for (i = 1; i < 256; i++) { + p = d88->tables[exp][i][1]; + for (j = 1; j < 256; j++) { + if (j&1) { + d88->tables[exp][i][j] = d88->tables[exp][i][j^1] ^ p; + } else { + d88->tables[exp][i][j] = GF_MULTBY_TWO(d88->tables[exp][i][j>>1]); + } + } + } + for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); + } + } + return 1; +} + +int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + switch(mult_type) + { + case GF_MULT_SHIFT: + return sizeof(gf_internal_t); + break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t); + break; + + case GF_MULT_DEFAULT: + + /* Allen: set the *local* arg1 and arg2, just for scratch size purposes, + * then fall through to split table scratch size code. 
*/ + +#ifdef INTEL_SSE4 + arg1 = 64; + arg2 = 4; +#else + arg1 = 64; + arg2 = 8; +#endif + + case GF_MULT_SPLIT_TABLE: + if (arg1 == 8 && arg2 == 8) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64; + } + if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64; + } + if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64; + } + + if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64; + } + return 0; + case GF_MULT_GROUP: + return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) + + sizeof(uint64_t) * (1 << arg1) + + sizeof(uint64_t) * (1 << arg2) + 64; + break; + case GF_MULT_COMPOSITE: + if (arg1 == 2) return sizeof(gf_internal_t) + 64; + return 0; + break; + default: + return 0; + } +} + +int gf_w64_init(gf_t *gf) +{ + gf_internal_t *h; + int no_default_flag = 0; + + h = (gf_internal_t *) gf->scratch; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + /* Omitting the leftmost 1 as in w=32 */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + h->prim_poly = 0x1b; + } + if (no_default_flag == 1) { + fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n"); + return 0; + } + } + + gf->multiply.w64 = NULL; + gf->divide.w64 = NULL; + gf->inverse.w64 = NULL; + gf->multiply_region.w64 = NULL; + + switch(h->mult_type) { + case GF_MULT_CARRY_FREE: if (gf_w64_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w64_shift_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w64_composite_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_w64_group_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if (gf_w64_bytwo_init(gf) == 0) return 0; break; + default: return 0; + } + if (h->divide_type == GF_DIVIDE_EUCLID) { + gf->divide.w64 = gf_w64_divide_from_inverse; + gf->inverse.w64 = gf_w64_euclid; + } + + if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) { + gf->divide.w64 = gf_w64_divide_from_inverse; + } + if (gf->inverse.w64 == NULL && gf->divide.w64 != NULL) { + gf->inverse.w64 = gf_w64_inverse_from_divide; + } + + if (h->region_type == GF_REGION_CAUCHY) return 0; + + if (h->region_type & GF_REGION_ALTMAP) { + if (h->mult_type == GF_MULT_COMPOSITE) { + gf->extract_word.w64 = gf_w64_composite_extract_word; + } else if (h->mult_type == GF_MULT_SPLIT_TABLE) { + gf->extract_word.w64 = gf_w64_split_extract_word; + } + } else { + gf->extract_word.w64 = gf_w64_extract_word; + } + + return 1; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w8.c b/src/erasure-code/jerasure/gf-complete/src/gf_w8.c new file mode 100644 index 000000000000..89ef6a2eda67 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w8.c @@ -0,0 +1,2456 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
+ * + * gf_w8.c + * + * Routines for 8-bit Galois fields + */ + +#include "gf_int.h" +#include +#include + +#define GF_FIELD_WIDTH (8) +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) +#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2)) +#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1 + +#define GF_BASE_FIELD_WIDTH (4) +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) + +struct gf_w8_logtable_data { + uint8_t log_tbl[GF_FIELD_SIZE]; + uint8_t antilog_tbl[GF_FIELD_SIZE * 2]; + uint8_t inv_tbl[GF_FIELD_SIZE]; +}; + +struct gf_w8_logzero_table_data { + short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */ + uint8_t antilog_tbl[512+512+1]; + uint8_t *div_tbl; + uint8_t *inv_tbl; +}; + +struct gf_w8_logzero_small_table_data { + short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */ + uint8_t antilog_tbl[255*3]; + uint8_t inv_tbl[GF_FIELD_SIZE]; + uint8_t *div_tbl; +}; + +struct gf_w8_composite_data { + uint8_t *mult_table; +}; + +/* Don't change the order of these relative to gf_w8_half_table_data */ + +struct gf_w8_default_data { + uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE]; + uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE]; + uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; +}; + +struct gf_w8_half_table_data { + uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE]; + uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE]; +}; + +struct gf_w8_single_table_data { + uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; +}; + +struct gf_w8_double_table_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE]; +}; + +struct gf_w8_double_table_lazy_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE]; +}; + +struct gf_w4_logtable_data { + uint8_t log_tbl[GF_BASE_FIELD_SIZE]; + uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2]; + uint8_t *antilog_tbl_div; +}; + +struct gf_w4_single_table_data { + uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE]; + uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE]; +}; + +struct gf_w8_bytwo_data { + uint64_t prim_poly; + uint64_t mask1; + uint64_t mask2; +}; + +#define AB2(ip, am1 ,am2, b, t1, t2) {\ + t1 = (b << 1) & am1;\ + t2 = b & am2; \ + t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ + b = (t1 ^ (t2 & ip));} + +#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ + t2 = _mm_and_si128(va, m2); \ + t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } + +#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); } + +static +inline +uint32_t gf_w8_inverse_from_divide (gf_t *gf, uint32_t a) +{ + return gf->divide.w32(gf, 1, a); +} + +static +inline +uint32_t gf_w8_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b) +{ + b = gf->inverse.w32(gf, b); + return gf->multiply.w32(gf, a, b); +} + +static +inline +uint32_t gf_w8_euclid (gf_t *gf, uint32_t b) +{ + uint32_t e_i, e_im1, e_ip1; + uint32_t d_i, d_im1, d_ip1; + uint32_t y_i, y_im1, y_ip1; + uint32_t c_i; + + if (b == 0) return -1; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i = b; + d_im1 = 8; + for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ; + y_i = 1; + y_im1 = 0; + + while 
(e_i != 1) { + + e_ip1 = e_im1; + d_ip1 = d_im1; + c_i = 0; + + while (d_ip1 >= d_i) { + c_i ^= (1 << (d_ip1 - d_i)); + e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; + while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; + } + + y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); + y_im1 = y_i; + y_i = y_ip1; + + e_im1 = e_i; + d_im1 = d_i; + e_i = e_ip1; + d_i = d_ip1; + } + + return y_i; +} + +static +gf_val_32_t gf_w8_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint8_t *r8; + + r8 = (uint8_t *) start; + return r8[index]; +} + +static +gf_val_32_t gf_w8_composite_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint8_t a, b; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r8 = (uint8_t *) start; + if (r8 + index < (uint8_t *) rd.d_start) return r8[index]; + if (r8 + index >= (uint8_t *) rd.d_top) return r8[index]; + index -= (((uint8_t *) rd.d_start) - r8); + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); + b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); + return (a | (b << 4)); +} + +static +inline +uint32_t gf_w8_matrix (gf_t *gf, uint32_t b) +{ + return gf_bitmatrix_inverse(b, 8, ((gf_internal_t *) (gf->scratch))->prim_poly); +} + + +static +inline +gf_val_32_t +gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2. Where + z is equal to the number of zeros after the leading 1 + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 1 byte. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result.*/ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
*/ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
*/ + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + +#endif + return rv; +} + + +static +void +gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 ^= gf->multiply.w32(gf, val, *s8); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 = gf->multiply.w32(gf, val, *s8); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, 
_mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +/* ------------------------------------------------------------ +IMPLEMENTATION: SHIFT: + +JSP: The world's dumbest multiplication algorithm. I only +include it for completeness. It does have the feature that it requires no +extra memory. 
+ */ + +static +inline + uint32_t +gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8) +{ + uint16_t product, i, pp, a, b; + gf_internal_t *h; + + a = a8; + b = b8; + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + product = 0; + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (1 << i)) product ^= (b << i); + } + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { + if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); + } + return product; +} + +static +int gf_w8_cfm_init(gf_t *gf) +{ +#if defined(INTEL_SSE4_PCLMUL) + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if ((0xe0 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_clm_multiply_2; + gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_2; + }else if ((0xc0 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_clm_multiply_3; + gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_3; + }else if ((0x80 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_clm_multiply_4; + gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_4; + }else{ + return 0; + } + return 1; +#endif + + return 0; + +} + +static +int gf_w8_shift_init(gf_t *gf) +{ + gf->multiply.w32 = gf_w8_shift_multiply; /* The others will be set automatically */ + return 1; +} + +/* ------------------------------------------------------------ +IMPLEMENTATION: LOG_TABLE: + +JSP: Kevin wrote this, and I'm converting it to my structure. +*/ + +static +inline + uint32_t +gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b) +{ + struct gf_w8_logzero_table_data *ltd; + + ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private; + return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]]; +} + +static +inline + uint32_t +gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b) +{ + struct gf_w8_logzero_table_data *ltd; + + ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private; + return ltd->div_tbl[ltd->log_tbl[a] - ltd->log_tbl[b]]; +} + +static +inline + uint32_t +gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b) +{ + struct gf_w8_logzero_small_table_data *std; + + std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private; + if (b == 0) return 0; + return std->antilog_tbl[std->log_tbl[a] + std->log_tbl[b]]; +} + +static +inline + uint32_t +gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b) +{ + struct gf_w8_logzero_small_table_data *std; + + std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private; + return std->div_tbl[std->log_tbl[a] - std->log_tbl[b]]; +} + +static +inline + uint32_t +gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b) +{ + struct gf_w8_logtable_data *ltd; + + ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (a == 0 || b == 0) ? 
0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])]; +} + +static +inline + uint32_t +gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b) +{ + int log_sum = 0; + struct gf_w8_logtable_data *ltd; + + if (a == 0 || b == 0) return 0; + ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + + log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE); + return (ltd->antilog_tbl[log_sum]); +} + +static + uint32_t +gf_w8_log_inverse (gf_t *gf, uint32_t a) +{ + struct gf_w8_logtable_data *ltd; + + ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (ltd->inv_tbl[a]); +} + +static + uint32_t +gf_w8_logzero_inverse (gf_t *gf, uint32_t a) +{ + struct gf_w8_logzero_table_data *ltd; + + ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (ltd->inv_tbl[a]); +} + +static + uint32_t +gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a) +{ + struct gf_w8_logzero_small_table_data *std; + + std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (std->inv_tbl[a]); +} + +static + void +gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + int i; + uint8_t lv; + uint8_t *s8, *d8; + struct gf_w8_logtable_data *ltd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + lv = ltd->log_tbl[val]; + + if (xor) { + for (i = 0; i < bytes; i++) { + d8[i] ^= (s8[i] == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]); + } + } else { + for (i = 0; i < bytes; i++) { + d8[i] = (s8[i] == 0 ? 
0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]); + } + } +} + +static + void +gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + int i; + uint8_t lv; + uint8_t *s8, *d8; + struct gf_w8_logzero_table_data *ltd; + struct gf_w8_logzero_small_table_data *std; + short *log; + uint8_t *alt; + gf_internal_t *h; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + + if (h->arg1 == 1) { + std = (struct gf_w8_logzero_small_table_data *) h->private; + log = std->log_tbl; + alt = std->antilog_tbl; + } else { + ltd = (struct gf_w8_logzero_table_data *) h->private; + log = ltd->log_tbl; + alt = ltd->antilog_tbl; + } + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + lv = log[val]; + + if (xor) { + for (i = 0; i < bytes; i++) { + d8[i] ^= (alt[lv + log[s8[i]]]); + } + } else { + for (i = 0; i < bytes; i++) { + d8[i] = (alt[lv + log[s8[i]]]); + } + } +} + + static +int gf_w8_log_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w8_logtable_data *ltd = NULL; + struct gf_w8_logzero_table_data *ztd = NULL; + struct gf_w8_logzero_small_table_data *std = NULL; + uint8_t *alt; + uint8_t *inv; + int i, b; + int check = 0; + + h = (gf_internal_t *) gf->scratch; + if (h->mult_type == GF_MULT_LOG_TABLE) { + ltd = h->private; + alt = ltd->antilog_tbl; + inv = ltd->inv_tbl; + } else if (h->mult_type == GF_MULT_LOG_ZERO) { + std = h->private; + alt = std->antilog_tbl; + std->div_tbl = (alt + 255); + inv = std->inv_tbl; + } else { + ztd = h->private; + alt = ztd->antilog_tbl; + ztd->inv_tbl = (alt + 512 + 256); + ztd->div_tbl = (alt + 255); + inv = ztd->inv_tbl; + } + + for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) { + if (h->mult_type == GF_MULT_LOG_TABLE) + ltd->log_tbl[i] = 0; + else if (h->mult_type == GF_MULT_LOG_ZERO) + std->log_tbl[i] = 0; + else + ztd->log_tbl[i] = 0; + } + + if (h->mult_type == GF_MULT_LOG_TABLE) { + ltd->log_tbl[0] = 0; + } else if (h->mult_type == GF_MULT_LOG_ZERO) { + std->log_tbl[0] = 510; + } else { + ztd->log_tbl[0] = 512; + } + + b = 1; + for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { + if (h->mult_type == GF_MULT_LOG_TABLE) { + if (ltd->log_tbl[b] != 0) check = 1; + ltd->log_tbl[b] = i; + } else if (h->mult_type == GF_MULT_LOG_ZERO) { + if (std->log_tbl[b] != 0) check = 1; + std->log_tbl[b] = i; + } else { + if (ztd->log_tbl[b] != 0) check = 1; + ztd->log_tbl[b] = i; + } + alt[i] = b; + alt[i+GF_MULT_GROUP_SIZE] = b; + b <<= 1; + if (b & GF_FIELD_SIZE) { + b = b ^ h->prim_poly; + } + } + if (check) { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + + if (h->mult_type == GF_MULT_LOG_ZERO) bzero(alt+510, 255); + + if (h->mult_type == GF_MULT_LOG_ZERO_EXT) { + bzero(alt+512, 255); + alt[512+512] = 0; + } + + inv[0] = 0; /* Not really, but we need to fill it with something */ + i = 1; + b = GF_MULT_GROUP_SIZE; + do { + inv[i] = alt[b]; + i <<= 1; + if (i & (1 << 8)) i ^= h->prim_poly; + b--; + } while (i != 1); + + if (h->mult_type == GF_MULT_LOG_TABLE) { + gf->inverse.w32 = gf_w8_log_inverse; + gf->divide.w32 = gf_w8_log_divide; + gf->multiply.w32 = gf_w8_log_multiply; + gf->multiply_region.w32 = gf_w8_log_multiply_region; + } else if (h->mult_type == GF_MULT_LOG_ZERO) { + gf->inverse.w32 = gf_w8_logzero_small_inverse; + gf->divide.w32 = gf_w8_logzero_small_divide; + gf->multiply.w32 = gf_w8_logzero_small_multiply; + gf->multiply_region.w32 = gf_w8_logzero_multiply_region; + } else { + gf->inverse.w32 = gf_w8_logzero_inverse; + 
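gf_w8_log_init above fills a log table and a doubled antilog table so that, once initialized, a*b is antilog[log a + log b] with no modular reduction of the exponent sum (the second copy of the antilog table covers sums past 254), and division subtracts logs the same way. A standalone sketch of that construction follows, checked against a shift-and-reduce multiply over all 65536 products; 0x11d is assumed here as a primitive field polynomial purely for illustration, and the table names are local to the example.

#include <stdint.h>
#include <stdio.h>

static uint8_t log_tbl[256];
static uint8_t antilog_tbl[510];   /* doubled: log a + log b can reach 508 */

/* Reference shift-and-reduce multiply; 0x11d = x^8+x^4+x^3+x^2+1 (assumed). */
static uint8_t gf8_shift_mul(uint8_t a, uint8_t b)
{
    uint16_t aa = a, p = 0;
    int i;
    for (i = 0; i < 8; i++) {
        if (b & 1) p ^= aa;
        b >>= 1;
        aa <<= 1;
        if (aa & 0x100) aa ^= 0x11d;
    }
    return (uint8_t) p;
}

static void log_init(void)
{
    uint16_t b = 1;
    int i;
    for (i = 0; i < 255; i++) {
        log_tbl[b] = (uint8_t) i;
        antilog_tbl[i] = (uint8_t) b;
        antilog_tbl[i + 255] = (uint8_t) b;  /* second copy avoids a mod 255 */
        b <<= 1;
        if (b & 0x100) b ^= 0x11d;
    }
}

static uint8_t log_mul(uint8_t a, uint8_t b)
{
    if (a == 0 || b == 0) return 0;
    return antilog_tbl[log_tbl[a] + log_tbl[b]];
}

int main(void)
{
    int a, b, bad = 0;
    log_init();
    for (a = 0; a < 256; a++)
        for (b = 0; b < 256; b++)
            if (log_mul((uint8_t) a, (uint8_t) b) != gf8_shift_mul((uint8_t) a, (uint8_t) b))
                bad++;
    printf("%d mismatches out of 65536 products\n", bad);
    return 0;
}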
gf->divide.w32 = gf_w8_logzero_divide; + gf->multiply.w32 = gf_w8_logzero_multiply; + gf->multiply_region.w32 = gf_w8_logzero_multiply_region; + } + return 1; +} + +/* ------------------------------------------------------------ +IMPLEMENTATION: FULL_TABLE: + +JSP: Kevin wrote this, and I'm converting it to my structure. + */ + +static + gf_val_32_t +gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_single_table_data *ftd; + + ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->multtable[a][b]); +} + +static + gf_val_32_t +gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_single_table_data *ftd; + + ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->divtable[a][b]); +} + +static + gf_val_32_t +gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_default_data *ftd; + + ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->multtable[a][b]); +} + +#ifdef INTEL_SSSE3 +static + gf_val_32_t +gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_default_data *ftd; + + ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->divtable[a][b]); +} +#endif + +static + gf_val_32_t +gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_double_table_data *ftd; + + ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->mult[a][b]); +} + +static + gf_val_32_t +gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_double_table_data *ftd; + + ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->div[a][b]); +} + +static + void +gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint16_t *base; + uint32_t b, c, vc, vb; + gf_internal_t *h; + struct gf_w8_double_table_data *dtd; + struct gf_w8_double_table_lazy_data *ltd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) (gf->scratch); + if (h->region_type & GF_REGION_LAZY) { + ltd = (struct gf_w8_double_table_lazy_data *) h->private; + base = ltd->mult; + for (b = 0; b < GF_FIELD_SIZE; b++) { + vb = (ltd->smult[val][b] << 8); + for (c = 0; c < GF_FIELD_SIZE; c++) { + vc = ltd->smult[val][c]; + base[(b << 8)| c] = (vb | vc); + } + } + + } else { + dtd = (struct gf_w8_double_table_data *) h->private; + base = &(dtd->mult[val][0]); + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + gf_two_byte_region_table_multiply(&rd, base); + gf_do_final_region_alignment(&rd); +} + +static + gf_val_32_t +gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_double_table_lazy_data *ftd; + + ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->smult[a][b]); +} + +static + gf_val_32_t +gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_double_table_lazy_data *ftd; + + ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->div[a][b]); +} + +static + void +gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ 
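The GF_REGION_DOUBLE_TABLE path above widens the lookup: gf_w8_double_table_multiply_region builds (or lazily fills) a table indexed by two consecutive source bytes, with each 16-bit entry holding the two corresponding product bytes, so the region loop does one lookup per pair of bytes. A scalar sketch of the idea follows, with an explicit byte order to sidestep the endianness handling that gf_two_byte_region_table_multiply performs in the library; gf8_shift_mul, double_table_build and double_table_region are illustrative helpers, and 0x11d is again an assumed polynomial.

#include <stdint.h>
#include <stdio.h>

/* Reference GF(2^8) multiply (0x11d assumed, as in the log-table sketch). */
static uint8_t gf8_shift_mul(uint8_t a, uint8_t b)
{
    uint16_t aa = a, p = 0;
    int i;
    for (i = 0; i < 8; i++) {
        if (b & 1) p ^= aa;
        b >>= 1;
        aa <<= 1;
        if (aa & 0x100) aa ^= 0x11d;
    }
    return (uint8_t) p;
}

/* For a fixed val, one 16-bit-indexed table maps a byte pair (b,c) to the
 * pair (val*b, val*c), so a region multiply needs one lookup per two bytes. */
static uint16_t dtable[65536];

static void double_table_build(uint8_t val)
{
    int b, c;
    for (b = 0; b < 256; b++)
        for (c = 0; c < 256; c++)
            dtable[(b << 8) | c] =
                (uint16_t)((gf8_shift_mul(val, (uint8_t) b) << 8) |
                            gf8_shift_mul(val, (uint8_t) c));
}

static void double_table_region(const uint8_t *src, uint8_t *dst, int bytes)
{
    int i;
    for (i = 0; i < bytes; i += 2) {
        uint16_t prod = dtable[(src[i] << 8) | src[i + 1]];
        dst[i]     = (uint8_t)(prod >> 8);
        dst[i + 1] = (uint8_t)(prod & 0xff);
    }
}

int main(void)
{
    uint8_t src[16], dst[16], val = 0x53;
    int i;

    for (i = 0; i < 16; i++) src[i] = (uint8_t)(i * 17 + 3);
    double_table_build(val);
    double_table_region(src, dst, 16);
    for (i = 0; i < 16; i++)
        if (dst[i] != gf8_shift_mul(val, src[i])) { printf("mismatch\n"); return 1; }
    printf("double-table region multiply matches byte-by-byte products\n");
    return 0;
}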
+ int i; + uint8_t *s8, *d8; + struct gf_w8_single_table_data *ftd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + if (xor) { + for (i = 0; i < bytes; i++) { + d8[i] ^= ftd->multtable[s8[i]][val]; + } + } else { + for (i = 0; i < bytes; i++) { + d8[i] = ftd->multtable[s8[i]][val]; + } + } +} + +#ifdef INTEL_SSSE3 +static + void +gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint8_t *bh, *bl, *sptr, *dptr; + __m128i loset, t1, r, va, mth, mtl; + struct gf_w8_half_table_data *htd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + bh = (uint8_t *) htd->high; + bh += (val << 4); + bl = (uint8_t *) htd->low; + bl += (val << 4); + + sptr = rd.s_start; + dptr = rd.d_start; + + mth = _mm_loadu_si128 ((__m128i *)(bh)); + mtl = _mm_loadu_si128 ((__m128i *)(bl)); + loset = _mm_set1_epi8 (0x0f); + + if (xor) { + while (sptr < (uint8_t *) rd.s_top) { + va = _mm_load_si128 ((__m128i *)(sptr)); + t1 = _mm_and_si128 (loset, va); + r = _mm_shuffle_epi8 (mtl, t1); + va = _mm_srli_epi64 (va, 4); + t1 = _mm_and_si128 (loset, va); + r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1)); + va = _mm_load_si128 ((__m128i *)(dptr)); + r = _mm_xor_si128 (r, va); + _mm_store_si128 ((__m128i *)(dptr), r); + dptr += 16; + sptr += 16; + } + } else { + while (sptr < (uint8_t *) rd.s_top) { + va = _mm_load_si128 ((__m128i *)(sptr)); + t1 = _mm_and_si128 (loset, va); + r = _mm_shuffle_epi8 (mtl, t1); + va = _mm_srli_epi64 (va, 4); + t1 = _mm_and_si128 (loset, va); + r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1)); + _mm_store_si128 ((__m128i *)(dptr), r); + dptr += 16; + sptr += 16; + } + } + + gf_do_final_region_alignment(&rd); +} +#endif + + +/* ------------------------------------------------------------ +IMPLEMENTATION: FULL_TABLE: + */ + +static + gf_val_32_t +gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_half_table_data *htd; + htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private; + + return htd->high[b][a>>4] ^ htd->low[b][a&0xf]; +} + +static + void +gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + struct gf_w8_half_table_data *htd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + if (xor) { + for (i = 0; i < bytes; i++) { + d8[i] ^= (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]); + } + } else { + for (i = 0; i < bytes; i++) { + d8[i] = (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]); + } + } +} + + + static +int gf_w8_split_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w8_half_table_data *htd; + int a, b; + + h = (gf_internal_t *) gf->scratch; + htd = (struct gf_w8_half_table_data *)h->private; + + bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); + 
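gf_w8_split_init, whose table setup starts just above, fills a pair of 16-entry tables per constant: low[n] = val*n and high[n] = val*(n << 4). Since a = (a & 0xf0) ^ (a & 0x0f) and multiplication by val is GF(2)-linear, two lookups and an XOR give val*a; these are the tables the SSSE3 region routine loads into xmm registers and indexes with _mm_shuffle_epi8, sixteen bytes per instruction. A short scalar check of that decomposition for one constant; val = 0xa7 and the 0x11d polynomial are illustrative choices only.

#include <stdint.h>
#include <stdio.h>

/* Reference GF(2^8) multiply (0x11d assumed for illustration). */
static uint8_t gf8_shift_mul(uint8_t a, uint8_t b)
{
    uint16_t aa = a, p = 0;
    int i;
    for (i = 0; i < 8; i++) {
        if (b & 1) p ^= aa;
        b >>= 1;
        aa <<= 1;
        if (aa & 0x100) aa ^= 0x11d;
    }
    return (uint8_t) p;
}

int main(void)
{
    /* Half tables for one constant: low[n] = val*n, high[n] = val*(n<<4).
     * Linearity over GF(2) makes two lookups per byte sufficient. */
    uint8_t low[16], high[16], val = 0xa7;
    int n, a, bad = 0;

    for (n = 0; n < 16; n++) {
        low[n]  = gf8_shift_mul(val, (uint8_t) n);
        high[n] = gf8_shift_mul(val, (uint8_t)(n << 4));
    }

    for (a = 0; a < 256; a++)
        if ((uint8_t)(low[a & 0xf] ^ high[a >> 4]) != gf8_shift_mul(val, (uint8_t) a))
            bad++;

    printf("%d mismatches out of 256\n", bad);
    return 0;
}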
bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_HALF_SIZE; b++) { + htd->low[a][b] = gf_w8_shift_multiply(gf,a,b); + htd->high[a][b] = gf_w8_shift_multiply(gf,a,b<<4); + } + } + + gf->multiply.w32 = gf_w8_split_multiply; + + #ifdef INTEL_SSSE3 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w8_split_multiply_region; + else + gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; + #else + gf->multiply_region.w32 = gf_w8_split_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif + + return 1; +} + +/* JSP: This is disgusting, but it is what it is. If there is no SSE, + then the default is equivalent to single table. If there is SSE, then + we use the "gf_w8_default_data" which is a hybrid of SPLIT & TABLE. */ + +static +int gf_w8_table_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w8_single_table_data *ftd = NULL; + struct gf_w8_double_table_data *dtd = NULL; + struct gf_w8_double_table_lazy_data *ltd = NULL; + struct gf_w8_default_data *dd = NULL; + int a, b, c, prod, scase, issse; + + h = (gf_internal_t *) gf->scratch; + + issse = 0; +#ifdef INTEL_SSSE3 + issse = 1; +#endif + + if (h->mult_type == GF_MULT_DEFAULT && issse) { + dd = (struct gf_w8_default_data *)h->private; + scase = 3; + bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); + bzero(dd->low, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); + bzero(dd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(dd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + } else if (h->mult_type == GF_MULT_DEFAULT || + h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY)) { + ftd = (struct gf_w8_single_table_data *)h->private; + bzero(ftd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(ftd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + scase = 0; + } else if (h->region_type == GF_REGION_DOUBLE_TABLE) { + dtd = (struct gf_w8_double_table_data *)h->private; + bzero(dtd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(dtd->mult, sizeof(uint16_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE); + scase = 1; + } else if (h->region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) { + ltd = (struct gf_w8_double_table_lazy_data *)h->private; + bzero(ltd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(ltd->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + scase = 2; + } else { + fprintf(stderr, "Internal error in gf_w8_table_init\n"); + exit(0); + } + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_FIELD_SIZE; b++) { + prod = gf_w8_shift_multiply(gf,a,b); + switch (scase) { + case 0: + ftd->multtable[a][b] = prod; + ftd->divtable[prod][b] = a; + break; + case 1: + dtd->div[prod][b] = a; + for (c = 0; c < GF_FIELD_SIZE; c++) { + dtd->mult[a][(c<<8)|b] |= prod; + dtd->mult[a][(b<<8)|c] |= (prod<<8); + } + break; + case 2: + ltd->div[prod][b] = a; + ltd->smult[a][b] = prod; + break; + case 3: + dd->multtable[a][b] = prod; + dd->divtable[prod][b] = a; + if ((b & 0xf) == b) { dd->low[a][b] = prod; } + if ((b & 0xf0) == b) { dd->high[a][b>>4] = prod; } + break; + } + } + } + + gf->inverse.w32 = NULL; /* Will set from divide */ + switch (scase) { + case 0: + gf->divide.w32 = gf_w8_table_divide; + gf->multiply.w32 = gf_w8_table_multiply; + gf->multiply_region.w32 = gf_w8_table_multiply_region; + break; + case 1: + gf->divide.w32 = gf_w8_double_table_divide; + gf->multiply.w32 = 
gf_w8_double_table_multiply; + gf->multiply_region.w32 = gf_w8_double_table_multiply_region; + break; + case 2: + gf->divide.w32 = gf_w8_double_table_lazy_divide; + gf->multiply.w32 = gf_w8_double_table_lazy_multiply; + gf->multiply_region.w32 = gf_w8_double_table_multiply_region; + break; + case 3: +#ifdef INTEL_SSSE3 + gf->divide.w32 = gf_w8_default_divide; + gf->multiply.w32 = gf_w8_default_multiply; + gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; +#endif + break; + } + return 1; +} + +static + void +gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t val0 = val & 0x0f; + uint8_t val1 = (val & 0xf0) >> 4; + gf_region_data rd; + int sub_reg_size; + + if (val == 0) { + if (xor) return; + bzero(dest, bytes); + return; + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + sub_reg_size = ((char*)rd.d_top - (char*)rd.d_start) / 2; + + base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, (char*)rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, rd.s_start, (char*)rd.d_start+sub_reg_size, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, (char*)rd.s_start+sub_reg_size, (char*)rd.d_start+sub_reg_size, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, (char*)rd.s_start+sub_reg_size, (char*)rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + + gf_do_final_region_alignment(&rd); +} + +static +gf_val_32_t +gf_w8_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t b0 = b & 0x0f; + uint8_t b1 = (b & 0xf0) >> 4; + uint8_t a0 = a & 0x0f; + uint8_t a1 = (a & 0xf0) >> 4; + uint8_t a1b1; + + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); +} + +static +gf_val_32_t +gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + uint8_t b0 = b & 0x0f; + uint8_t b1 = (b & 0xf0) >> 4; + uint8_t a0 = a & 0x0f; + uint8_t a1 = (a & 0xf0) >> 4; + uint8_t a1b1, *mt; + struct gf_w8_composite_data *cd; + + cd = (struct gf_w8_composite_data *) h->private; + mt = cd->mult_table; + + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + return ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); +} + +/* + * Composite field division trick (explained in 2007 tech report) + * + * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 + * + * let c = b^-1 + * + * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) + * + * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 + * + * let d = b1c1 and d+1 = b0c0 + * + * solve s*b1c1+b1c0+b0c1 = 0 + * + * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 + * + * c0 = (d+1)b0^-1 + * c1 = d*b1^-1 + * + * a / b = a * c + */ + +static +gf_val_32_t +gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t a0 = a & 
0x0f; + uint8_t a1 = (a & 0xf0) >> 4; + uint8_t c0, c1, c, d, tmp; + uint8_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf; + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w32(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf; + a0inv = base_gf->inverse.w32(base_gf, a0) & 0xf; + + d = base_gf->multiply.w32(base_gf, a1, a0inv) & 0xf; + + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly) & 0xf; + tmp = base_gf->inverse.w32(base_gf, tmp) & 0xf; + + d = base_gf->multiply.w32(base_gf, d, tmp) & 0xf; + + c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv) & 0xf; + c1 = base_gf->multiply.w32(base_gf, d, a1inv) & 0xf; + } + + c = c0 | (c1 << 4); + + return c; +} + +static +void +gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t b0 = val & 0x0f; + uint8_t b1 = (val & 0xf0) >> 4; + uint8_t *s8; + uint8_t *d8; + uint8_t *mt; + uint8_t a0, a1, a1b1; + struct gf_w8_composite_data *cd; + + cd = (struct gf_w8_composite_data *) h->private; + + if (val == 0) { + if (xor) return; + bzero(dest, bytes); + return; + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + mt = cd->mult_table; + if (mt == NULL) { + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d8 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } + } else { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d8 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } + } + } else { + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + *d8 ^= ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } + } else { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + *d8 = ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } + } + } + gf_do_final_region_alignment(&rd); + return; +} + +static +int gf_w8_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + struct gf_w8_composite_data *cd; + + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w8_composite_data *) h->private; + cd->mult_table = gf_w4_get_mult_table(h->base_gf); + + if (h->region_type & GF_REGION_ALTMAP) { + gf->multiply_region.w32 = gf_w8_composite_multiply_region_alt; + } else { + gf->multiply_region.w32 = 
gf_w8_composite_multiply_region; + } + + if (cd->mult_table == NULL) { + gf->multiply.w32 = gf_w8_composite_multiply_recursive; + } else { + gf->multiply.w32 = gf_w8_composite_multiply_inline; + } + gf->divide.w32 = NULL; + gf->inverse.w32 = gf_w8_composite_inverse; + + return 1; +} + +static +inline + gf_val_32_t +gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + + prod = 0; + pmask = 0x80; + amask = 0x80; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + +static +inline + gf_val_32_t +gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, bmask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + bmask = 0x80; + + while (1) { + if (a & 1) prod ^= b; + a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else { + b <<= 1; + } + } +} + +static + void +gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, prod, amask; + gf_region_data rd; + struct gf_w8_bytwo_data *btd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x80; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x80; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 = prod; + d64++; + s64++; + } + } + gf_do_final_region_alignment(&rd); +} + +#define BYTWO_P_ONESTEP {\ + SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi8(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } + +#ifdef INTEL_SSE2 +static + void +gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + uint8_t vrev; + __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; + struct gf_w8_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + vrev = 0; + for (i = 0; i < 8; i++) { + vrev <<= 1; + if (!(val & (1 << i))) vrev |= 1; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + one = _mm_set1_epi8(1); + + while (d8 < (uint8_t *) rd.d_top) { + prod = _mm_setzero_si128(); + v = _mm_set1_epi8(vrev); + ta = 
_mm_load_si128((__m128i *) s8); + tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); + d8 += 16; + s8 += 16; + } + gf_do_final_region_alignment(&rd); +} +#endif + +#ifdef INTEL_SSE2 +static + void +gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static + void +gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + + +#ifdef INTEL_SSE2 +static + void +gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int itb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + struct gf_w8_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + if (val == 2) { + if (xor) { + gf_w8_bytwo_b_sse_region_2_xor(&rd, btd); + } else { + gf_w8_bytwo_b_sse_region_2_noxor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = (!xor) ? 
_mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); + itb = val; + while (1) { + if (itb & 1) vb = _mm_xor_si128(vb, va); + itb >>= 1; + if (itb == 0) break; + SSE_AB2(pp, m1, m2, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + + gf_do_final_region_alignment(&rd); +} +#endif + +static + void +gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, tb, prod; + struct gf_w8_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + switch (val) { + case 2: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 3: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 4: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 5: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + case 6: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + /* + case 7: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + 
AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + */ + case 8: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + /* + case 9: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 10: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 11: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 12: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 13: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else 
{ + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 14: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 15: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + */ + default: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + prod = *d64 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + prod = 0 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } + break; + } + gf_do_final_region_alignment(&rd); +} + + static +int gf_w8_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + uint64_t ip, m1, m2; + struct gf_w8_bytwo_data *btd; + + h = (gf_internal_t *) gf->scratch; + btd = (struct gf_w8_bytwo_data *) (h->private); + ip = h->prim_poly & 0xff; + m1 = 0xfe; + m2 = 0x80; + btd->prim_poly = 0; + btd->mask1 = 0; + btd->mask2 = 0; + + while (ip != 0) { + btd->prim_poly |= ip; + btd->mask1 |= m1; + btd->mask2 |= m2; + ip <<= GF_FIELD_WIDTH; + m1 <<= GF_FIELD_WIDTH; + m2 <<= GF_FIELD_WIDTH; + } + + if (h->mult_type == GF_MULT_BYTWO_p) { + gf->multiply.w32 = gf_w8_bytwo_p_multiply; +#ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region; +#else + gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; +#endif + } else { + gf->multiply.w32 = gf_w8_bytwo_b_multiply; +#ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region; +#else + gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; +#endif + } + return 1; +} + + +/* 
------------------------------------------------------------ + General procedures. + You don't need to error check here on in init, because it's done + for you in gf_error_check(). + */ + +int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + switch(mult_type) + { + case GF_MULT_DEFAULT: +#ifdef INTEL_SSSE3 + return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64; +#endif + return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; + case GF_MULT_TABLE: + if (region_type == GF_REGION_CAUCHY) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; + } + + if (region_type == GF_REGION_DEFAULT) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; + } + if (region_type & GF_REGION_DOUBLE_TABLE) { + if (region_type == GF_REGION_DOUBLE_TABLE) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_data) + 64; + } else if (region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_lazy_data) + 64; + } else { + return 0; + } + } + return 0; + break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_bytwo_data); + break; + case GF_MULT_SPLIT_TABLE: + if ((arg1 == 4 && arg2 == 8) || (arg1 == 8 && arg2 == 4)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64; + } + break; + case GF_MULT_LOG_TABLE: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64; + break; + case GF_MULT_LOG_ZERO: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64; + break; + case GF_MULT_LOG_ZERO_EXT: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_table_data) + 64; + break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_SHIFT: + return sizeof(gf_internal_t); + break; + case GF_MULT_COMPOSITE: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_composite_data) + 64; + default: + return 0; + } + return 0; +} + +int gf_w8_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* JSP: This shouldn't happen, but just in case. 
*/ + } else { + h->prim_poly = 0x11d; + } + } + if (h->mult_type != GF_MULT_COMPOSITE) { + h->prim_poly |= 0x100; + } + + gf->multiply.w32 = NULL; + gf->divide.w32 = NULL; + gf->inverse.w32 = NULL; + gf->multiply_region.w32 = NULL; + gf->extract_word.w32 = gf_w8_extract_word; + + switch(h->mult_type) { + case GF_MULT_DEFAULT: + case GF_MULT_TABLE: if (gf_w8_table_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if (gf_w8_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_LOG_ZERO: + case GF_MULT_LOG_ZERO_EXT: + case GF_MULT_LOG_TABLE: if (gf_w8_log_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w8_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w8_shift_init(gf) == 0) return 0; break; + case GF_MULT_SPLIT_TABLE: if (gf_w8_split_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w8_composite_init(gf) == 0) return 0; break; + default: return 0; + } + + if (h->divide_type == GF_DIVIDE_EUCLID) { + gf->divide.w32 = gf_w8_divide_from_inverse; + gf->inverse.w32 = gf_w8_euclid; + } else if (h->divide_type == GF_DIVIDE_MATRIX) { + gf->divide.w32 = gf_w8_divide_from_inverse; + gf->inverse.w32 = gf_w8_matrix; + } + + if (gf->divide.w32 == NULL) { + gf->divide.w32 = gf_w8_divide_from_inverse; + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_euclid; + } + + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_inverse_from_divide; + + if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) { + gf->extract_word.w32 = gf_w8_composite_extract_word; + } + + if (h->region_type == GF_REGION_CAUCHY) { + gf->multiply_region.w32 = gf_wgen_cauchy_region; + gf->extract_word.w32 = gf_wgen_extract_word; + } + + if (gf->multiply_region.w32 == NULL) { + gf->multiply_region.w32 = gf_w8_multiply_region_from_single; + } + + return 1; +} + + +/* Inline setup functions */ + +uint8_t *gf_w8_get_mult_table(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w8_default_data *ftd; + struct gf_w8_single_table_data *std; + + h = (gf_internal_t *) gf->scratch; + if (gf->multiply.w32 == gf_w8_default_multiply) { + ftd = (struct gf_w8_default_data *) h->private; + return (uint8_t *) ftd->multtable; + } else if (gf->multiply.w32 == gf_w8_table_multiply) { + std = (struct gf_w8_single_table_data *) h->private; + return (uint8_t *) std->multtable; + } + return NULL; +} + +uint8_t *gf_w8_get_div_table(gf_t *gf) +{ + struct gf_w8_default_data *ftd; + struct gf_w8_single_table_data *std; + + if (gf->multiply.w32 == gf_w8_default_multiply) { + ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; + return (uint8_t *) ftd->divtable; + } else if (gf->multiply.w32 == gf_w8_table_multiply) { + std = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (uint8_t *) std->divtable; + } + return NULL; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c b/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c new file mode 100644 index 000000000000..68c6bb078580 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c @@ -0,0 +1,1019 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_wgen.c + * + * Routines for Galois fields for general w < 32. For specific w, + like 4, 8, 16, 32, 64 and 128, see the other files. 
+ */ + +#include "gf_int.h" +#include +#include + +struct gf_wgen_table_w8_data { + uint8_t *mult; + uint8_t *div; + uint8_t base; +}; + +struct gf_wgen_table_w16_data { + uint16_t *mult; + uint16_t *div; + uint16_t base; +}; + +struct gf_wgen_log_w8_data { + uint8_t *log; + uint8_t *anti; + uint8_t *danti; + uint8_t base; +}; + +struct gf_wgen_log_w16_data { + uint16_t *log; + uint16_t *anti; + uint16_t *danti; + uint16_t base; +}; + +struct gf_wgen_log_w32_data { + uint32_t *log; + uint32_t *anti; + uint32_t *danti; + uint32_t base; +}; + +struct gf_wgen_group_data { + uint32_t *reduce; + uint32_t *shift; + uint32_t mask; + uint64_t rmask; + int tshift; + uint32_t memory; +}; + +static +inline +gf_val_32_t gf_wgen_inverse_from_divide (gf_t *gf, gf_val_32_t a) +{ + return gf->divide.w32(gf, 1, a); +} + +static +inline +gf_val_32_t gf_wgen_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + b = gf->inverse.w32(gf, b); + return gf->multiply.w32(gf, a, b); +} + +static +inline +gf_val_32_t gf_wgen_euclid (gf_t *gf, gf_val_32_t b) +{ + + gf_val_32_t e_i, e_im1, e_ip1; + gf_val_32_t d_i, d_im1, d_ip1; + gf_val_32_t y_i, y_im1, y_ip1; + gf_val_32_t c_i; + + if (b == 0) return -1; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i = b; + d_im1 = ((gf_internal_t *) (gf->scratch))->w; + for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ; + y_i = 1; + y_im1 = 0; + + while (e_i != 1) { + + e_ip1 = e_im1; + d_ip1 = d_im1; + c_i = 0; + + while (d_ip1 >= d_i) { + c_i ^= (1 << (d_ip1 - d_i)); + e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; + while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; + } + + y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); + y_im1 = y_i; + y_i = y_ip1; + + e_im1 = e_i; + d_im1 = d_i; + e_i = e_ip1; + d_i = d_ip1; + } + + return y_i; +} + +gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint8_t *ptr; + uint32_t rv; + int rs; + int byte, bit, i; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + rs = bytes / h->w; + byte = index/8; + bit = index%8; + + ptr = (uint8_t *) start; + ptr += bytes; + ptr -= rs; + ptr += byte; + + rv = 0; + for (i = 0; i < h->w; i++) { + rv <<= 1; + if ((*ptr) & (1 << bit)) rv |= 1; + ptr -= rs; + } + + return rv; +} + +static +inline +gf_val_32_t gf_wgen_matrix (gf_t *gf, gf_val_32_t b) +{ + return gf_bitmatrix_inverse(b, ((gf_internal_t *) (gf->scratch))->w, + ((gf_internal_t *) (gf->scratch))->prim_poly); +} + +static +inline +uint32_t +gf_wgen_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) +{ + uint64_t product, i, pp, a, b, one; + gf_internal_t *h; + + a = a32; + b = b32; + h = (gf_internal_t *) gf->scratch; + one = 1; + pp = h->prim_poly | (one << h->w); + + product = 0; + + for (i = 0; i < h->w; i++) { + if (a & (one << i)) product ^= (b << i); + } + for (i = h->w*2-1; i >= h->w; i--) { + if (product & (one << i)) product ^= (pp << (i-h->w)); + } + return product; +} + +static +int gf_wgen_shift_init(gf_t *gf) +{ + gf->multiply.w32 = gf_wgen_shift_multiply; + gf->inverse.w32 = gf_wgen_euclid; + return 1; +} + +static +gf_val_32_t +gf_wgen_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, bmask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + bmask = (1 << (h->w-1)); + + while (1) { + if (a & 1) prod ^= b; + a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else { + b <<= 1; + } + } +} + +static +int gf_wgen_bytwo_b_init(gf_t *gf) +{ + gf->multiply.w32 = 
gf_wgen_bytwo_b_multiply; + gf->inverse.w32 = gf_wgen_euclid; + return 1; +} + +static +inline +gf_val_32_t +gf_wgen_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + pmask = (1 << ((h->w)-1)); /*Ben: Had an operator precedence warning here*/ + amask = pmask; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + + +static +int gf_wgen_bytwo_p_init(gf_t *gf) +{ + gf->multiply.w32 = gf_wgen_bytwo_p_multiply; + gf->inverse.w32 = gf_wgen_euclid; + return 1; +} + +static +void +gf_wgen_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) +{ + int i; + uint32_t j; + int g_s; + + if (h->mult_type == GF_MULT_DEFAULT) { + g_s = 2; + } else { + g_s = h->arg1; + } + + shift[0] = 0; + + for (i = 1; i < (1 << g_s); i <<= 1) { + for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; + if (val & (1 << (h->w-1))) { + val <<= 1; + val ^= h->prim_poly; + } else { + val <<= 1; + } + } +} + +static +inline +gf_val_32_t +gf_wgen_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int leftover, rs; + uint32_t p, l, ind, a32; + int bits_left; + int g_s; + int w; + + struct gf_wgen_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + w = h->w; + + gd = (struct gf_wgen_group_data *) h->private; + gf_wgen_group_set_shift_tables(gd->shift, b, h); + + leftover = w % g_s; + if (leftover == 0) leftover = g_s; + + rs = w - leftover; + a32 = a; + ind = a32 >> rs; + a32 <<= leftover; + a32 &= gd->mask; + p = gd->shift[ind]; + + bits_left = rs; + rs = w - g_s; + + while (bits_left > 0) { + bits_left -= g_s; + ind = a32 >> rs; + a32 <<= g_s; + a32 &= gd->mask; + l = p >> rs; + p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)) & gd->mask; + } + return p; +} + +char *bits(uint32_t v) +{ + char *rv; + int i, j; + + rv = malloc(30); + j = 0; + for (i = 27; i >= 0; i--) { + rv[j] = '0' + ((v & (1 << i)) ? 1 : 0); + j++; + } + rv[j] = '\0'; + return rv; +} +char *bits_56(uint64_t v) +{ + char *rv; + int i, j; + uint64_t one; + + one = 1; + + rv = malloc(60); + j = 0; + for (i = 55; i >= 0; i--) { + rv[j] = '0' + ((v & (one << i)) ? 
1 : 0); + j++; + } + rv[j] = '\0'; + return rv; +} + +static +inline +gf_val_32_t +gf_wgen_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int i; + int leftover; + uint64_t p, l, r; + uint32_t a32, ind; + int g_s, g_r; + struct gf_wgen_group_data *gd; + int w; + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + if (h->mult_type == GF_MULT_DEFAULT) { + g_s = 2; + g_r = 8; + } else { + g_s = h->arg1; + g_r = h->arg2; + } + w = h->w; + gd = (struct gf_wgen_group_data *) h->private; + gf_wgen_group_set_shift_tables(gd->shift, b, h); + + leftover = w % g_s; + if (leftover == 0) leftover = g_s; + + a32 = a; + ind = a32 >> (w - leftover); + p = gd->shift[ind]; + p <<= g_s; + a32 <<= leftover; + a32 &= gd->mask; + + i = (w - leftover); + while (i > g_s) { + ind = a32 >> (w-g_s); + p ^= gd->shift[ind]; + a32 <<= g_s; + a32 &= gd->mask; + p <<= g_s; + i -= g_s; + } + + ind = a32 >> (h->w-g_s); + p ^= gd->shift[ind]; + + for (i = gd->tshift ; i >= 0; i -= g_r) { + l = p & (gd->rmask << i); + r = gd->reduce[l >> (i+w)]; + r <<= (i); + p ^= r; + } + return p & gd->mask; +} + +static +int gf_wgen_group_init(gf_t *gf) +{ + uint32_t i, j, p, index; + struct gf_wgen_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + int g_s, g_r; + + if (h->mult_type == GF_MULT_DEFAULT) { + g_s = 2; + g_r = 8; + } else { + g_s = h->arg1; + g_r = h->arg2; + } + gd = (struct gf_wgen_group_data *) h->private; + gd->shift = &(gd->memory); + gd->reduce = gd->shift + (1 << g_s); + gd->mask = (h->w != 31) ? ((1 << h->w)-1) : 0x7fffffff; + + gd->rmask = (1 << g_r) - 1; + gd->rmask <<= h->w; + + gd->tshift = h->w % g_s; + if (gd->tshift == 0) gd->tshift = g_s; + gd->tshift = (h->w - gd->tshift); + gd->tshift = ((gd->tshift-1)/g_r) * g_r; + + gd->reduce[0] = 0; + for (i = 0; i < (1 << g_r); i++) { + p = 0; + index = 0; + for (j = 0; j < g_r; j++) { + if (i & (1 << j)) { + p ^= (h->prim_poly << j); + index ^= (h->prim_poly >> (h->w-j)); + } + } + gd->reduce[index] = (p & gd->mask); + } + + if (g_s == g_r) { + gf->multiply.w32 = gf_wgen_group_s_equals_r_multiply; + } else { + gf->multiply.w32 = gf_wgen_group_multiply; + } + gf->divide.w32 = NULL; + gf->divide.w32 = NULL; + return 1; +} + + +static +gf_val_32_t +gf_wgen_table_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h; + struct gf_wgen_table_w8_data *std; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_wgen_table_w8_data *) h->private; + + return (std->mult[(a<w)+b]); +} + +static +gf_val_32_t +gf_wgen_table_8_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h; + struct gf_wgen_table_w8_data *std; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_wgen_table_w8_data *) h->private; + + return (std->div[(a<w)+b]); +} + +static +int gf_wgen_table_8_init(gf_t *gf) +{ + gf_internal_t *h; + int w; + struct gf_wgen_table_w8_data *std; + uint32_t a, b, p; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + std = (struct gf_wgen_table_w8_data *) h->private; + + std->mult = &(std->base); + std->div = std->mult + ((1<w)*(1<w)); + + for (a = 0; a < (1 << w); a++) { + std->mult[a] = 0; + std->mult[a<div[a] = 0; + std->div[a<mult[(a<div[(p<multiply.w32 = gf_wgen_table_8_multiply; + gf->divide.w32 = gf_wgen_table_8_divide; + return 1; +} + +static +gf_val_32_t +gf_wgen_table_16_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h; + struct gf_wgen_table_w16_data *std; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_wgen_table_w16_data *) h->private; + + return 
(std->mult[(a<w)+b]); +} + +static +gf_val_32_t +gf_wgen_table_16_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h; + struct gf_wgen_table_w16_data *std; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_wgen_table_w16_data *) h->private; + + return (std->div[(a<w)+b]); +} + +static +int gf_wgen_table_16_init(gf_t *gf) +{ + gf_internal_t *h; + int w; + struct gf_wgen_table_w16_data *std; + uint32_t a, b, p; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + std = (struct gf_wgen_table_w16_data *) h->private; + + std->mult = &(std->base); + std->div = std->mult + ((1<w)*(1<w)); + + for (a = 0; a < (1 << w); a++) { + std->mult[a] = 0; + std->mult[a<div[a] = 0; + std->div[a<mult[(a<div[(p<multiply.w32 = gf_wgen_table_16_multiply; + gf->divide.w32 = gf_wgen_table_16_divide; + return 1; +} + +static +int gf_wgen_table_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + if (h->w <= 8) return gf_wgen_table_8_init(gf); + if (h->w <= 14) return gf_wgen_table_16_init(gf); + + /* Returning zero to make the compiler happy, but this won't get + executed, because it is tested in _scratch_space. */ + + return 0; +} + +static +gf_val_32_t +gf_wgen_log_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h; + struct gf_wgen_log_w8_data *std; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_wgen_log_w8_data *) h->private; + + if (a == 0 || b == 0) return 0; + return (std->anti[std->log[a]+std->log[b]]); +} + +static +gf_val_32_t +gf_wgen_log_8_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h; + struct gf_wgen_log_w8_data *std; + int index; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_wgen_log_w8_data *) h->private; + + if (a == 0 || b == 0) return 0; + index = std->log[a]; + index -= std->log[b]; + + return (std->danti[index]); +} + +static +int gf_wgen_log_8_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_wgen_log_w8_data *std; + int w; + uint32_t a, i; + int check = 0; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + std = (struct gf_wgen_log_w8_data *) h->private; + + std->log = &(std->base); + std->anti = std->log + (1<w); + std->danti = std->anti + (1<w)-1; + + for (i = 0; i < (1 << w); i++) + std->log[i] = 0; + + a = 1; + for(i=0; i < (1<log[a] != 0) check = 1; + std->log[a] = i; + std->anti[i] = a; + std->danti[i] = a; + a <<= 1; + if(a & (1<prim_poly; + //a &= ((1 << w)-1); + } + + if (check != 0) { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + + gf->multiply.w32 = gf_wgen_log_8_multiply; + gf->divide.w32 = gf_wgen_log_8_divide; + return 1; +} + +static +gf_val_32_t +gf_wgen_log_16_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h; + struct gf_wgen_log_w16_data *std; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_wgen_log_w16_data *) h->private; + + if (a == 0 || b == 0) return 0; + return (std->anti[std->log[a]+std->log[b]]); +} + +static +gf_val_32_t +gf_wgen_log_16_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h; + struct gf_wgen_log_w16_data *std; + int index; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_wgen_log_w16_data *) h->private; + + if (a == 0 || b == 0) return 0; + index = std->log[a]; + index -= std->log[b]; + + return (std->danti[index]); +} + +static +int gf_wgen_log_16_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_wgen_log_w16_data *std; + int w; + uint32_t a, i; + int check = 0; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + std = (struct gf_wgen_log_w16_data *) h->private; + + std->log 
= &(std->base); + std->anti = std->log + (1<w); + std->danti = std->anti + (1<w)-1; + + for (i = 0; i < (1 << w); i++) + std->log[i] = 0; + + a = 1; + for(i=0; i < (1<log[a] != 0) check = 1; + std->log[a] = i; + std->anti[i] = a; + std->danti[i] = a; + a <<= 1; + if(a & (1<prim_poly; + //a &= ((1 << w)-1); + } + + if (check) { + if (h->mult_type != GF_MULT_LOG_TABLE) return gf_wgen_shift_init(gf); + _gf_errno = GF_E_LOGPOLY; + return 0; + } + + gf->multiply.w32 = gf_wgen_log_16_multiply; + gf->divide.w32 = gf_wgen_log_16_divide; + return 1; +} + +static +gf_val_32_t +gf_wgen_log_32_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h; + struct gf_wgen_log_w32_data *std; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_wgen_log_w32_data *) h->private; + + if (a == 0 || b == 0) return 0; + return (std->anti[std->log[a]+std->log[b]]); +} + +static +gf_val_32_t +gf_wgen_log_32_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h; + struct gf_wgen_log_w32_data *std; + int index; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_wgen_log_w32_data *) h->private; + + if (a == 0 || b == 0) return 0; + index = std->log[a]; + index -= std->log[b]; + + return (std->danti[index]); +} + +static +int gf_wgen_log_32_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_wgen_log_w32_data *std; + int w; + uint32_t a, i; + int check = 0; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + std = (struct gf_wgen_log_w32_data *) h->private; + + std->log = &(std->base); + std->anti = std->log + (1<w); + std->danti = std->anti + (1<w)-1; + + for (i = 0; i < (1 << w); i++) + std->log[i] = 0; + + a = 1; + for(i=0; i < (1<log[a] != 0) check = 1; + std->log[a] = i; + std->anti[i] = a; + std->danti[i] = a; + a <<= 1; + if(a & (1<prim_poly; + //a &= ((1 << w)-1); + } + + if (check != 0) { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + + gf->multiply.w32 = gf_wgen_log_32_multiply; + gf->divide.w32 = gf_wgen_log_32_divide; + return 1; +} + +static +int gf_wgen_log_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + if (h->w <= 8) return gf_wgen_log_8_init(gf); + if (h->w <= 16) return gf_wgen_log_16_init(gf); + if (h->w <= 32) return gf_wgen_log_32_init(gf); + + /* Returning zero to make the compiler happy, but this won't get + executed, because it is tested in _scratch_space. */ + + return 0; +} + +int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + + switch(mult_type) + { + case GF_MULT_DEFAULT: + if (w <= 8) { + return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) + + sizeof(uint8_t)*(1 << w)*(1<scratch; + rs = bytes / (h->w); + + written = (xor) ? 
0xffffffff : 0; + for (i = 0; i < h->w; i++) { + for (j = 0; j < h->w; j++) { + if (val & (1 << j)) { + gf_multby_one(src, ((char*)dest) + j*rs, rs, (written & (1 << j))); + written |= (1 << j); + } + } + src = (char*)src + rs; + val = gf->multiply.w32(gf, val, 2); + } +} + +int gf_wgen_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + if (h->prim_poly == 0) { + switch (h->w) { + case 1: h->prim_poly = 1; break; + case 2: h->prim_poly = 7; break; + case 3: h->prim_poly = 013; break; + case 4: h->prim_poly = 023; break; + case 5: h->prim_poly = 045; break; + case 6: h->prim_poly = 0103; break; + case 7: h->prim_poly = 0211; break; + case 8: h->prim_poly = 0435; break; + case 9: h->prim_poly = 01021; break; + case 10: h->prim_poly = 02011; break; + case 11: h->prim_poly = 04005; break; + case 12: h->prim_poly = 010123; break; + case 13: h->prim_poly = 020033; break; + case 14: h->prim_poly = 042103; break; + case 15: h->prim_poly = 0100003; break; + case 16: h->prim_poly = 0210013; break; + case 17: h->prim_poly = 0400011; break; + case 18: h->prim_poly = 01000201; break; + case 19: h->prim_poly = 02000047; break; + case 20: h->prim_poly = 04000011; break; + case 21: h->prim_poly = 010000005; break; + case 22: h->prim_poly = 020000003; break; + case 23: h->prim_poly = 040000041; break; + case 24: h->prim_poly = 0100000207; break; + case 25: h->prim_poly = 0200000011; break; + case 26: h->prim_poly = 0400000107; break; + case 27: h->prim_poly = 01000000047; break; + case 28: h->prim_poly = 02000000011; break; + case 29: h->prim_poly = 04000000005; break; + case 30: h->prim_poly = 010040000007; break; + case 31: h->prim_poly = 020000000011; break; + case 32: h->prim_poly = 00020000007; break; + default: fprintf(stderr, "gf_wgen_init: w not defined yet\n"); exit(1); + } + } else { + if (h->w == 32) { + h->prim_poly &= 0xffffffff; + } else { + h->prim_poly |= (1 << h->w); + if (h->prim_poly & ~((1ULL<<(h->w+1))-1)) return 0; + } + } + + gf->multiply.w32 = NULL; + gf->divide.w32 = NULL; + gf->inverse.w32 = NULL; + gf->multiply_region.w32 = gf_wgen_cauchy_region; + gf->extract_word.w32 = gf_wgen_extract_word; + + switch(h->mult_type) { + case GF_MULT_DEFAULT: + if (h->w <= 8) { + if (gf_wgen_table_init(gf) == 0) return 0; + } else if (h->w <= 16) { + if (gf_wgen_log_init(gf) == 0) return 0; + } else { + if (gf_wgen_bytwo_p_init(gf) == 0) return 0; + } + break; + case GF_MULT_SHIFT: if (gf_wgen_shift_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_b: if (gf_wgen_bytwo_b_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_p: if (gf_wgen_bytwo_p_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_wgen_group_init(gf) == 0) return 0; break; + case GF_MULT_TABLE: if (gf_wgen_table_init(gf) == 0) return 0; break; + case GF_MULT_LOG_TABLE: if (gf_wgen_log_init(gf) == 0) return 0; break; + default: return 0; + } + if (h->divide_type == GF_DIVIDE_EUCLID) { + gf->divide.w32 = gf_wgen_divide_from_inverse; + gf->inverse.w32 = gf_wgen_euclid; + } else if (h->divide_type == GF_DIVIDE_MATRIX) { + gf->divide.w32 = gf_wgen_divide_from_inverse; + gf->inverse.w32 = gf_wgen_matrix; + } + + if (gf->inverse.w32== NULL && gf->divide.w32 == NULL) gf->inverse.w32 = gf_wgen_euclid; + + if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { + gf->divide.w32 = gf_wgen_divide_from_inverse; + } + if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { + gf->inverse.w32 = gf_wgen_inverse_from_divide; + } + return 1; +} diff --git a/src/erasure-code/jerasure/jerasure 
b/src/erasure-code/jerasure/jerasure deleted file mode 160000 index db7abf848419..000000000000 --- a/src/erasure-code/jerasure/jerasure +++ /dev/null @@ -1 +0,0 @@ -Subproject commit db7abf84841933e2f7e51d2a3b4923f70eef6c62 diff --git a/src/erasure-code/jerasure/jerasure/include/cauchy.h b/src/erasure-code/jerasure/jerasure/include/cauchy.h new file mode 100644 index 000000000000..a4fad6bd6ee7 --- /dev/null +++ b/src/erasure-code/jerasure/jerasure/include/cauchy.h @@ -0,0 +1,45 @@ +/* * + * Copyright (c) 2013, James S. Plank and Kevin Greenan + * All rights reserved. + * + * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure + * Coding Techniques + * + * Revision 2.0: Galois Field backend now links to GF-Complete + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +extern int *cauchy_original_coding_matrix(int k, int m, int w); +extern int *cauchy_xy_coding_matrix(int k, int m, int w, int *x, int *y); +extern void cauchy_improve_coding_matrix(int k, int m, int w, int *matrix); +extern int *cauchy_good_general_coding_matrix(int k, int m, int w); +extern int cauchy_n_ones(int n, int w); diff --git a/src/erasure-code/jerasure/jerasure/include/galois.h b/src/erasure-code/jerasure/jerasure/include/galois.h new file mode 100644 index 000000000000..d75be6a5d9a0 --- /dev/null +++ b/src/erasure-code/jerasure/jerasure/include/galois.h @@ -0,0 +1,99 @@ +/* * + * Copyright (c) 2013, James S. Plank and Kevin Greenan + * All rights reserved. + * + * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure + * Coding Techniques + * + * Revision 2.0: Galois Field backend now links to GF-Complete + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef _GALOIS_H +#define _GALOIS_H + +#include +#include +#include + +extern void galois_change_technique(gf_t *gf, int w); + +extern int galois_single_multiply(int a, int b, int w); +extern int galois_single_divide(int a, int b, int w); +extern int galois_inverse(int x, int w); + +void galois_region_xor( char *src, /* Source Region */ + char *dest, /* Dest Region (holds result) */ + int nbytes); /* Number of bytes in region */ + +/* These multiply regions in w=8, w=16 and w=32. They are much faster + than calling galois_single_multiply. The regions must be long word aligned. */ + +void galois_w08_region_multiply(char *region, /* Region to multiply */ + int multby, /* Number to multiply by */ + int nbytes, /* Number of bytes in region */ + char *r2, /* If r2 != NULL, products go here. + Otherwise region is overwritten */ + int add); /* If (r2 != NULL && add) the produce is XOR'd with r2 */ + +void galois_w16_region_multiply(char *region, /* Region to multiply */ + int multby, /* Number to multiply by */ + int nbytes, /* Number of bytes in region */ + char *r2, /* If r2 != NULL, products go here. + Otherwise region is overwritten */ + int add); /* If (r2 != NULL && add) the produce is XOR'd with r2 */ + +void galois_w32_region_multiply(char *region, /* Region to multiply */ + int multby, /* Number to multiply by */ + int nbytes, /* Number of bytes in region */ + char *r2, /* If r2 != NULL, products go here. + Otherwise region is overwritten */ + int add); /* If (r2 != NULL && add) the produce is XOR'd with r2 */ + +gf_t* galois_init_field(int w, + int mult_type, + int region_type, + int divide_type, + uint64_t prim_poly, + int arg1, + int arg2); + +gf_t* galois_init_composite_field(int w, + int region_type, + int divide_type, + int degree, + gf_t* base_gf); + +gf_t * galois_get_field_ptr(int w); + + +#endif diff --git a/src/erasure-code/jerasure/jerasure/include/jerasure.h b/src/erasure-code/jerasure/jerasure/include/jerasure.h new file mode 100644 index 000000000000..08367809b715 --- /dev/null +++ b/src/erasure-code/jerasure/jerasure/include/jerasure.h @@ -0,0 +1,294 @@ +/* * + * Copyright (c) 2013, James S. Plank and Kevin Greenan + * All rights reserved. 
+ * + * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure + * Coding Techniques + * + * Revision 2.0: Galois Field backend now links to GF-Complete + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef _JERASURE_H +#define _JERASURE_H + +/* This uses procedures from the Galois Field arithmetic library */ + +#include "galois.h" + +/* ------------------------------------------------------------ */ +/* In all of the routines below: + + k = Number of data devices + m = Number of coding devices + w = Word size + + data_ptrs = An array of k pointers to data which is size bytes. + Size must be a multiple of sizeof(long). + Pointers must also be longword aligned. + + coding_ptrs = An array of m pointers to coding data which is size bytes. + + packetsize = The size of a coding block with bitmatrix coding. + When you code with a bitmatrix, you will use w packets + of size packetsize. + + matrix = an array of k*m integers. + It represents an m by k matrix. + Element i,j is in matrix[i*k+j]; + + bitmatrix = an array of k*m*w*w integers. + It represents an mw by kw matrix. + Element i,j is in matrix[i*k*w+j]; + + erasures = an array of id's of erased devices. + Id's are integers between 0 and k+m-1. + Id's 0 to k-1 are id's of data devices. + Id's k to k+m-1 are id's of coding devices: + Coding device id = id-k. + If there are e erasures, erasures[e] = -1. + + schedule = an array of schedule operations. + + If there are m operations, then schedule[m][0] = -1. + + operation = an array of 5 integers: + + 0 = operation: 0 for copy, 1 for xor (-1 for end) + 1 = source device (0 - k+m-1) + 2 = source packet (0 - w-1) + 3 = destination device (0 - k+m-1) + 4 = destination packet (0 - w-1) + */ + +/* --------------------------------------------------------------- */ +/* Bitmatrices / schedules ---------------------------------------- */ +/* + - jerasure_matrix_to_bitmatrix turns a m X k matrix in GF(2^w) into a + wm X wk bitmatrix (in GF(2)). This is + explained in the Cauchy Reed-Solomon coding + paper. 
+
+ - jerasure_dumb_bitmatrix_to_schedule turns a bitmatrix into a schedule
+ using the straightforward algorithm -- just
+ schedule the dot products defined by each
+ row of the matrix.
+
+ - jerasure_smart_bitmatrix_to_schedule turns a bitmatrix into a schedule,
+ but tries to use previous dot products to
+ calculate new ones. This is the optimization
+ explained in the original Liberation code paper.
+
+ - jerasure_generate_schedule_cache precalculates all of the schedules for the
+ given distribution bitmatrix. m must equal 2.
+
+ - jerasure_free_schedule frees a schedule that was allocated with
+ jerasure_XXX_bitmatrix_to_schedule.
+
+ - jerasure_free_schedule_cache frees a schedule cache that was created with
+ jerasure_generate_schedule_cache.
+ */
+
+int *jerasure_matrix_to_bitmatrix(int k, int m, int w, int *matrix);
+int **jerasure_dumb_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix);
+int **jerasure_smart_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix);
+int ***jerasure_generate_schedule_cache(int k, int m, int w, int *bitmatrix, int smart);
+
+void jerasure_free_schedule(int **schedule);
+void jerasure_free_schedule_cache(int k, int m, int ***cache);
+
+
+/* ------------------------------------------------------------ */
+/* Encoding - these are all straightforward. jerasure_matrix_encode only
+ works with w = 8|16|32. */
+
+void jerasure_do_parity(int k, char **data_ptrs, char *parity_ptr, int size);
+
+void jerasure_matrix_encode(int k, int m, int w, int *matrix,
+ char **data_ptrs, char **coding_ptrs, int size);
+
+void jerasure_bitmatrix_encode(int k, int m, int w, int *bitmatrix,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+void jerasure_schedule_encode(int k, int m, int w, int **schedule,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+/* ------------------------------------------------------------ */
+/* Decoding. -------------------------------------------------- */
+
+/* These return integers, because the matrix may not be invertible.
+
+ The parameter row_k_ones should be set to 1 if row k of the matrix
+ (or rows kw to (k+1)w-1 of the distribution bitmatrix) are all ones
+ (or all identity matrices). Then you can improve the performance
+ of decoding when there is more than one failure, and the parity
+ device didn't fail. You do it by decoding all but one of the data
+ devices, and then decoding the last data device from the data devices
+ and the parity device.
+
+ jerasure_schedule_decode_lazy generates the schedule on the fly.
+
+ jerasure_matrix_decode only works when w = 8|16|32.
+
+ jerasure_make_decoding_matrix/bitmatrix make the k*k decoding matrix
+ (or wk*wk bitmatrix) by taking the rows corresponding to k
+ non-erased devices of the distribution matrix, and then
+ inverting that matrix.
+
+ You should already have allocated the decoding matrix and
+ dm_ids, which is a vector of k integers. These will be
+ filled in appropriately. dm_ids[i] is the id of element
+ i of the survivors vector. I.e. row i of the decoding matrix
+ times dm_ids equals data drive i.
+
+ Both of these routines take "erased" instead of "erasures".
+ Erased is a vector with k+m elements, which has 0 or 1 for
+ each device's id, according to whether the device is erased.
+
+ jerasure_erasures_to_erased allocates and returns erased from erasures.
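+
+ For illustration only, a typical round trip through these routines might
+ look as follows -- assuming k = 4, m = 2, w = 8, caller-allocated
+ data_ptrs, coding_ptrs and size obeying the alignment rules at the top of
+ this header, and a coding matrix taken from reed_sol.h (error handling
+ omitted):
+
+ int *matrix = reed_sol_vandermonde_coding_matrix(k, m, w);
+ jerasure_matrix_encode(k, m, w, matrix, data_ptrs, coding_ptrs, size);
+
+ int erasures[3];
+ erasures[0] = 0; -- data device 0 lost
+ erasures[1] = 5; -- coding device 1 (id k+1) lost
+ erasures[2] = -1; -- the erasure list is terminated by -1
+ jerasure_matrix_decode(k, m, w, matrix, 0, erasures,
+ data_ptrs, coding_ptrs, size);
+
+ Passing 0 for row_k_ones is always safe; pass 1 only if row k of the
+ distribution matrix is known to be all ones.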
+
+ */
+
+int jerasure_matrix_decode(int k, int m, int w,
+ int *matrix, int row_k_ones, int *erasures,
+ char **data_ptrs, char **coding_ptrs, int size);
+
+int jerasure_bitmatrix_decode(int k, int m, int w,
+ int *bitmatrix, int row_k_ones, int *erasures,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+int jerasure_schedule_decode_lazy(int k, int m, int w, int *bitmatrix, int *erasures,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize,
+ int smart);
+
+int jerasure_schedule_decode_cache(int k, int m, int w, int ***scache, int *erasures,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+int jerasure_make_decoding_matrix(int k, int m, int w, int *matrix, int *erased,
+ int *decoding_matrix, int *dm_ids);
+
+int jerasure_make_decoding_bitmatrix(int k, int m, int w, int *matrix, int *erased,
+ int *decoding_matrix, int *dm_ids);
+
+int *jerasure_erasures_to_erased(int k, int m, int *erasures);
+
+/* ------------------------------------------------------------ */
+/* These perform dot products and schedules. -------------------*/
+/*
+ src_ids is an array of k ids (0 - k-1 for data devices, k - k+m-1
+ for coding devices) that identify the source devices. Dest_id is
+ the id of the destination device.
+
+ jerasure_matrix_dotprod only works when w = 8|16|32.
+
+ jerasure_do_scheduled_operations executes the schedule on w*packetsize worth of
+ bytes from each device. ptrs is an array of pointers which should have as many
+ elements as the highest referenced device in the schedule.
+
+ */
+
+void jerasure_matrix_dotprod(int k, int w, int *matrix_row,
+ int *src_ids, int dest_id,
+ char **data_ptrs, char **coding_ptrs, int size);
+
+void jerasure_bitmatrix_dotprod(int k, int w, int *bitmatrix_row,
+ int *src_ids, int dest_id,
+ char **data_ptrs, char **coding_ptrs, int size, int packetsize);
+
+void jerasure_do_scheduled_operations(char **ptrs, int **schedule, int packetsize);
+
+/* ------------------------------------------------------------ */
+/* Matrix Inversion ------------------------------------------- */
+/*
+ The two matrix inversion functions work on rows*rows matrices of
+ ints. If a bitmatrix, then each int will just be zero or one.
+ Otherwise, they will be elements of GF(2^w). Obviously, you can
+ invert a bitmatrix with jerasure_invert_matrix() and w = 1, but
+ jerasure_invert_bitmatrix() will be more efficient.
+
+ The two invertible functions return whether a matrix is invertible.
+ They are more efficient than the inversion functions.
+
+ Mat will be destroyed when the matrix inversion or invertible
+ testing is done. Sorry.
+
+ Inv must be allocated by the caller.
+
+ The two invert_matrix functions return 0 on success, and -1 if the
+ matrix is uninvertible.
+
+ The two invertible functions simply return whether the matrix is
+ invertible (0 or 1). Mat will be destroyed.
+ */
+
+int jerasure_invert_matrix(int *mat, int *inv, int rows, int w);
+int jerasure_invert_bitmatrix(int *mat, int *inv, int rows);
+int jerasure_invertible_matrix(int *mat, int rows, int w);
+int jerasure_invertible_bitmatrix(int *mat, int rows);
+
+/* ------------------------------------------------------------ */
+/* Basic matrix operations -------------------------------------*/
+/*
+ Each of the print_matrix routines requires a w. In jerasure_print_matrix,
+ this is to calculate the field width. In jerasure_print_bitmatrix, it is
+ to put spaces between the bits.
+
+ jerasure_matrix_multiply is a simple matrix multiplier in GF(2^w).
It returns a r1*c2 + matrix, which is the product of the two input matrices. It allocates + the product. Obviously, c1 should equal r2. However, this is not + validated by the procedure. +*/ + +void jerasure_print_matrix(int *matrix, int rows, int cols, int w); +void jerasure_print_bitmatrix(int *matrix, int rows, int cols, int w); + + +int *jerasure_matrix_multiply(int *m1, int *m2, int r1, int c1, int r2, int c2, int w); + +/* ------------------------------------------------------------ */ +/* Stats ------------------------------------------------------ */ +/* + jerasure_get_stats fills in a vector of three doubles: + + fill_in[0] is the number of bytes that have been XOR'd + fill_in[1] is the number of bytes that have been copied + fill_in[2] is the number of bytes that have been multiplied + by a constant in GF(2^w) + + When jerasure_get_stats() is called, it resets its values. + */ + +void jerasure_get_stats(double *fill_in); + +int jerasure_autoconf_test(); + +#endif diff --git a/src/erasure-code/jerasure/jerasure/include/liberation.h b/src/erasure-code/jerasure/jerasure/include/liberation.h new file mode 100644 index 000000000000..f2fb7233fcf9 --- /dev/null +++ b/src/erasure-code/jerasure/jerasure/include/liberation.h @@ -0,0 +1,47 @@ +/* * + * Copyright (c) 2013, James S. Plank and Kevin Greenan + * All rights reserved. + * + * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure + * Coding Techniques + * + * Revision 2.0: Galois Field backend now links to GF-Complete + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _LIBERATION + +extern int *liberation_coding_bitmatrix(int k, int w); +extern int *liber8tion_coding_bitmatrix(int k); +extern int *blaum_roth_coding_bitmatrix(int k, int w); + +#endif diff --git a/src/erasure-code/jerasure/jerasure/include/reed_sol.h b/src/erasure-code/jerasure/jerasure/include/reed_sol.h new file mode 100644 index 000000000000..d2d8fe8caf2f --- /dev/null +++ b/src/erasure-code/jerasure/jerasure/include/reed_sol.h @@ -0,0 +1,50 @@ +/* * + * Copyright (c) 2013, James S. Plank and Kevin Greenan + * All rights reserved. + * + * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure + * Coding Techniques + * + * Revision 2.0: Galois Field backend now links to GF-Complete + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +extern int *reed_sol_vandermonde_coding_matrix(int k, int m, int w); +extern int *reed_sol_extended_vandermonde_matrix(int rows, int cols, int w); +extern int *reed_sol_big_vandermonde_distribution_matrix(int rows, int cols, int w); + +extern int reed_sol_r6_encode(int k, int w, char **data_ptrs, char **coding_ptrs, int size); +extern int *reed_sol_r6_coding_matrix(int k, int w); + +extern void reed_sol_galois_w08_region_multby_2(char *region, int nbytes); +extern void reed_sol_galois_w16_region_multby_2(char *region, int nbytes); +extern void reed_sol_galois_w32_region_multby_2(char *region, int nbytes); diff --git a/src/erasure-code/jerasure/jerasure/src/cauchy.c b/src/erasure-code/jerasure/jerasure/src/cauchy.c new file mode 100644 index 000000000000..f63dfb7eab48 --- /dev/null +++ b/src/erasure-code/jerasure/jerasure/src/cauchy.c @@ -0,0 +1,405 @@ +/* * + * Copyright (c) 2014, James S. Plank and Kevin Greenan + * All rights reserved. 
+ * + * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure + * Coding Techniques + * + * Revision 2.0: Galois Field backend now links to GF-Complete + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* Jerasure's authors: + + Revision 2.x - 2014: James S. Plank and Kevin M. Greenan + Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman. + Revision 1.0 - 2007: James S. 
Plank + */ + +#include +#include +#include + +#include "galois.h" +#include "jerasure.h" +#include "cauchy.h" + +static int PPs[33] = { -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1 }; +static int NOs[33]; +static int ONEs[33][33]; + +static int *cbest_0; +static int *cbest_1; +static int cbest_2[3]; +static int cbest_3[7]; +static int cbest_4[15]; +static int cbest_5[31]; +static int cbest_6[63]; +static int cbest_7[127]; +static int cbest_8[255]; +static int cbest_9[511]; +static int cbest_10[1023]; +static int cbest_11[1023]; +static int *cbest_12, *cbest_13, *cbest_14, *cbest_15, *cbest_16, *cbest_17, *cbest_18, *cbest_19, *cbest_20, + *cbest_21, *cbest_22, *cbest_23, *cbest_24, *cbest_25, *cbest_26, *cbest_27, *cbest_28, *cbest_29, *cbest_30, + *cbest_31, *cbest_32; + +static int cbest_max_k[33] = { -1, -1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 1023, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1 }; + +static int cbest_init = 0; + +static int *cbest_all[33]; + + +#define talloc(type, num) (type *) malloc(sizeof(type)*(num)) + +int cauchy_n_ones(int n, int w) +{ + int no; + int cno; + int nones; + int i, j; + int highbit; + + highbit = (1 << (w-1)); + + if (PPs[w] == -1) { + nones = 0; + PPs[w] = galois_single_multiply(highbit, 2, w); + for (i = 0; i < w; i++) { + if (PPs[w] & (1 << i)) { + ONEs[w][nones] = (1 << i); + nones++; + } + } + NOs[w] = nones; + } + + no = 0; + for (i = 0; i < w; i++) if (n & (1 << i)) no++; + cno = no; + for (i = 1; i < w; i++) { + if (n & highbit) { + n ^= highbit; + n <<= 1; + n ^= PPs[w]; + cno--; + for (j = 0; j < NOs[w]; j++) { + cno += (n & ONEs[w][j]) ? 1 : -1; + } + } else { + n <<= 1; + } + no += cno; + } + return no; +} + +int *cauchy_original_coding_matrix(int k, int m, int w) +{ + int *matrix; + int i, j, index; + + if (w < 31 && (k+m) > (1 << w)) return NULL; + matrix = talloc(int, k*m); + if (matrix == NULL) return NULL; + index = 0; + for (i = 0; i < m; i++) { + for (j = 0; j < k; j++) { + matrix[index] = galois_single_divide(1, (i ^ (m+j)), w); + index++; + } + } + return matrix; +} + +int *cauchy_xy_coding_matrix(int k, int m, int w, int *X, int *Y) +{ + int index, i, j; + int *matrix; + + matrix = talloc(int, k*m); + if (matrix == NULL) { return NULL; } + index = 0; + for (i = 0; i < m; i++) { + for (j = 0; j < k; j++) { + matrix[index] = galois_single_divide(1, (X[i] ^ Y[j]), w); + index++; + } + } + return matrix; +} + +void cauchy_improve_coding_matrix(int k, int m, int w, int *matrix) +{ + int index, i, j, x; + int tmp; + int bno, tno, bno_index; + + for (j = 0; j < k; j++) { + if (matrix[j] != 1) { + tmp = galois_single_divide(1, matrix[j], w); + index = j; + for (i = 0; i < m; i++) { + matrix[index] = galois_single_multiply(matrix[index], tmp, w); + index += k; + } + } + } + for (i = 1; i < m; i++) { + bno = 0; + index = i*k; + for (j = 0; j < k; j++) bno += cauchy_n_ones(matrix[index+j], w); + bno_index = -1; + for (j = 0; j < k; j++) { + if (matrix[index+j] != 1) { + tmp = galois_single_divide(1, matrix[index+j], w); + tno = 0; + for (x = 0; x < k; x++) { + tno += cauchy_n_ones(galois_single_multiply(matrix[index+x], tmp, w), w); + } + if (tno < bno) { + bno = tno; + bno_index = j; + } + } + } + if (bno_index != -1) { + tmp = galois_single_divide(1, matrix[index+bno_index], w); + for (j = 0; j < k; j++) { + matrix[index+j] = galois_single_multiply(matrix[index+j], tmp, w); + } + } + } +} + +int 
*cauchy_good_general_coding_matrix(int k, int m, int w) +{ + int *matrix, i; + + if (m == 2 && k <= cbest_max_k[w]) { + matrix = talloc(int, k*m); + if (matrix == NULL) return NULL; + if (!cbest_init) { + cbest_init = 1; + cbest_all[0] = cbest_0; cbest_all[1] = cbest_1; cbest_all[2] = cbest_2; cbest_all[3] = cbest_3; cbest_all[4] = + cbest_4; cbest_all[5] = cbest_5; cbest_all[6] = cbest_6; cbest_all[7] = cbest_7; cbest_all[8] = cbest_8; + cbest_all[9] = cbest_9; cbest_all[10] = cbest_10; cbest_all[11] = cbest_11; cbest_all[12] = cbest_12; + cbest_all[13] = cbest_13; cbest_all[14] = cbest_14; cbest_all[15] = cbest_15; cbest_all[16] = cbest_16; + cbest_all[17] = cbest_17; cbest_all[18] = cbest_18; cbest_all[19] = cbest_19; cbest_all[20] = cbest_20; + cbest_all[21] = cbest_21; cbest_all[22] = cbest_22; cbest_all[23] = cbest_23; cbest_all[24] = cbest_24; + cbest_all[25] = cbest_25; cbest_all[26] = cbest_26; cbest_all[27] = cbest_27; cbest_all[28] = cbest_28; + cbest_all[29] = cbest_29; cbest_all[30] = cbest_30; cbest_all[31] = cbest_31; cbest_all[32] = (int *) cbest_32; + } + for (i = 0; i < k; i++) { + matrix[i] = 1; + matrix[i+k] = cbest_all[w][i]; + } + return matrix; + } else { + matrix = cauchy_original_coding_matrix(k, m, w); + if (matrix == NULL) return NULL; + cauchy_improve_coding_matrix(k, m, w, matrix); + return matrix; + } +} + +static int cbest_2[3] = { 1, 2, 3 }; +static int cbest_3[7] = { 1, 2, 5, 4, 7, 3, 6 }; + +static int cbest_4[15] = { 1, 2, 9, 4, 8, 13, 3, 6, 12, 5, 11, 15, 10, 14, 7 }; + +static int cbest_5[31] = { 1, 2, 18, 4, 9, 8, 22, 16, 3, 11, 19, 5, 10, 6, 20, 27, 13, 23, 26, 12, + 17, 25, 24, 31, 30, 7, 15, 21, 29, 14, 28 }; + +static int cbest_6[63] = { 1, 2, 33, 4, 8, 49, 16, 32, 57, 3, 6, 12, 24, 48, 5, 35, 9, 37, 10, 17, + 41, 51, 56, 61, 18, 28, 53, 14, 20, 34, 7, 13, 25, 36, 59, 26, 39, 40, 45, 50, 60, 52, 63, + 11, 30, 55, 19, 22, 29, 43, 58, 15, 21, 38, 44, 47, 62, 27, 54, 42, 31, 23, 46 }; + +static int cbest_7[127] = { 1, 2, 68, 4, 34, 8, 17, 16, 76, 32, 38, 3, 64, 69, 5, 19, 35, 70, 6, 9, + 18, 102, 10, 36, 85, 12, 21, 42, 51, 72, 77, 84, 20, 25, 33, 50, 78, 98, 24, 39, 49, 100, 110 + , 48, 65, 93, 40, 66, 71, 92, 7, 46, 55, 87, 96, 103, 106, 11, 23, 37, 54, 81, 86, 108, 13, + 22, 27, 43, 53, 73, 80, 14, 26, 52, 74, 79, 99, 119, 44, 95, 101, 104, 111, 118, 29, 59, 89, + 94, 117, 28, 41, 58, 67, 88, 115, 116, 47, 57, 83, 97, 107, 114, 127, 56, 82, 109, 113, 126, + 112, 125, 15, 63, 75, 123, 124, 31, 45, 62, 91, 105, 122, 30, 61, 90, 121, 60, 120 }; + +static int cbest_8[255] = { 1, 2, 142, 4, 71, 8, 70, 173, 3, 35, 143, 16, 17, 67, 134, 140, 172, 6, 34 + , 69, 201, 216, 5, 33, 86, 12, 65, 138, 158, 159, 175, 10, 32, 43, 66, 108, 130, 193, 234, 9, + 24, 25, 50, 68, 79, 100, 132, 174, 200, 217, 20, 21, 42, 48, 87, 169, 41, 54, 64, 84, 96, 117 + , 154, 155, 165, 226, 77, 82, 135, 136, 141, 168, 192, 218, 238, 7, 18, 19, 39, 40, 78, 113, + 116, 128, 164, 180, 195, 205, 220, 232, 14, 26, 27, 58, 109, 156, 157, 203, 235, 13, 28, 29, 38 + , 51, 56, 75, 85, 90, 101, 110, 112, 139, 171, 11, 37, 49, 52, 76, 83, 102, 119, 131, 150, 151 + , 167, 182, 184, 188, 197, 219, 224, 45, 55, 80, 94, 97, 133, 170, 194, 204, 221, 227, 236, 36, + 47, 73, 92, 98, 104, 118, 152, 153, 166, 202, 207, 239, 251, 22, 23, 44, 74, 91, 148, 149, 161 + , 181, 190, 233, 46, 59, 88, 137, 146, 147, 163, 196, 208, 212, 222, 250, 57, 81, 95, 106, 111, + 129, 160, 176, 199, 243, 249, 15, 53, 72, 93, 103, 115, 125, 162, 183, 185, 189, 206, 225, 255, + 186, 210, 230, 237, 242, 248, 30, 31, 62, 89, 
99, 105, 114, 121, 124, 178, 209, 213, 223, 228, + 241, 254, 60, 191, 198, 247, 120, 240, 107, 127, 144, 145, 177, 211, 214, 246, 245, 123, 126, + 187, 231, 253, 63, 179, 229, 244, 61, 122, 215, 252 }; + +static int cbest_9[511] = { 1, 2, 264, 4, 132, 8, 66, 16, 33, 32, 280, 64, 140, 128, 3, 70, 265, 5, + 133, 256, 266, 6, 9, 35, 67, 134, 268, 396, 10, 17, 34, 330, 12, 18, 68, 198, 297, 20, 37, 74 + , 136, 148, 165, 281, 296, 24, 36, 41, 65, 82, 99, 164, 272, 282, 388, 40, 49, 98, 141, 194, + 284, 328, 412, 48, 97, 129, 142, 196, 346, 71, 72, 96, 130, 313, 392, 80, 206, 257, 267, 312, + 334, 7, 135, 156, 173, 192, 258, 269, 397, 404, 11, 78, 144, 161, 172, 260, 270, 299, 331, 344, + 398, 13, 19, 39, 69, 86, 103, 160, 167, 199, 202, 298, 322, 384, 14, 21, 38, 43, 75, 102, 137, + 149, 166, 204, 289, 332, 408, 462, 22, 25, 42, 51, 83, 101, 138, 150, 273, 283, 288, 301, 350, + 389, 429, 26, 50, 76, 100, 195, 274, 285, 300, 329, 363, 390, 413, 428, 28, 45, 84, 143, 197, + 200, 214, 231, 276, 286, 315, 320, 347, 362, 414, 458, 44, 53, 73, 90, 107, 131, 152, 169, 181, + 230, 314, 338, 361, 393, 400, 454, 460, 52, 57, 81, 106, 115, 168, 175, 180, 207, 229, 305, 335 + , 348, 360, 394, 421, 478, 56, 105, 114, 157, 163, 174, 193, 210, 227, 228, 259, 304, 317, 326, + 405, 420, 445, 79, 104, 113, 145, 158, 162, 212, 226, 261, 271, 316, 345, 379, 399, 406, 444, + 450, 456, 87, 88, 112, 146, 203, 225, 262, 291, 323, 336, 378, 385, 425, 452, 474, 15, 205, 222 + , 224, 239, 290, 303, 333, 367, 377, 386, 409, 424, 431, 463, 470, 476, 23, 139, 151, 189, 208, + 238, 302, 324, 351, 366, 376, 410, 430, 437, 27, 47, 77, 94, 111, 177, 188, 237, 275, 293, 342, + 365, 391, 436, 448, 29, 46, 55, 85, 110, 119, 171, 176, 183, 201, 215, 218, 235, 236, 277, 287, + 292, 321, 355, 364, 415, 417, 459, 466, 472, 30, 54, 59, 91, 109, 118, 153, 170, 182, 220, 234, + 278, 307, 339, 354, 401, 416, 423, 441, 455, 461, 468, 495, 58, 108, 117, 154, 233, 306, 319, + 349, 353, 383, 395, 402, 422, 440, 447, 479, 494, 92, 116, 211, 232, 318, 327, 340, 352, 382, + 446, 493, 61, 159, 213, 216, 247, 309, 381, 407, 427, 451, 457, 464, 491, 492, 60, 89, 123, 147 + , 185, 246, 263, 308, 337, 371, 380, 426, 433, 453, 475, 487, 490, 122, 184, 191, 223, 245, 370, + 387, 432, 439, 471, 477, 486, 489, 511, 121, 179, 190, 209, 243, 244, 295, 325, 359, 369, 411, + 438, 485, 488, 510, 95, 120, 178, 242, 294, 343, 358, 368, 419, 449, 483, 484, 509, 219, 241, + 357, 418, 443, 467, 473, 482, 507, 508, 31, 221, 240, 255, 279, 356, 442, 469, 481, 503, 506, + 155, 254, 403, 480, 502, 505, 63, 93, 127, 253, 311, 341, 375, 501, 504, 62, 126, 187, 217, 251 + , 252, 310, 374, 435, 465, 499, 500, 125, 186, 250, 373, 434, 498, 124, 249, 372, 497, 248, 496 + }; + +static int cbest_10[1023] = { 1, 2, 516, 4, 258, 8, 129, 16, 32, 580, 64, 128, 290, 145, 256, 3, 512, + 517, 5, 259, 518, 588, 6, 9, 18, 36, 72, 144, 774, 10, 17, 131, 262, 288, 524, 645, 12, 33, + 133, 266, 294, 387, 532, 576, 581, 20, 34, 65, 137, 274, 548, 582, 24, 66, 291, 838, 40, 68, + 130, 147, 161, 322, 644, 709, 806, 48, 132, 193, 257, 386, 596, 80, 136, 298, 419, 612, 661, 772 + , 96, 149, 260, 272, 306, 403, 513, 146, 153, 160, 264, 292, 385, 514, 519, 544, 584, 589, 708, + 870, 7, 19, 37, 73, 192, 354, 590, 770, 775, 11, 38, 74, 177, 263, 289, 418, 520, 525, 534, 641 + , 660, 725, 802, 836, 846, 13, 22, 76, 148, 209, 267, 295, 320, 330, 402, 526, 528, 533, 577, + 647, 717, 804, 14, 21, 26, 35, 44, 135, 152, 165, 201, 275, 304, 384, 401, 435, 549, 578, 583, + 604, 608, 782, 903, 25, 52, 67, 
88, 139, 270, 296, 391, 417, 550, 620, 653, 790, 834, 839, 41, + 50, 69, 104, 141, 176, 278, 302, 323, 395, 423, 540, 598, 640, 705, 724, 807, 866, 28, 42, 49, + 70, 82, 100, 163, 208, 282, 310, 556, 592, 597, 646, 663, 677, 711, 716, 868, 878, 81, 134, 151 + , 164, 195, 200, 299, 326, 352, 362, 400, 434, 564, 613, 657, 768, 773, 902, 967, 97, 138, 155, + 169, 197, 261, 273, 307, 358, 390, 416, 433, 451, 614, 652, 733, 800, 814, 844, 854, 935, 56, 84 + , 98, 140, 181, 217, 265, 293, 328, 338, 394, 422, 515, 545, 585, 704, 788, 822, 871, 919, 162, + 179, 276, 355, 407, 427, 546, 586, 591, 616, 662, 669, 676, 710, 727, 741, 771, 780, 901, 39, 75 + , 150, 157, 194, 211, 225, 268, 280, 308, 314, 389, 411, 439, 521, 530, 535, 628, 656, 721, 803, + 832, 837, 842, 847, 966, 23, 77, 112, 154, 168, 196, 300, 321, 331, 393, 421, 432, 450, 522, 527 + , 529, 552, 606, 643, 673, 693, 713, 732, 805, 864, 874, 934, 999, 15, 27, 45, 54, 78, 90, 108, + 180, 216, 305, 483, 560, 579, 600, 605, 609, 719, 778, 783, 852, 876, 886, 899, 918, 983, 46, 53 + , 89, 167, 178, 185, 203, 213, 271, 297, 324, 334, 336, 360, 370, 406, 426, 467, 542, 551, 610, + 621, 649, 668, 726, 740, 786, 791, 810, 820, 835, 900, 917, 931, 951, 965, 975, 30, 51, 105, 156 + , 205, 210, 224, 279, 303, 356, 366, 388, 405, 410, 438, 449, 459, 536, 541, 594, 599, 622, 655, + 720, 812, 818, 862, 867, 933, 29, 43, 71, 83, 92, 101, 106, 143, 173, 283, 311, 312, 346, 392, + 409, 420, 437, 443, 557, 566, 593, 642, 659, 672, 692, 707, 712, 737, 757, 869, 879, 911, 998, + 60, 102, 241, 327, 353, 363, 399, 425, 482, 558, 565, 624, 679, 718, 735, 749, 769, 798, 898, + 963, 982, 58, 86, 166, 183, 184, 202, 212, 219, 233, 286, 359, 431, 466, 615, 636, 648, 689, 729 + , 801, 815, 840, 845, 850, 855, 884, 916, 930, 950, 964, 974, 981, 995, 1015, 57, 85, 99, 120, + 171, 199, 204, 229, 318, 329, 339, 368, 404, 448, 458, 465, 499, 654, 671, 685, 784, 789, 823, + 872, 882, 915, 932, 949, 997, 1007, 116, 142, 159, 172, 277, 408, 436, 442, 455, 481, 491, 547, + 572, 587, 617, 630, 658, 665, 706, 723, 736, 756, 776, 781, 816, 860, 894, 897, 910, 947, 991, + 114, 221, 240, 269, 281, 309, 315, 332, 342, 344, 378, 398, 424, 441, 475, 487, 531, 618, 629, + 678, 695, 734, 743, 748, 808, 833, 843, 929, 943, 962, 973, 113, 182, 189, 218, 227, 232, 301, + 364, 374, 430, 457, 523, 553, 562, 602, 607, 688, 728, 753, 796, 830, 865, 875, 927, 980, 994, + 1014, 55, 79, 91, 109, 170, 187, 198, 215, 228, 284, 415, 464, 498, 554, 561, 601, 670, 675, 684 + , 715, 745, 765, 779, 848, 853, 877, 887, 909, 914, 948, 979, 996, 1006, 1013, 47, 110, 158, 249 + , 316, 325, 335, 337, 361, 371, 397, 447, 454, 480, 490, 497, 538, 543, 611, 632, 664, 722, 787, + 811, 821, 880, 896, 913, 946, 961, 971, 990, 1011, 31, 94, 220, 245, 357, 367, 429, 440, 474, + 486, 537, 595, 623, 651, 681, 694, 701, 742, 759, 813, 819, 858, 863, 892, 928, 942, 945, 972, + 989, 993, 1003, 1023, 62, 93, 107, 188, 207, 226, 237, 243, 313, 340, 347, 376, 456, 471, 473, + 507, 567, 568, 626, 752, 890, 907, 926, 1005, 61, 103, 124, 175, 186, 214, 372, 414, 453, 463, + 489, 503, 559, 625, 638, 674, 691, 714, 731, 739, 744, 764, 794, 799, 828, 908, 925, 939, 959, + 978, 1012, 59, 87, 122, 248, 287, 350, 396, 413, 446, 485, 495, 496, 637, 751, 826, 841, 851, + 885, 912, 941, 960, 970, 977, 1010, 118, 121, 235, 244, 319, 369, 382, 428, 445, 574, 650, 667, + 680, 700, 758, 761, 785, 873, 883, 944, 988, 992, 1002, 1009, 1022, 117, 206, 223, 231, 236, 242 + , 470, 472, 506, 573, 631, 687, 777, 817, 856, 861, 895, 906, 987, 
1004, 1021, 115, 174, 191, 333 + , 343, 345, 379, 452, 462, 469, 488, 502, 505, 619, 690, 697, 730, 738, 755, 809, 888, 924, 938, + 958, 969, 1019, 253, 365, 375, 412, 484, 494, 501, 563, 603, 750, 767, 792, 797, 831, 923, 940, + 957, 976, 1001, 234, 251, 285, 348, 444, 479, 555, 634, 666, 760, 824, 849, 905, 955, 1008, 111, + 222, 230, 247, 317, 380, 461, 511, 539, 633, 686, 703, 747, 881, 937, 986, 1020, 95, 190, 468, + 493, 504, 570, 696, 754, 859, 893, 968, 985, 1018, 63, 126, 252, 341, 377, 500, 569, 627, 683, + 766, 891, 922, 956, 1000, 1017, 125, 239, 250, 373, 478, 639, 795, 829, 904, 921, 954, 123, 246, + 351, 460, 477, 510, 702, 746, 763, 827, 936, 953, 119, 383, 492, 509, 575, 984, 682, 699, 857, + 1016, 238, 255, 889, 920, 476, 762, 793, 952, 349, 508, 635, 825, 381, 698, 254, 571, 127 }; + +static int cbest_11[1023] = { 1, + 2, 1026, 4, 513, 8, 16, 1282, 32, 64, 641, 128, 256, 512, 1346, 1024, 3, 673, 1027, 5, 10, 20, 40, 80, 160, 320, + 640, 6, 9, 515, 1030, 1280, 1539, 17, 517, 1034, 1283, 12, 18, 33, 521, 1042, 1362, 34, 65, 529, 1058, 1286, 1795, + 24, 36, 66, 129, 545, 643, 1090, 1290, 1667, 68, 130, 257, 577, 645, 672, 1154, 1298, 1344, 48, 72, 132, 258, 336, + 649, 681, 1314, 1347, 136, 168, 260, 514, 657, 769, 1538, 1923, 84, 96, 144, 264, 516, 1025, 1350, 1410, 1859, 42, + 272, 520, 705, 1032, 1354, 11, 21, 41, 81, 161, 192, 288, 321, 528, 675, 1028, 1537, 1699, 1794, 7, 22, 82, 162, + 322, 544, 642, 677, 897, 1031, 1046, 1066, 1106, 1186, 1281, 1366, 1378, 1666, 14, 44, 164, 324, 384, 523, 533, + 553, 576, 593, 644, 833, 1035, 1040, 1288, 1360, 1987, 13, 19, 28, 88, 328, 519, 648, 680, 689, 1043, 1056, 1284, + 1363, 1474, 1543, 1793, 1955, 26, 35, 56, 176, 656, 768, 1038, 1059, 1088, 1287, 1302, 1322, 1442, 1547, 1665, + 1922, 25, 37, 52, 67, 112, 340, 352, 525, 531, 737, 1091, 1152, 1291, 1296, 1555, 1858, 1875, 38, 69, 74, 104, 131, + 224, 547, 651, 661, 683, 704, 721, 961, 1050, 1062, 1155, 1299, 1312, 1345, 1370, 1571, 1799, 49, 70, 73, 133, 138, + 148, 170, 208, 259, 337, 448, 537, 549, 579, 647, 674, 929, 1094, 1294, 1315, 1352, 1536, 1603, 1671, 1698, 1803, + 1921, 50, 134, 137, 169, 261, 266, 276, 296, 338, 416, 581, 676, 896, 1074, 1098, 1158, 1348, 1394, 1408, 1675, + 1707, 1811, 1857, 2019, 76, 85, 97, 145, 262, 265, 522, 532, 552, 561, 585, 592, 653, 659, 685, 771, 832, 849, + 1064, 1162, 1194, 1306, 1318, 1351, 1386, 1411, 1506, 1683, 1827, 1986, 2003, 43, 86, 98, 140, 146, 172, 273, 344, + 518, 688, 773, 1033, 1110, 1122, 1170, 1355, 1490, 1542, 1697, 1792, 1927, 1954, 100, 193, 268, 274, 289, 597, 609, + 665, 697, 707, 777, 1029, 1044, 1104, 1184, 1330, 1364, 1376, 1414, 1546, 1664, 1731, 1863, 1931, 1963, 23, 46, 83, + 92, 152, 163, 184, 194, 290, 323, 368, 524, 530, 555, 693, 709, 736, 753, 785, 993, 1036, 1047, 1067, 1107, 1187, + 1218, 1320, 1358, 1367, 1379, 1418, 1450, 1545, 1554, 1867, 1874, 1939, 1985, 15, 30, 45, 60, 90, 120, 165, 180, + 196, 240, 280, 292, 325, 330, 360, 385, 480, 546, 650, 660, 679, 682, 713, 720, 745, 801, 899, 960, 977, 1041, + 1289, 1361, 1426, 1472, 1541, 1570, 1703, 1798, 1953, 29, 58, 89, 116, 166, 200, 232, 326, 329, 386, 464, 535, 536, + 548, 578, 595, 646, 835, 901, 928, 1048, 1057, 1070, 1190, 1285, 1300, 1368, 1382, 1440, 1475, 1559, 1579, 1602, + 1619, 1670, 1802, 1879, 1891, 1920, 27, 57, 177, 304, 388, 527, 557, 580, 691, 725, 837, 905, 937, 1039, 1054, + 1089, 1114, 1292, 1303, 1323, 1374, 1443, 1553, 1674, 1706, 1715, 1801, 1810, 1856, 1873, 1991, 2018, 2035, 53, + 106, 113, 178, 212, 332, 341, 353, 392, 424, 
541, 560, 584, 601, 652, 658, 684, 770, 841, 848, 913, 1060, 1082, + 1096, 1153, 1202, 1297, 1402, 1478, 1522, 1569, 1673, 1682, 1705, 1797, 1826, 1959, 1995, 2002, 2027, 39, 54, 75, + 105, 114, 225, 342, 354, 400, 539, 569, 739, 772, 1051, 1063, 1078, 1092, 1138, 1160, 1192, 1304, 1313, 1326, 1371, + 1384, 1398, 1446, 1482, 1514, 1551, 1601, 1669, 1696, 1763, 1815, 1835, 1926, 71, 139, 149, 171, 209, 226, 298, + 356, 449, 565, 596, 608, 625, 663, 664, 696, 706, 723, 741, 776, 853, 865, 963, 1072, 1095, 1130, 1156, 1250, 1295, + 1310, 1353, 1392, 1687, 1730, 1747, 1809, 1862, 1930, 1962, 1971, 2007, 2017, 51, 78, 108, 135, 150, 210, 228, 267, + 277, 297, 339, 348, 417, 450, 551, 554, 587, 617, 655, 687, 692, 708, 752, 784, 931, 965, 992, 1009, 1075, 1099, + 1159, 1174, 1234, 1316, 1338, 1349, 1395, 1409, 1458, 1494, 1504, 1544, 1563, 1575, 1681, 1825, 1866, 1883, 1929, + 1938, 1961, 1984, 2001, 77, 142, 174, 263, 278, 346, 376, 418, 452, 496, 583, 669, 678, 701, 712, 729, 744, 761, + 800, 898, 933, 969, 976, 1001, 1065, 1108, 1120, 1163, 1168, 1195, 1307, 1319, 1334, 1356, 1387, 1416, 1448, 1488, + 1507, 1540, 1607, 1702, 1807, 1865, 1925, 1952, 87, 99, 141, 147, 156, 173, 188, 216, 248, 270, 300, 345, 372, 420, + 456, 488, 534, 563, 594, 667, 699, 757, 779, 789, 809, 834, 851, 900, 1102, 1111, 1123, 1171, 1328, 1412, 1491, + 1558, 1578, 1587, 1611, 1618, 1679, 1711, 1729, 1861, 1878, 1890, 1907, 1943, 2023, 94, 101, 124, 154, 186, 244, + 269, 275, 284, 526, 556, 589, 690, 724, 775, 836, 904, 936, 945, 981, 1045, 1068, 1105, 1166, 1185, 1198, 1216, + 1331, 1365, 1377, 1390, 1415, 1430, 1510, 1552, 1577, 1714, 1800, 1819, 1831, 1872, 1899, 1937, 1990, 2034, 47, 62, + 93, 102, 122, 153, 185, 195, 282, 291, 312, 362, 369, 432, 468, 540, 599, 600, 611, 715, 747, 840, 857, 912, 1037, + 1052, 1112, 1126, 1219, 1321, 1359, 1372, 1419, 1424, 1451, 1568, 1623, 1635, 1672, 1691, 1701, 1704, 1723, 1796, + 1958, 1994, 2011, 2026, 2043, 31, 61, 91, 121, 181, 197, 202, 234, 241, 281, 293, 308, 331, 361, 370, 481, 538, + 568, 613, 695, 711, 738, 755, 781, 787, 995, 1080, 1118, 1178, 1188, 1210, 1380, 1400, 1427, 1473, 1498, 1530, + 1550, 1557, 1600, 1617, 1668, 1719, 1735, 1762, 1779, 1814, 1834, 1843, 1877, 1889, 1935, 1967, 1993, 2025, 2039, + 59, 117, 167, 182, 198, 201, 233, 242, 294, 327, 387, 465, 482, 559, 564, 605, 624, 662, 722, 740, 803, 852, 864, + 881, 907, 917, 939, 962, 979, 997, 1049, 1071, 1086, 1146, 1191, 1206, 1222, 1266, 1301, 1324, 1369, 1383, 1406, + 1422, 1441, 1454, 1480, 1512, 1526, 1549, 1686, 1713, 1739, 1746, 1771, 1808, 1833, 1871, 1970, 1989, 2006, 2016, + 2033, 118, 305, 334, 364, 389, 394, 404, 426, 466, 484, 543, 550, 573, 586, 603, 616, 633, 654, 686, 717, 749, 793, + 805, 843, 873, 903, 930, 964, 1008, 1055, 1115, 1128, 1142, 1200, 1226, 1258, 1293, 1308, 1375, 1476, 1520, 1562, + 1574, 1680, 1824 }; + diff --git a/src/erasure-code/jerasure/jerasure/src/galois.c b/src/erasure-code/jerasure/jerasure/src/galois.c new file mode 100644 index 000000000000..398a64944f79 --- /dev/null +++ b/src/erasure-code/jerasure/jerasure/src/galois.c @@ -0,0 +1,353 @@ +/* * + * Copyright (c) 2014, James S. Plank and Kevin Greenan + * All rights reserved. 
+ * + * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure + * Coding Techniques + * + * Revision 2.0: Galois Field backend now links to GF-Complete + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* Jerasure's authors: + + Revision 2.x - 2014: James S. Plank and Kevin M. Greenan + Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman. + Revision 1.0 - 2007: James S. 
Plank + */ + +#include +#include +#include + +#include "galois.h" + +#define MAX_GF_INSTANCES 64 +gf_t *gfp_array[MAX_GF_INSTANCES] = { 0 }; +int gfp_is_composite[MAX_GF_INSTANCES] = { 0 }; + +gf_t *galois_get_field_ptr(int w) +{ + if (gfp_array[w] != NULL) { + return gfp_array[w]; + } + + return NULL; +} + +gf_t* galois_init_field(int w, + int mult_type, + int region_type, + int divide_type, + uint64_t prim_poly, + int arg1, + int arg2) +{ + int scratch_size; + void *scratch_memory; + gf_t *gfp; + + if (w <= 0 || w > 32) { + fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w); + exit(1); + } + + gfp = (gf_t *) malloc(sizeof(gf_t)); + if (!gfp) { + fprintf(stderr, "ERROR -- cannot allocate memory for Galois field w=%d\n", w); + exit(1); + } + + scratch_size = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); + if (!scratch_size) { + fprintf(stderr, "ERROR -- cannot get scratch size for base field w=%d\n", w); + exit(1); + } + + scratch_memory = malloc(scratch_size); + if (!scratch_memory) { + fprintf(stderr, "ERROR -- cannot get scratch memory for base field w=%d\n", w); + exit(1); + } + + if(!gf_init_hard(gfp, + w, + mult_type, + region_type, + divide_type, + prim_poly, + arg1, + arg2, + NULL, + scratch_memory)) + { + fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w); + exit(1); + } + + gfp_is_composite[w] = 0; + return gfp; +} + +gf_t* galois_init_composite_field(int w, + int region_type, + int divide_type, + int degree, + gf_t* base_gf) +{ + int scratch_size; + void *scratch_memory; + gf_t *gfp; + + if (w <= 0 || w > 32) { + fprintf(stderr, "ERROR -- cannot init composite field for w=%d\n", w); + exit(1); + } + + gfp = (gf_t *) malloc(sizeof(gf_t)); + if (!gfp) { + fprintf(stderr, "ERROR -- cannot allocate memory for Galois field w=%d\n", w); + exit(1); + } + + scratch_size = gf_scratch_size(w, GF_MULT_COMPOSITE, region_type, divide_type, degree, 0); + if (!scratch_size) { + fprintf(stderr, "ERROR -- cannot get scratch size for composite field w=%d\n", w); + exit(1); + } + + scratch_memory = malloc(scratch_size); + if (!scratch_memory) { + fprintf(stderr, "ERROR -- cannot get scratch memory for composite field w=%d\n", w); + exit(1); + } + + if(!gf_init_hard(gfp, + w, + GF_MULT_COMPOSITE, + region_type, + divide_type, + 0, + degree, + 0, + base_gf, + scratch_memory)) + { + fprintf(stderr, "ERROR -- cannot init default composite field for w=%d\n", w); + exit(1); + } + gfp_is_composite[w] = 1; + return gfp; +} + +static void galois_init_default_field(int w) +{ + if (w <= 0 || w > 32) { + fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w); + exit(1); + } + + if (gfp_array[w] == NULL) { + gfp_array[w] = (gf_t*)malloc(sizeof(gf_t)); + if (gfp_array[w] == NULL) { + fprintf(stderr, "ERROR -- cannot allocate memory for Galois field w=%d\n", w); + exit(1); + } + } + + if (!gf_init_easy(gfp_array[w], w)) { + fprintf(stderr, "ERROR -- cannot init default Galois field for w=%d\n", w); + exit(1); + } +} + + +static int is_valid_gf(gf_t *gf, int w) +{ + // TODO: I assume we may eventually + // want to do w=64 and 128, so w + // will be needed to perform this check + (void)w; + + if (gf == NULL) { + return 0; + } + if (gf->multiply.w32 == NULL) { + return 0; + } + if (gf->multiply_region.w32 == NULL) { + return 0; + } + if (gf->divide.w32 == NULL) { + return 0; + } + if (gf->inverse.w32 == NULL) { + return 0; + } + if (gf->extract_word.w32 == NULL) { + return 0; + } + + return 1; +} + +void 
galois_change_technique(gf_t *gf, int w) +{ + if (w <= 0 || w > 32) { + fprintf(stderr, "ERROR -- cannot support Galois field for w=%d\n", w); + exit(1); + } + + if (!is_valid_gf(gf, w)) { + fprintf(stderr, "ERROR -- overriding with invalid Galois field for w=%d\n", w); + exit(1); + } + + if (gfp_array[w] != NULL) { + gf_free(gfp_array[w], gfp_is_composite[w]); + } + + gfp_array[w] = gf; +} + +int galois_single_multiply(int x, int y, int w) +{ + if (x == 0 || y == 0) return 0; + + if (gfp_array[w] == NULL) { + galois_init_default_field(w); + } + + if (w <= 32) { + return gfp_array[w]->multiply.w32(gfp_array[w], x, y); + } else { + fprintf(stderr, "ERROR -- Galois field not implemented for w=%d\n", w); + return 0; + } +} + +int galois_single_divide(int x, int y, int w) +{ + if (x == 0) return 0; + if (y == 0) return -1; + + if (gfp_array[w] == NULL) { + galois_init_default_field(w); + } + + if (w <= 32) { + return gfp_array[w]->divide.w32(gfp_array[w], x, y); + } else { + fprintf(stderr, "ERROR -- Galois field not implemented for w=%d\n", w); + return 0; + } +} + +void galois_w08_region_multiply(char *region, /* Region to multiply */ + int multby, /* Number to multiply by */ + int nbytes, /* Number of bytes in region */ + char *r2, /* If r2 != NULL, products go here */ + int add) +{ + if (gfp_array[8] == NULL) { + galois_init_default_field(8); + } + gfp_array[8]->multiply_region.w32(gfp_array[8], region, r2, multby, nbytes, add); +} + +void galois_w16_region_multiply(char *region, /* Region to multiply */ + int multby, /* Number to multiply by */ + int nbytes, /* Number of bytes in region */ + char *r2, /* If r2 != NULL, products go here */ + int add) +{ + if (gfp_array[16] == NULL) { + galois_init_default_field(16); + } + gfp_array[16]->multiply_region.w32(gfp_array[16], region, r2, multby, nbytes, add); +} + + +void galois_w32_region_multiply(char *region, /* Region to multiply */ + int multby, /* Number to multiply by */ + int nbytes, /* Number of bytes in region */ + char *r2, /* If r2 != NULL, products go here */ + int add) +{ + if (gfp_array[32] == NULL) { + galois_init_default_field(32); + } + gfp_array[32]->multiply_region.w32(gfp_array[32], region, r2, multby, nbytes, add); +} + +void galois_w8_region_xor(void *src, void *dest, int nbytes) +{ + if (gfp_array[8] == NULL) { + galois_init_default_field(8); + } + gfp_array[8]->multiply_region.w32(gfp_array[32], src, dest, 1, nbytes, 1); +} + +void galois_w16_region_xor(void *src, void *dest, int nbytes) +{ + if (gfp_array[16] == NULL) { + galois_init_default_field(16); + } + gfp_array[16]->multiply_region.w32(gfp_array[16], src, dest, 1, nbytes, 1); +} + +void galois_w32_region_xor(void *src, void *dest, int nbytes) +{ + if (gfp_array[32] == NULL) { + galois_init_default_field(32); + } + gfp_array[32]->multiply_region.w32(gfp_array[32], src, dest, 1, nbytes, 1); +} + +void galois_region_xor(char *src, char *dest, int nbytes) +{ + if (nbytes >= 16) { + galois_w32_region_xor(src, dest, nbytes); + } else { + int i = 0; + for (i = 0; i < nbytes; i++) { + *dest ^= *src; + dest++; + src++; + } + } +} + +int galois_inverse(int y, int w) +{ + if (y == 0) return -1; + return galois_single_divide(1, y, w); +} diff --git a/src/erasure-code/jerasure/jerasure/src/jerasure.c b/src/erasure-code/jerasure/jerasure/src/jerasure.c new file mode 100644 index 000000000000..571b156e7264 --- /dev/null +++ b/src/erasure-code/jerasure/jerasure/src/jerasure.c @@ -0,0 +1,1387 @@ +/* * + * Copyright (c) 2014, James S. 
Plank and Kevin Greenan + * All rights reserved. + * + * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure + * Coding Techniques + * + * Revision 2.0: Galois Field backend now links to GF-Complete + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* Jerasure's authors: + + Revision 2.x - 2014: James S. Plank and Kevin M. Greenan + Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman. + Revision 1.0 - 2007: James S. 
Plank + */ + +#include +#include +#include + +#include "galois.h" +#include "jerasure.h" + +#define talloc(type, num) (type *) malloc(sizeof(type)*(num)) + +static double jerasure_total_xor_bytes = 0; +static double jerasure_total_gf_bytes = 0; +static double jerasure_total_memcpy_bytes = 0; + +void jerasure_print_matrix(int *m, int rows, int cols, int w) +{ + int i, j; + int fw; + char s[30]; + unsigned int w2; + + if (w == 32) { + fw = 10; + } else { + w2 = (1 << w); + sprintf(s, "%u", w2-1); + fw = strlen(s); + } + + for (i = 0; i < rows; i++) { + for (j = 0; j < cols; j++) { + if (j != 0) printf(" "); + printf("%*u", fw, m[i*cols+j]); + } + printf("\n"); + } +} + +void jerasure_print_bitmatrix(int *m, int rows, int cols, int w) +{ + int i, j; + + for (i = 0; i < rows; i++) { + if (i != 0 && i%w == 0) printf("\n"); + for (j = 0; j < cols; j++) { + if (j != 0 && j%w == 0) printf(" "); + printf("%d", m[i*cols+j]); + } + printf("\n"); + } +} + +int jerasure_make_decoding_matrix(int k, int m, int w, int *matrix, int *erased, int *decoding_matrix, int *dm_ids) +{ + int i, j, *tmpmat; + + j = 0; + for (i = 0; j < k; i++) { + if (erased[i] == 0) { + dm_ids[j] = i; + j++; + } + } + + tmpmat = talloc(int, k*k); + if (tmpmat == NULL) { return -1; } + for (i = 0; i < k; i++) { + if (dm_ids[i] < k) { + for (j = 0; j < k; j++) tmpmat[i*k+j] = 0; + tmpmat[i*k+dm_ids[i]] = 1; + } else { + for (j = 0; j < k; j++) { + tmpmat[i*k+j] = matrix[(dm_ids[i]-k)*k+j]; + } + } + } + + i = jerasure_invert_matrix(tmpmat, decoding_matrix, k, w); + free(tmpmat); + return i; +} + +/* Internal Routine */ +int jerasure_make_decoding_bitmatrix(int k, int m, int w, int *matrix, int *erased, int *decoding_matrix, int *dm_ids) +{ + int i, j, *tmpmat; + int index, mindex; + + j = 0; + for (i = 0; j < k; i++) { + if (erased[i] == 0) { + dm_ids[j] = i; + j++; + } + } + + tmpmat = talloc(int, k*k*w*w); + if (tmpmat == NULL) { return -1; } + for (i = 0; i < k; i++) { + if (dm_ids[i] < k) { + index = i*k*w*w; + for (j = 0; j < k*w*w; j++) tmpmat[index+j] = 0; + index = i*k*w*w+dm_ids[i]*w; + for (j = 0; j < w; j++) { + tmpmat[index] = 1; + index += (k*w+1); + } + } else { + index = i*k*w*w; + mindex = (dm_ids[i]-k)*k*w*w; + for (j = 0; j < k*w*w; j++) { + tmpmat[index+j] = matrix[mindex+j]; + } + } + } + + i = jerasure_invert_bitmatrix(tmpmat, decoding_matrix, k*w); + free(tmpmat); + return i; +} + +int jerasure_matrix_decode(int k, int m, int w, int *matrix, int row_k_ones, int *erasures, + char **data_ptrs, char **coding_ptrs, int size) +{ + int i, edd, lastdrive; + int *tmpids; + int *erased, *decoding_matrix, *dm_ids; + + if (w != 8 && w != 16 && w != 32) return -1; + + erased = jerasure_erasures_to_erased(k, m, erasures); + if (erased == NULL) return -1; + + /* Find the number of data drives failed */ + + lastdrive = k; + + edd = 0; + for (i = 0; i < k; i++) { + if (erased[i]) { + edd++; + lastdrive = i; + } + } + + /* You only need to create the decoding matrix in the following cases: + + 1. edd > 0 and row_k_ones is false. + 2. edd > 0 and row_k_ones is true and coding device 0 has been erased. + 3. edd > 1 + + We're going to use lastdrive to denote when to stop decoding data. + At this point in the code, it is equal to the last erased data device. + However, if we can't use the parity row to decode it (i.e. row_k_ones=0 + or erased[k] = 1, we're going to set it to k so that the decoding + pass will decode all data. 
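+
+ As a concrete illustration (the numbers are for exposition only): with
+ k = 4, m = 2, row_k_ones = 1 and erasures = { 1, 3, -1 }, both failures
+ are data devices and coding device 0 survives, so edd = 2 and
+ lastdrive = 3. Data device 1 is rebuilt with the inverted decoding
+ matrix, and data device 3 is then rebuilt from data devices 0, 1, 2 plus
+ coding device 0 using the all-ones parity row, which needs only XORs
+ instead of another general dot product.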
+ */ + + if (!row_k_ones || erased[k]) lastdrive = k; + + dm_ids = NULL; + decoding_matrix = NULL; + + if (edd > 1 || (edd > 0 && (!row_k_ones || erased[k]))) { + dm_ids = talloc(int, k); + if (dm_ids == NULL) { + free(erased); + return -1; + } + + decoding_matrix = talloc(int, k*k); + if (decoding_matrix == NULL) { + free(erased); + free(dm_ids); + return -1; + } + + if (jerasure_make_decoding_matrix(k, m, w, matrix, erased, decoding_matrix, dm_ids) < 0) { + free(erased); + free(dm_ids); + free(decoding_matrix); + return -1; + } + } + + /* Decode the data drives. + If row_k_ones is true and coding device 0 is intact, then only decode edd-1 drives. + This is done by stopping at lastdrive. + We test whether edd > 0 so that we can exit the loop early if we're done. + */ + + for (i = 0; edd > 0 && i < lastdrive; i++) { + if (erased[i]) { + jerasure_matrix_dotprod(k, w, decoding_matrix+(i*k), dm_ids, i, data_ptrs, coding_ptrs, size); + edd--; + } + } + + /* Then if necessary, decode drive lastdrive */ + + if (edd > 0) { + tmpids = talloc(int, k); + for (i = 0; i < k; i++) { + tmpids[i] = (i < lastdrive) ? i : i+1; + } + jerasure_matrix_dotprod(k, w, matrix, tmpids, lastdrive, data_ptrs, coding_ptrs, size); + free(tmpids); + } + + /* Finally, re-encode any erased coding devices */ + + for (i = 0; i < m; i++) { + if (erased[k+i]) { + jerasure_matrix_dotprod(k, w, matrix+(i*k), NULL, i+k, data_ptrs, coding_ptrs, size); + } + } + + free(erased); + if (dm_ids != NULL) free(dm_ids); + if (decoding_matrix != NULL) free(decoding_matrix); + + return 0; +} + + +int *jerasure_matrix_to_bitmatrix(int k, int m, int w, int *matrix) +{ + int *bitmatrix; + int rowelts, rowindex, colindex, elt, i, j, l, x; + + bitmatrix = talloc(int, k*m*w*w); + if (matrix == NULL) { return NULL; } + + rowelts = k * w; + rowindex = 0; + + for (i = 0; i < m; i++) { + colindex = rowindex; + for (j = 0; j < k; j++) { + elt = matrix[i*k+j]; + for (x = 0; x < w; x++) { + for (l = 0; l < w; l++) { + bitmatrix[colindex+x+l*rowelts] = ((elt & (1 << l)) ? 1 : 0); + } + elt = galois_single_multiply(elt, 2, w); + } + colindex += w; + } + rowindex += rowelts * w; + } + return bitmatrix; +} + +void jerasure_matrix_encode(int k, int m, int w, int *matrix, + char **data_ptrs, char **coding_ptrs, int size) +{ + int i; + + if (w != 8 && w != 16 && w != 32) { + fprintf(stderr, "ERROR: jerasure_matrix_encode() and w is not 8, 16 or 32\n"); + exit(1); + } + + for (i = 0; i < m; i++) { + jerasure_matrix_dotprod(k, w, matrix+(i*k), NULL, k+i, data_ptrs, coding_ptrs, size); + } +} + +void jerasure_bitmatrix_dotprod(int k, int w, int *bitmatrix_row, + int *src_ids, int dest_id, + char **data_ptrs, char **coding_ptrs, int size, int packetsize) +{ + int j, sindex, pstarted, index, x, y; + char *dptr, *pptr, *bdptr, *bpptr; + + if (size%(w*packetsize) != 0) { + fprintf(stderr, "jerasure_bitmatrix_dotprod - size%c(w*packetsize)) must = 0\n", '%'); + exit(1); + } + + bpptr = (dest_id < k) ? 
data_ptrs[dest_id] : coding_ptrs[dest_id-k]; + + for (sindex = 0; sindex < size; sindex += (packetsize*w)) { + index = 0; + for (j = 0; j < w; j++) { + pstarted = 0; + pptr = bpptr + sindex + j*packetsize; + for (x = 0; x < k; x++) { + if (src_ids == NULL) { + bdptr = data_ptrs[x]; + } else if (src_ids[x] < k) { + bdptr = data_ptrs[src_ids[x]]; + } else { + bdptr = coding_ptrs[src_ids[x]-k]; + } + for (y = 0; y < w; y++) { + if (bitmatrix_row[index]) { + dptr = bdptr + sindex + y*packetsize; + if (!pstarted) { + memcpy(pptr, dptr, packetsize); + jerasure_total_memcpy_bytes += packetsize; + pstarted = 1; + } else { + galois_region_xor(dptr, pptr, packetsize); + jerasure_total_xor_bytes += packetsize; + } + } + index++; + } + } + } + } +} + +void jerasure_do_parity(int k, char **data_ptrs, char *parity_ptr, int size) +{ + int i; + + memcpy(parity_ptr, data_ptrs[0], size); + jerasure_total_memcpy_bytes += size; + + for (i = 1; i < k; i++) { + galois_region_xor(data_ptrs[i], parity_ptr, size); + jerasure_total_xor_bytes += size; + } +} + +int jerasure_invert_matrix(int *mat, int *inv, int rows, int w) +{ + int cols, i, j, k, x, rs2; + int row_start, tmp, inverse; + + cols = rows; + + k = 0; + for (i = 0; i < rows; i++) { + for (j = 0; j < cols; j++) { + inv[k] = (i == j) ? 1 : 0; + k++; + } + } + + /* First -- convert into upper triangular */ + for (i = 0; i < cols; i++) { + row_start = cols*i; + + /* Swap rows if we ave a zero i,i element. If we can't swap, then the + matrix was not invertible */ + + if (mat[row_start+i] == 0) { + for (j = i+1; j < rows && mat[cols*j+i] == 0; j++) ; + if (j == rows) return -1; + rs2 = j*cols; + for (k = 0; k < cols; k++) { + tmp = mat[row_start+k]; + mat[row_start+k] = mat[rs2+k]; + mat[rs2+k] = tmp; + tmp = inv[row_start+k]; + inv[row_start+k] = inv[rs2+k]; + inv[rs2+k] = tmp; + } + } + + /* Multiply the row by 1/element i,i */ + tmp = mat[row_start+i]; + if (tmp != 1) { + inverse = galois_single_divide(1, tmp, w); + for (j = 0; j < cols; j++) { + mat[row_start+j] = galois_single_multiply(mat[row_start+j], inverse, w); + inv[row_start+j] = galois_single_multiply(inv[row_start+j], inverse, w); + } + } + + /* Now for each j>i, add A_ji*Ai to Aj */ + k = row_start+i; + for (j = i+1; j != cols; j++) { + k += cols; + if (mat[k] != 0) { + if (mat[k] == 1) { + rs2 = cols*j; + for (x = 0; x < cols; x++) { + mat[rs2+x] ^= mat[row_start+x]; + inv[rs2+x] ^= inv[row_start+x]; + } + } else { + tmp = mat[k]; + rs2 = cols*j; + for (x = 0; x < cols; x++) { + mat[rs2+x] ^= galois_single_multiply(tmp, mat[row_start+x], w); + inv[rs2+x] ^= galois_single_multiply(tmp, inv[row_start+x], w); + } + } + } + } + } + + /* Now the matrix is upper triangular. Start at the top and multiply down */ + + for (i = rows-1; i >= 0; i--) { + row_start = i*cols; + for (j = 0; j < i; j++) { + rs2 = j*cols; + if (mat[rs2+i] != 0) { + tmp = mat[rs2+i]; + mat[rs2+i] = 0; + for (k = 0; k < cols; k++) { + inv[rs2+k] ^= galois_single_multiply(tmp, inv[row_start+k], w); + } + } + } + } + return 0; +} + +int jerasure_invertible_matrix(int *mat, int rows, int w) +{ + int cols, i, j, k, x, rs2; + int row_start, tmp, inverse; + + cols = rows; + + /* First -- convert into upper triangular */ + for (i = 0; i < cols; i++) { + row_start = cols*i; + + /* Swap rows if we ave a zero i,i element. 
If we can't swap, then the + matrix was not invertible */ + + if (mat[row_start+i] == 0) { + for (j = i+1; j < rows && mat[cols*j+i] == 0; j++) ; + if (j == rows) return 0; + rs2 = j*cols; + for (k = 0; k < cols; k++) { + tmp = mat[row_start+k]; + mat[row_start+k] = mat[rs2+k]; + mat[rs2+k] = tmp; + } + } + + /* Multiply the row by 1/element i,i */ + tmp = mat[row_start+i]; + if (tmp != 1) { + inverse = galois_single_divide(1, tmp, w); + for (j = 0; j < cols; j++) { + mat[row_start+j] = galois_single_multiply(mat[row_start+j], inverse, w); + } + } + + /* Now for each j>i, add A_ji*Ai to Aj */ + k = row_start+i; + for (j = i+1; j != cols; j++) { + k += cols; + if (mat[k] != 0) { + if (mat[k] == 1) { + rs2 = cols*j; + for (x = 0; x < cols; x++) { + mat[rs2+x] ^= mat[row_start+x]; + } + } else { + tmp = mat[k]; + rs2 = cols*j; + for (x = 0; x < cols; x++) { + mat[rs2+x] ^= galois_single_multiply(tmp, mat[row_start+x], w); + } + } + } + } + } + return 1; +} + +/* Converts a list-style version of the erasures into an array of k+m elements + where the element = 1 if the index has been erased, and zero otherwise */ + +int *jerasure_erasures_to_erased(int k, int m, int *erasures) +{ + int td; + int t_non_erased; + int *erased; + int i; + + td = k+m; + erased = talloc(int, td); + if (erased == NULL) return NULL; + t_non_erased = td; + + for (i = 0; i < td; i++) erased[i] = 0; + + for (i = 0; erasures[i] != -1; i++) { + if (erased[erasures[i]] == 0) { + erased[erasures[i]] = 1; + t_non_erased--; + if (t_non_erased < k) { + free(erased); + return NULL; + } + } + } + return erased; +} + +void jerasure_free_schedule(int **schedule) +{ + int i; + + for (i = 0; schedule[i][0] >= 0; i++) free(schedule[i]); + free(schedule[i]); + free(schedule); +} + +void jerasure_free_schedule_cache(int k, int m, int ***cache) +{ + int e1, e2; + + if (m != 2) { + fprintf(stderr, "jerasure_free_schedule_cache(): m must equal 2\n"); + exit(1); + } + + for (e1 = 0; e1 < k+m; e1++) { + for (e2 = 0; e2 < e1; e2++) { + jerasure_free_schedule(cache[e1*(k+m)+e2]); + } + jerasure_free_schedule(cache[e1*(k+m)+e1]); + } + free(cache); +} + +void jerasure_matrix_dotprod(int k, int w, int *matrix_row, + int *src_ids, int dest_id, + char **data_ptrs, char **coding_ptrs, int size) +{ + int init; + char *dptr, *sptr; + int i; + + if (w != 1 && w != 8 && w != 16 && w != 32) { + fprintf(stderr, "ERROR: jerasure_matrix_dotprod() called and w is not 1, 8, 16 or 32\n"); + exit(1); + } + + init = 0; + + dptr = (dest_id < k) ? 
data_ptrs[dest_id] : coding_ptrs[dest_id-k]; + + /* First copy or xor any data that does not need to be multiplied by a factor */ + + for (i = 0; i < k; i++) { + if (matrix_row[i] == 1) { + if (src_ids == NULL) { + sptr = data_ptrs[i]; + } else if (src_ids[i] < k) { + sptr = data_ptrs[src_ids[i]]; + } else { + sptr = coding_ptrs[src_ids[i]-k]; + } + if (init == 0) { + memcpy(dptr, sptr, size); + jerasure_total_memcpy_bytes += size; + init = 1; + } else { + galois_region_xor(sptr, dptr, size); + jerasure_total_xor_bytes += size; + } + } + } + + /* Now do the data that needs to be multiplied by a factor */ + + for (i = 0; i < k; i++) { + if (matrix_row[i] != 0 && matrix_row[i] != 1) { + if (src_ids == NULL) { + sptr = data_ptrs[i]; + } else if (src_ids[i] < k) { + sptr = data_ptrs[src_ids[i]]; + } else { + sptr = coding_ptrs[src_ids[i]-k]; + } + switch (w) { + case 8: galois_w08_region_multiply(sptr, matrix_row[i], size, dptr, init); break; + case 16: galois_w16_region_multiply(sptr, matrix_row[i], size, dptr, init); break; + case 32: galois_w32_region_multiply(sptr, matrix_row[i], size, dptr, init); break; + } + jerasure_total_gf_bytes += size; + init = 1; + } + } +} + + +int jerasure_bitmatrix_decode(int k, int m, int w, int *bitmatrix, int row_k_ones, int *erasures, + char **data_ptrs, char **coding_ptrs, int size, int packetsize) +{ + int i; + int *erased; + int *decoding_matrix; + int *dm_ids; + int edd, *tmpids, lastdrive; + + erased = jerasure_erasures_to_erased(k, m, erasures); + if (erased == NULL) return -1; + + /* See jerasure_matrix_decode for the logic of this routine. This one works just like + it, but calls the bitmatrix ops instead */ + + lastdrive = k; + + edd = 0; + for (i = 0; i < k; i++) { + if (erased[i]) { + edd++; + lastdrive = i; + } + } + + if (row_k_ones != 1 || erased[k]) lastdrive = k; + + dm_ids = NULL; + decoding_matrix = NULL; + + if (edd > 1 || (edd > 0 && (row_k_ones != 1 || erased[k]))) { + + dm_ids = talloc(int, k); + if (dm_ids == NULL) { + free(erased); + return -1; + } + + decoding_matrix = talloc(int, k*k*w*w); + if (decoding_matrix == NULL) { + free(erased); + free(dm_ids); + return -1; + } + + if (jerasure_make_decoding_bitmatrix(k, m, w, bitmatrix, erased, decoding_matrix, dm_ids) < 0) { + free(erased); + free(dm_ids); + free(decoding_matrix); + return -1; + } + } + + for (i = 0; edd > 0 && i < lastdrive; i++) { + if (erased[i]) { + jerasure_bitmatrix_dotprod(k, w, decoding_matrix+i*k*w*w, dm_ids, i, data_ptrs, coding_ptrs, size, packetsize); + edd--; + } + } + + if (edd > 0) { + tmpids = talloc(int, k); + for (i = 0; i < k; i++) { + tmpids[i] = (i < lastdrive) ? i : i+1; + } + jerasure_bitmatrix_dotprod(k, w, bitmatrix, tmpids, lastdrive, data_ptrs, coding_ptrs, size, packetsize); + free(tmpids); + } + + for (i = 0; i < m; i++) { + if (erased[k+i]) { + jerasure_bitmatrix_dotprod(k, w, bitmatrix+i*k*w*w, NULL, k+i, data_ptrs, coding_ptrs, size, packetsize); + } + } + + free(erased); + if (dm_ids != NULL) free(dm_ids); + if (decoding_matrix != NULL) free(decoding_matrix); + + return 0; +} + +static char **set_up_ptrs_for_scheduled_decoding(int k, int m, int *erasures, char **data_ptrs, char **coding_ptrs) +{ + int ddf, cdf; + int *erased; + char **ptrs; + int i, j, x; + + ddf = 0; + cdf = 0; + for (i = 0; erasures[i] != -1; i++) { + if (erasures[i] < k) ddf++; else cdf++; + } + + erased = jerasure_erasures_to_erased(k, m, erasures); + if (erased == NULL) return NULL; + + /* Set up ptrs. 
It will be as follows: + + - If data drive i has not failed, then ptrs[i] = data_ptrs[i]. + - If data drive i has failed, then ptrs[i] = coding_ptrs[j], where j is the + lowest unused non-failed coding drive. + - Elements k to k+ddf-1 are data_ptrs[] of the failed data drives. + - Elements k+ddf to k+ddf+cdf-1 are coding_ptrs[] of the failed data drives. + + The array row_ids contains the ids of ptrs. + The array ind_to_row_ids contains the row_id of drive i. + + However, we're going to set row_ids and ind_to_row in a different procedure. + */ + + ptrs = talloc(char *, k+m); + + j = k; + x = k; + for (i = 0; i < k; i++) { + if (erased[i] == 0) { + ptrs[i] = data_ptrs[i]; + } else { + while (erased[j]) j++; + ptrs[i] = coding_ptrs[j-k]; + j++; + ptrs[x] = data_ptrs[i]; + x++; + } + } + for (i = k; i < k+m; i++) { + if (erased[i]) { + ptrs[x] = coding_ptrs[i-k]; + x++; + } + } + free(erased); + return ptrs; +} + +static int set_up_ids_for_scheduled_decoding(int k, int m, int *erasures, int *row_ids, int *ind_to_row) +{ + int ddf, cdf; + int *erased; + int i, j, x; + + ddf = 0; + cdf = 0; + for (i = 0; erasures[i] != -1; i++) { + if (erasures[i] < k) ddf++; else cdf++; + } + + erased = jerasure_erasures_to_erased(k, m, erasures); + if (erased == NULL) return -1; + + /* See set_up_ptrs_for_scheduled_decoding for how these are set */ + + j = k; + x = k; + for (i = 0; i < k; i++) { + if (erased[i] == 0) { + row_ids[i] = i; + ind_to_row[i] = i; + } else { + while (erased[j]) j++; + row_ids[i] = j; + ind_to_row[j] = i; + j++; + row_ids[x] = i; + ind_to_row[i] = x; + x++; + } + } + for (i = k; i < k+m; i++) { + if (erased[i]) { + row_ids[x] = i; + ind_to_row[i] = x; + x++; + } + } + free(erased); + return 0; +} + +static int **jerasure_generate_decoding_schedule(int k, int m, int w, int *bitmatrix, int *erasures, int smart) +{ + int i, j, x, drive, y, index, z; + int *decoding_matrix, *inverse, *real_decoding_matrix; + int *ptr; + int *row_ids; + int *ind_to_row; + int ddf, cdf; + int **schedule; + int *b1, *b2; + + /* First, figure out the number of data drives that have failed, and the + number of coding drives that have failed: ddf and cdf */ + + ddf = 0; + cdf = 0; + for (i = 0; erasures[i] != -1; i++) { + if (erasures[i] < k) ddf++; else cdf++; + } + + row_ids = talloc(int, k+m); + ind_to_row = talloc(int, k+m); + + if (set_up_ids_for_scheduled_decoding(k, m, erasures, row_ids, ind_to_row) < 0) return NULL; + + /* Now, we're going to create one decoding matrix which is going to + decode everything with one call. The hope is that the scheduler + will do a good job. 
This matrix has w*e rows, where e is the + number of erasures (ddf+cdf) */ + + real_decoding_matrix = talloc(int, k*w*(cdf+ddf)*w); + + /* First, if any data drives have failed, then initialize the first + ddf*w rows of the decoding matrix from the standard decoding + matrix inversion */ + + if (ddf > 0) { + + decoding_matrix = talloc(int, k*k*w*w); + ptr = decoding_matrix; + for (i = 0; i < k; i++) { + if (row_ids[i] == i) { + bzero(ptr, k*w*w*sizeof(int)); + for (x = 0; x < w; x++) { + ptr[x+i*w+x*k*w] = 1; + } + } else { + memcpy(ptr, bitmatrix+k*w*w*(row_ids[i]-k), k*w*w*sizeof(int)); + } + ptr += (k*w*w); + } + inverse = talloc(int, k*k*w*w); + jerasure_invert_bitmatrix(decoding_matrix, inverse, k*w); + +/* printf("\nMatrix to invert\n"); + jerasure_print_bitmatrix(decoding_matrix, k*w, k*w, w); + printf("\n"); + printf("\nInverse\n"); + jerasure_print_bitmatrix(inverse, k*w, k*w, w); + printf("\n"); */ + + free(decoding_matrix); + ptr = real_decoding_matrix; + for (i = 0; i < ddf; i++) { + memcpy(ptr, inverse+k*w*w*row_ids[k+i], sizeof(int)*k*w*w); + ptr += (k*w*w); + } + free(inverse); + } + + /* Next, here comes the hard part. For each coding node that needs + to be decoded, you start by putting its rows of the distribution + matrix into the decoding matrix. If there were no failed data + nodes, then you're done. However, if there have been failed + data nodes, then you need to modify the columns that correspond + to the data nodes. You do that by first zeroing them. Then + whereever there is a one in the distribution matrix, you XOR + in the corresponding row from the failed data node's entry in + the decoding matrix. The whole process kind of makes my head + spin, but it works. + */ + + for (x = 0; x < cdf; x++) { + drive = row_ids[x+ddf+k]-k; + ptr = real_decoding_matrix + k*w*w*(ddf+x); + memcpy(ptr, bitmatrix+drive*k*w*w, sizeof(int)*k*w*w); + + for (i = 0; i < k; i++) { + if (row_ids[i] != i) { + for (j = 0; j < w; j++) { + bzero(ptr+j*k*w+i*w, sizeof(int)*w); + } + } + } + + /* There's the yucky part */ + + index = drive*k*w*w; + for (i = 0; i < k; i++) { + if (row_ids[i] != i) { + b1 = real_decoding_matrix+(ind_to_row[i]-k)*k*w*w; + for (j = 0; j < w; j++) { + b2 = ptr + j*k*w; + for (y = 0; y < w; y++) { + if (bitmatrix[index+j*k*w+i*w+y]) { + for (z = 0; z < k*w; z++) { + b2[z] = b2[z] ^ b1[z+y*k*w]; + } + } + } + } + } + } + } + +/* + printf("\n\nReal Decoding Matrix\n\n"); + jerasure_print_bitmatrix(real_decoding_matrix, (ddf+cdf)*w, k*w, w); + printf("\n"); */ + if (smart) { + schedule = jerasure_smart_bitmatrix_to_schedule(k, ddf+cdf, w, real_decoding_matrix); + } else { + schedule = jerasure_dumb_bitmatrix_to_schedule(k, ddf+cdf, w, real_decoding_matrix); + } + free(row_ids); + free(ind_to_row); + free(real_decoding_matrix); + return schedule; +} + +int jerasure_schedule_decode_lazy(int k, int m, int w, int *bitmatrix, int *erasures, + char **data_ptrs, char **coding_ptrs, int size, int packetsize, + int smart) +{ + int i, tdone; + char **ptrs; + int **schedule; + + ptrs = set_up_ptrs_for_scheduled_decoding(k, m, erasures, data_ptrs, coding_ptrs); + if (ptrs == NULL) return -1; + + schedule = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart); + if (schedule == NULL) { + free(ptrs); + return -1; + } + + for (tdone = 0; tdone < size; tdone += packetsize*w) { + jerasure_do_scheduled_operations(ptrs, schedule, packetsize); + for (i = 0; i < k+m; i++) ptrs[i] += (packetsize*w); + } + + jerasure_free_schedule(schedule); + free(ptrs); + + return 0; +} 
+ +int jerasure_schedule_decode_cache(int k, int m, int w, int ***scache, int *erasures, + char **data_ptrs, char **coding_ptrs, int size, int packetsize) +{ + int i, tdone; + char **ptrs; + int **schedule; + int index; + + if (erasures[1] == -1) { + index = erasures[0]*(k+m) + erasures[0]; + } else if (erasures[2] == -1) { + index = erasures[0]*(k+m) + erasures[1]; + } else { + return -1; + } + + schedule = scache[index]; + + ptrs = set_up_ptrs_for_scheduled_decoding(k, m, erasures, data_ptrs, coding_ptrs); + if (ptrs == NULL) return -1; + + + for (tdone = 0; tdone < size; tdone += packetsize*w) { + jerasure_do_scheduled_operations(ptrs, schedule, packetsize); + for (i = 0; i < k+m; i++) ptrs[i] += (packetsize*w); + } + + free(ptrs); + + return 0; +} + +/* This only works when m = 2 */ + +int ***jerasure_generate_schedule_cache(int k, int m, int w, int *bitmatrix, int smart) +{ + int ***scache; + int erasures[3]; + int e1, e2; + + /* Ok -- this is yucky, but it's how I'm doing it. You will make an index out + of erasures, which will be e1*(k+m)+(e2). If there is no e2, then e2 = e1. + Isn't that clever and confusing. Sorry. + + We're not going to worry about ordering -- in other words, the schedule for + e1,e2 will be the same as e2,e1. They will have the same pointer -- the + schedule will not be duplicated. */ + + if (m != 2) return NULL; + + scache = talloc(int **, (k+m)*(k+m+1)); + if (scache == NULL) return NULL; + + for (e1 = 0; e1 < k+m; e1++) { + erasures[0] = e1; + for (e2 = 0; e2 < e1; e2++) { + erasures[1] = e2; + erasures[2] = -1; + scache[e1*(k+m)+e2] = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart); + scache[e2*(k+m)+e1] = scache[e1*(k+m)+e2]; + } + erasures[1] = -1; + scache[e1*(k+m)+e1] = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart); + } + return scache; + +} + +int jerasure_invert_bitmatrix(int *mat, int *inv, int rows) +{ + int cols, i, j, k; + int tmp; + + cols = rows; + + k = 0; + for (i = 0; i < rows; i++) { + for (j = 0; j < cols; j++) { + inv[k] = (i == j) ? 1 : 0; + k++; + } + } + + /* First -- convert into upper triangular */ + + for (i = 0; i < cols; i++) { + + /* Swap rows if we have a zero i,i element. If we can't swap, then the + matrix was not invertible */ + + if ((mat[i*cols+i]) == 0) { + for (j = i+1; j < rows && (mat[j*cols+i]) == 0; j++) ; + if (j == rows) return -1; + for (k = 0; k < cols; k++) { + tmp = mat[i*cols+k]; mat[i*cols+k] = mat[j*cols+k]; mat[j*cols+k] = tmp; + tmp = inv[i*cols+k]; inv[i*cols+k] = inv[j*cols+k]; inv[j*cols+k] = tmp; + } + } + + /* Now for each j>i, add A_ji*Ai to Aj */ + for (j = i+1; j != rows; j++) { + if (mat[j*cols+i] != 0) { + for (k = 0; k < cols; k++) { + mat[j*cols+k] ^= mat[i*cols+k]; + inv[j*cols+k] ^= inv[i*cols+k]; + } + } + } + } + + /* Now the matrix is upper triangular. Start at the top and multiply down */ + + for (i = rows-1; i >= 0; i--) { + for (j = 0; j < i; j++) { + if (mat[j*cols+i]) { + for (k = 0; k < cols; k++) { + mat[j*cols+k] ^= mat[i*cols+k]; + inv[j*cols+k] ^= inv[i*cols+k]; + } + } + } + } + return 0; +} + +int jerasure_invertible_bitmatrix(int *mat, int rows) +{ + int cols, i, j, k; + int tmp; + + cols = rows; + + /* First -- convert into upper triangular */ + + for (i = 0; i < cols; i++) { + + /* Swap rows if we have a zero i,i element. 
If we can't swap, then the + matrix was not invertible */ + + if ((mat[i*cols+i]) == 0) { + for (j = i+1; j < rows && (mat[j*cols+i]) == 0; j++) ; + if (j == rows) return 0; + for (k = 0; k < cols; k++) { + tmp = mat[i*cols+k]; mat[i*cols+k] = mat[j*cols+k]; mat[j*cols+k] = tmp; + } + } + + /* Now for each j>i, add A_ji*Ai to Aj */ + for (j = i+1; j != rows; j++) { + if (mat[j*cols+i] != 0) { + for (k = 0; k < cols; k++) { + mat[j*cols+k] ^= mat[i*cols+k]; + } + } + } + } + return 1; +} + + +int *jerasure_matrix_multiply(int *m1, int *m2, int r1, int c1, int r2, int c2, int w) +{ + int *product, i, j, k; + + product = (int *) malloc(sizeof(int)*r1*c2); + for (i = 0; i < r1*c2; i++) product[i] = 0; + + for (i = 0; i < r1; i++) { + for (j = 0; j < c2; j++) { + for (k = 0; k < r2; k++) { + product[i*c2+j] ^= galois_single_multiply(m1[i*c1+k], m2[k*c2+j], w); + } + } + } + return product; +} + +void jerasure_get_stats(double *fill_in) +{ + fill_in[0] = jerasure_total_xor_bytes; + fill_in[1] = jerasure_total_gf_bytes; + fill_in[2] = jerasure_total_memcpy_bytes; + jerasure_total_xor_bytes = 0; + jerasure_total_gf_bytes = 0; + jerasure_total_memcpy_bytes = 0; +} + +void jerasure_do_scheduled_operations(char **ptrs, int **operations, int packetsize) +{ + char *sptr; + char *dptr; + int op; + + for (op = 0; operations[op][0] >= 0; op++) { + sptr = ptrs[operations[op][0]] + operations[op][1]*packetsize; + dptr = ptrs[operations[op][2]] + operations[op][3]*packetsize; + if (operations[op][4]) { +/* printf("%d,%d %d,%d\n", operations[op][0], + operations[op][1], + operations[op][2], + operations[op][3]); + printf("xor(0x%x, 0x%x -> 0x%x, %d)\n", sptr, dptr, dptr, packetsize); */ + galois_region_xor(sptr, dptr, packetsize); + jerasure_total_xor_bytes += packetsize; + } else { +/* printf("memcpy(0x%x <- 0x%x)\n", dptr, sptr); */ + memcpy(dptr, sptr, packetsize); + jerasure_total_memcpy_bytes += packetsize; + } + } +} + +void jerasure_schedule_encode(int k, int m, int w, int **schedule, + char **data_ptrs, char **coding_ptrs, int size, int packetsize) +{ + char **ptr_copy; + int i, tdone; + + ptr_copy = talloc(char *, (k+m)); + for (i = 0; i < k; i++) ptr_copy[i] = data_ptrs[i]; + for (i = 0; i < m; i++) ptr_copy[i+k] = coding_ptrs[i]; + for (tdone = 0; tdone < size; tdone += packetsize*w) { + jerasure_do_scheduled_operations(ptr_copy, schedule, packetsize); + for (i = 0; i < k+m; i++) ptr_copy[i] += (packetsize*w); + } + free(ptr_copy); +} + +int **jerasure_dumb_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix) +{ + int **operations; + int op; + int index, optodo, i, j; + + operations = talloc(int *, k*m*w*w+1); + op = 0; + + index = 0; + for (i = 0; i < m*w; i++) { + optodo = 0; + for (j = 0; j < k*w; j++) { + if (bitmatrix[index]) { + operations[op] = talloc(int, 5); + operations[op][4] = optodo; + operations[op][0] = j/w; + operations[op][1] = j%w; + operations[op][2] = k+i/w; + operations[op][3] = i%w; + optodo = 1; + op++; + + } + index++; + } + } + operations[op] = talloc(int, 5); + operations[op][0] = -1; + return operations; +} + +int **jerasure_smart_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix) +{ + int **operations; + int op; + int i, j; + int *diff, *from, *b1, *flink, *blink; + int *ptr, no, row; + int optodo; + int bestrow, bestdiff, top; + +/* printf("Scheduling:\n\n"); + jerasure_print_bitmatrix(bitmatrix, m*w, k*w, w); */ + + operations = talloc(int *, k*m*w*w+1); + op = 0; + + diff = talloc(int, m*w); + from = talloc(int, m*w); + flink = talloc(int, m*w); + blink 
= talloc(int, m*w); + + ptr = bitmatrix; + + bestdiff = k*w+1; + top = 0; + for (i = 0; i < m*w; i++) { + no = 0; + for (j = 0; j < k*w; j++) { + no += *ptr; + ptr++; + } + diff[i] = no; + from[i] = -1; + flink[i] = i+1; + blink[i] = i-1; + if (no < bestdiff) { + bestdiff = no; + bestrow = i; + } + } + + flink[m*w-1] = -1; + + while (top != -1) { + row = bestrow; + /* printf("Doing row %d - %d from %d\n", row, diff[row], from[row]); */ + + if (blink[row] == -1) { + top = flink[row]; + if (top != -1) blink[top] = -1; + } else { + flink[blink[row]] = flink[row]; + if (flink[row] != -1) { + blink[flink[row]] = blink[row]; + } + } + + ptr = bitmatrix + row*k*w; + if (from[row] == -1) { + optodo = 0; + for (j = 0; j < k*w; j++) { + if (ptr[j]) { + operations[op] = talloc(int, 5); + operations[op][4] = optodo; + operations[op][0] = j/w; + operations[op][1] = j%w; + operations[op][2] = k+row/w; + operations[op][3] = row%w; + optodo = 1; + op++; + } + } + } else { + operations[op] = talloc(int, 5); + operations[op][4] = 0; + operations[op][0] = k+from[row]/w; + operations[op][1] = from[row]%w; + operations[op][2] = k+row/w; + operations[op][3] = row%w; + op++; + b1 = bitmatrix + from[row]*k*w; + for (j = 0; j < k*w; j++) { + if (ptr[j] ^ b1[j]) { + operations[op] = talloc(int, 5); + operations[op][4] = 1; + operations[op][0] = j/w; + operations[op][1] = j%w; + operations[op][2] = k+row/w; + operations[op][3] = row%w; + optodo = 1; + op++; + } + } + } + bestdiff = k*w+1; + for (i = top; i != -1; i = flink[i]) { + no = 1; + b1 = bitmatrix + i*k*w; + for (j = 0; j < k*w; j++) no += (ptr[j] ^ b1[j]); + if (no < diff[i]) { + from[i] = row; + diff[i] = no; + } + if (diff[i] < bestdiff) { + bestdiff = diff[i]; + bestrow = i; + } + } + } + + operations[op] = talloc(int, 5); + operations[op][0] = -1; + free(from); + free(diff); + free(blink); + free(flink); + + return operations; +} + +void jerasure_bitmatrix_encode(int k, int m, int w, int *bitmatrix, + char **data_ptrs, char **coding_ptrs, int size, int packetsize) +{ + int i; + + if (packetsize%sizeof(long) != 0) { + fprintf(stderr, "jerasure_bitmatrix_encode - packetsize(%d) %c sizeof(long) != 0\n", packetsize, '%'); + exit(1); + } + if (size%(packetsize*w) != 0) { + fprintf(stderr, "jerasure_bitmatrix_encode - size(%d) %c (packetsize(%d)*w(%d))) != 0\n", + size, '%', packetsize, w); + exit(1); + } + + for (i = 0; i < m; i++) { + jerasure_bitmatrix_dotprod(k, w, bitmatrix+i*k*w*w, NULL, k+i, data_ptrs, coding_ptrs, size, packetsize); + } +} + +/* + * Exported function for use by autoconf to perform quick + * spot-check. + */ +int jerasure_autoconf_test() +{ + int x = galois_single_multiply(1, 2, 8); + if (x != 2) { + return -1; + } + return 0; +} + diff --git a/src/erasure-code/jerasure/jerasure/src/liberation.c b/src/erasure-code/jerasure/jerasure/src/liberation.c new file mode 100644 index 000000000000..11a1c4fea7ee --- /dev/null +++ b/src/erasure-code/jerasure/jerasure/src/liberation.c @@ -0,0 +1,262 @@ +/* * + * Copyright (c) 2014, James S. Plank and Kevin Greenan + * All rights reserved. 
+ * + * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure + * Coding Techniques + * + * Revision 2.0: Galois Field backend now links to GF-Complete + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* Jerasure's authors: + + Revision 2.x - 2014: James S. Plank and Kevin M. Greenan + Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman. + Revision 1.0 - 2007: James S. 
Plank + */ + +#include +#include +#include + +#include "galois.h" +#include "jerasure.h" +#include "liberation.h" + +#define talloc(type, num) (type *) malloc(sizeof(type)*(num)) + +int *liberation_coding_bitmatrix(int k, int w) +{ + int *matrix, i, j, index; + + if (k > w) return NULL; + matrix = talloc(int, 2*k*w*w); + if (matrix == NULL) return NULL; + bzero(matrix, sizeof(int)*2*k*w*w); + + /* Set up identity matrices */ + + for(i = 0; i < w; i++) { + index = i*k*w+i; + for (j = 0; j < k; j++) { + matrix[index] = 1; + index += w; + } + } + + /* Set up liberation matrices */ + + for (j = 0; j < k; j++) { + index = k*w*w+j*w; + for (i = 0; i < w; i++) { + matrix[index+(j+i)%w] = 1; + index += (k*w); + } + if (j > 0) { + i = (j*((w-1)/2))%w; + matrix[k*w*w+j*w+i*k*w+(i+j-1)%w] = 1; + } + } + return matrix; +} + + +int *liber8tion_coding_bitmatrix(int k) +{ + int *matrix, i, j, index; + int w; + + w = 8; + if (k > w) return NULL; + matrix = talloc(int, 2*k*w*w); + if (matrix == NULL) return NULL; + bzero(matrix, sizeof(int)*2*k*w*w); + + /* Set up identity matrices */ + + for(i = 0; i < w; i++) { + index = i*k*w+i; + for (j = 0; j < k; j++) { + matrix[index] = 1; + index += w; + } + } + + /* Set up liber8tion matrices */ + + index = k*w*w; + + if (k == 0) return matrix; + matrix[index+0*k*w+0*w+0] = 1; + matrix[index+1*k*w+0*w+1] = 1; + matrix[index+2*k*w+0*w+2] = 1; + matrix[index+3*k*w+0*w+3] = 1; + matrix[index+4*k*w+0*w+4] = 1; + matrix[index+5*k*w+0*w+5] = 1; + matrix[index+6*k*w+0*w+6] = 1; + matrix[index+7*k*w+0*w+7] = 1; + + if (k == 1) return matrix; + matrix[index+0*k*w+1*w+7] = 1; + matrix[index+1*k*w+1*w+3] = 1; + matrix[index+2*k*w+1*w+0] = 1; + matrix[index+3*k*w+1*w+2] = 1; + matrix[index+4*k*w+1*w+6] = 1; + matrix[index+5*k*w+1*w+1] = 1; + matrix[index+6*k*w+1*w+5] = 1; + matrix[index+7*k*w+1*w+4] = 1; + matrix[index+4*k*w+1*w+7] = 1; + + if (k == 2) return matrix; + matrix[index+0*k*w+2*w+6] = 1; + matrix[index+1*k*w+2*w+2] = 1; + matrix[index+2*k*w+2*w+4] = 1; + matrix[index+3*k*w+2*w+0] = 1; + matrix[index+4*k*w+2*w+7] = 1; + matrix[index+5*k*w+2*w+3] = 1; + matrix[index+6*k*w+2*w+1] = 1; + matrix[index+7*k*w+2*w+5] = 1; + matrix[index+1*k*w+2*w+3] = 1; + + if (k == 3) return matrix; + matrix[index+0*k*w+3*w+2] = 1; + matrix[index+1*k*w+3*w+5] = 1; + matrix[index+2*k*w+3*w+7] = 1; + matrix[index+3*k*w+3*w+6] = 1; + matrix[index+4*k*w+3*w+0] = 1; + matrix[index+5*k*w+3*w+3] = 1; + matrix[index+6*k*w+3*w+4] = 1; + matrix[index+7*k*w+3*w+1] = 1; + matrix[index+5*k*w+3*w+4] = 1; + + if (k == 4) return matrix; + matrix[index+0*k*w+4*w+5] = 1; + matrix[index+1*k*w+4*w+6] = 1; + matrix[index+2*k*w+4*w+1] = 1; + matrix[index+3*k*w+4*w+7] = 1; + matrix[index+4*k*w+4*w+2] = 1; + matrix[index+5*k*w+4*w+4] = 1; + matrix[index+6*k*w+4*w+3] = 1; + matrix[index+7*k*w+4*w+0] = 1; + matrix[index+2*k*w+4*w+0] = 1; + + if (k == 5) return matrix; + matrix[index+0*k*w+5*w+1] = 1; + matrix[index+1*k*w+5*w+2] = 1; + matrix[index+2*k*w+5*w+3] = 1; + matrix[index+3*k*w+5*w+4] = 1; + matrix[index+4*k*w+5*w+5] = 1; + matrix[index+5*k*w+5*w+6] = 1; + matrix[index+6*k*w+5*w+7] = 1; + matrix[index+7*k*w+5*w+0] = 1; + matrix[index+7*k*w+5*w+2] = 1; + + if (k == 6) return matrix; + matrix[index+0*k*w+6*w+3] = 1; + matrix[index+1*k*w+6*w+0] = 1; + matrix[index+2*k*w+6*w+6] = 1; + matrix[index+3*k*w+6*w+5] = 1; + matrix[index+4*k*w+6*w+1] = 1; + matrix[index+5*k*w+6*w+7] = 1; + matrix[index+6*k*w+6*w+4] = 1; + matrix[index+7*k*w+6*w+2] = 1; + matrix[index+6*k*w+6*w+5] = 1; + + if (k == 7) return matrix; 
+ matrix[index+0*k*w+7*w+4] = 1; + matrix[index+1*k*w+7*w+7] = 1; + matrix[index+2*k*w+7*w+1] = 1; + matrix[index+3*k*w+7*w+5] = 1; + matrix[index+4*k*w+7*w+3] = 1; + matrix[index+5*k*w+7*w+2] = 1; + matrix[index+6*k*w+7*w+0] = 1; + matrix[index+7*k*w+7*w+6] = 1; + matrix[index+3*k*w+7*w+1] = 1; + + return matrix; +} + +int *blaum_roth_coding_bitmatrix(int k, int w) +{ + int *matrix, i, j, index, l, m, p; + + if (k > w) return NULL ; + + matrix = talloc(int, 2*k*w*w); + if (matrix == NULL) return NULL; + bzero(matrix, sizeof(int)*2*k*w*w); + + /* Set up identity matrices */ + + for(i = 0; i < w; i++) { + index = i*k*w+i; + for (j = 0; j < k; j++) { + matrix[index] = 1; + index += w; + } + } + + /* Set up blaum_roth matrices -- Ignore identity */ + + p = w+1; + for (j = 0; j < k; j++) { + index = k*w*w+j*w; + if (j == 0) { + for (l = 0; l < w; l++) { + matrix[index+l] = 1; + index += k*w; + } + } else { + i = j; + for (l = 1; l <= w; l++) { + if (l != p-i) { + m = l+i; + if (m >= p) m -= p; + m--; + matrix[index+m] = 1; + } else { + matrix[index+i-1] = 1; + if (i%2 == 0) { + m = i/2; + } else { + m = (p/2) + 1 + (i/2); + } + m--; + matrix[index+m] = 1; + } + index += k*w; + } + } + } + + return matrix; +} diff --git a/src/erasure-code/jerasure/jerasure/src/reed_sol.c b/src/erasure-code/jerasure/jerasure/src/reed_sol.c new file mode 100644 index 000000000000..c0dfe83832a7 --- /dev/null +++ b/src/erasure-code/jerasure/jerasure/src/reed_sol.c @@ -0,0 +1,301 @@ +/* * + * Copyright (c) 2014, James S. Plank and Kevin Greenan + * All rights reserved. + * + * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure + * Coding Techniques + * + * Revision 2.0: Galois Field backend now links to GF-Complete + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* Jerasure's authors: + + Revision 2.x - 2014: James S. Plank and Kevin M. Greenan + Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman. + Revision 1.0 - 2007: James S. 
Plank + */ + +#include +#include +#include + +#include +#include "galois.h" +#include "jerasure.h" +#include "reed_sol.h" + +#define talloc(type, num) (type *) malloc(sizeof(type)*(num)) + +int *reed_sol_r6_coding_matrix(int k, int w) +{ + int *matrix; + int i, tmp; + + if (w != 8 && w != 16 && w != 32) return NULL; + + matrix = talloc(int, 2*k); + if (matrix == NULL) return NULL; + + for (i = 0; i < k; i++) matrix[i] = 1; + matrix[k] = 1; + tmp = 1; + for (i = 1; i < k; i++) { + tmp = galois_single_multiply(tmp, 2, w); + matrix[k+i] = tmp; + } + return matrix; +} + +int *reed_sol_vandermonde_coding_matrix(int k, int m, int w) +{ + int i, j; + int *vdm, *dist; + + vdm = reed_sol_big_vandermonde_distribution_matrix(k+m, k, w); + if (vdm == NULL) return NULL; + dist = talloc(int, m*k); + if (dist == NULL) { + free(vdm); + return NULL; + } + + i = k*k; + for (j = 0; j < m*k; j++) { + dist[j] = vdm[i]; + i++; + } + free(vdm); + return dist; +} + +static int prim08 = -1; +static gf_t GF08; + +void reed_sol_galois_w08_region_multby_2(char *region, int nbytes) +{ + if (prim08 == -1) { + prim08 = galois_single_multiply((1 << 7), 2, 8); + if (!gf_init_hard(&GF08, 8, GF_MULT_BYTWO_b, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + prim08, 0, 0, NULL, NULL)) { + fprintf(stderr, "Error: Can't initialize the GF for reed_sol_galois_w08_region_multby_2\n"); + exit(1); + } + } + GF08.multiply_region.w32(&GF08, region, region, 2, nbytes, 0); +} + +static int prim16 = -1; +static gf_t GF16; + +void reed_sol_galois_w16_region_multby_2(char *region, int nbytes) +{ + if (prim16 == -1) { + prim16 = galois_single_multiply((1 << 15), 2, 16); + if (!gf_init_hard(&GF16, 16, GF_MULT_BYTWO_b, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + prim16, 0, 0, NULL, NULL)) { + fprintf(stderr, "Error: Can't initialize the GF for reed_sol_galois_w16_region_multby_2\n"); + exit(1); + } + } + GF16.multiply_region.w32(&GF16, region, region, 2, nbytes, 0); +} + +static int prim32 = -1; +static gf_t GF32; + +void reed_sol_galois_w32_region_multby_2(char *region, int nbytes) +{ + if (prim32 == -1) { + prim32 = galois_single_multiply((1 << 31), 2, 32); + if (!gf_init_hard(&GF32, 32, GF_MULT_BYTWO_b, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + prim32, 0, 0, NULL, NULL)) { + fprintf(stderr, "Error: Can't initialize the GF for reed_sol_galois_w32_region_multby_2\n"); + exit(1); + } + } + GF32.multiply_region.w32(&GF32, region, region, 2, nbytes, 0); +} + +int reed_sol_r6_encode(int k, int w, char **data_ptrs, char **coding_ptrs, int size) +{ + int i; + + /* First, put the XOR into coding region 0 */ + + memcpy(coding_ptrs[0], data_ptrs[0], size); + + for (i = 1; i < k; i++) galois_region_xor(data_ptrs[i], coding_ptrs[0], size); + + /* Next, put the sum of (2^j)*Dj into coding region 1 */ + + memcpy(coding_ptrs[1], data_ptrs[k-1], size); + + for (i = k-2; i >= 0; i--) { + switch (w) { + case 8: reed_sol_galois_w08_region_multby_2(coding_ptrs[1], size); break; + case 16: reed_sol_galois_w16_region_multby_2(coding_ptrs[1], size); break; + case 32: reed_sol_galois_w32_region_multby_2(coding_ptrs[1], size); break; + default: return 0; + } + + galois_region_xor(data_ptrs[i], coding_ptrs[1], size); + } + return 1; +} + +int *reed_sol_extended_vandermonde_matrix(int rows, int cols, int w) +{ + int *vdm; + int i, j, k; + + if (w < 30 && (1 << w) < rows) return NULL; + if (w < 30 && (1 << w) < cols) return NULL; + + vdm = talloc(int, rows*cols); + if (vdm == NULL) { return NULL; } + + vdm[0] = 1; + for (j = 1; j < cols; j++) vdm[j] = 0; + if (rows == 1) return vdm; 
+ + i=(rows-1)*cols; + for (j = 0; j < cols-1; j++) vdm[i+j] = 0; + vdm[i+j] = 1; + if (rows == 2) return vdm; + + for (i = 1; i < rows-1; i++) { + k = 1; + for (j = 0; j < cols; j++) { + vdm[i*cols+j] = k; + k = galois_single_multiply(k, i, w); + } + } + return vdm; +} + +int *reed_sol_big_vandermonde_distribution_matrix(int rows, int cols, int w) +{ + int *dist; + int i, j, k; + int sindex, srindex, siindex, tmp; + + if (cols >= rows) return NULL; + + dist = reed_sol_extended_vandermonde_matrix(rows, cols, w); + if (dist == NULL) return NULL; + + sindex = 0; + for (i = 1; i < cols; i++) { + sindex += cols; + + /* Find an appropriate row -- where i,i != 0 */ + srindex = sindex+i; + for (j = i; j < rows && dist[srindex] == 0; j++) srindex += cols; + if (j >= rows) { /* This should never happen if rows/w are correct */ + fprintf(stderr, "reed_sol_big_vandermonde_distribution_matrix(%d,%d,%d) - couldn't make matrix\n", + rows, cols, w); + exit(1); + } + + /* If necessary, swap rows */ + if (j != i) { + srindex -= i; + for (k = 0; k < cols; k++) { + tmp = dist[srindex+k]; + dist[srindex+k] = dist[sindex+k]; + dist[sindex+k] = tmp; + } + } + + /* If Element i,i is not equal to 1, multiply the column by 1/i */ + + if (dist[sindex+i] != 1) { + tmp = galois_single_divide(1, dist[sindex+i], w); + srindex = i; + for (j = 0; j < rows; j++) { + dist[srindex] = galois_single_multiply(tmp, dist[srindex], w); + srindex += cols; + } + } + + /* Now, for each element in row i that is not in column 1, you need + to make it zero. Suppose that this is column j, and the element + at i,j = e. Then you want to replace all of column j with + (col-j + col-i*e). Note, that in row i, col-i = 1 and col-j = e. + So (e + 1e) = 0, which is indeed what we want. */ + + for (j = 0; j < cols; j++) { + tmp = dist[sindex+j]; + if (j != i && tmp != 0) { + srindex = j; + siindex = i; + for (k = 0; k < rows; k++) { + dist[srindex] = dist[srindex] ^ galois_single_multiply(tmp, dist[siindex], w); + srindex += cols; + siindex += cols; + } + } + } + } + /* We desire to have row k be all ones. To do that, multiply + the entire column j by 1/dist[k,j]. Then row j by 1/dist[j,j]. */ + + sindex = cols*cols; + for (j = 0; j < cols; j++) { + tmp = dist[sindex]; + if (tmp != 1) { + tmp = galois_single_divide(1, tmp, w); + srindex = sindex; + for (i = cols; i < rows; i++) { + dist[srindex] = galois_single_multiply(tmp, dist[srindex], w); + srindex += cols; + } + } + sindex++; + } + + /* Finally, we'd like the first column of each row to be all ones. To + do that, we multiply the row by the inverse of the first element. */ + + sindex = cols*(cols+1); + for (i = cols+1; i < rows; i++) { + tmp = dist[sindex]; + if (tmp != 1) { + tmp = galois_single_divide(1, tmp, w); + for (j = 0; j < cols; j++) dist[sindex+j] = galois_single_multiply(dist[sindex+j], tmp, w); + } + sindex += cols; + } + + return dist; +} + -- 2.47.3
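Editor's note (illustrative, not part of the patch above): the sketch below shows one way the vendored bit-matrix API might be driven end to end, pairing liberation_coding_bitmatrix() with jerasure_bitmatrix_encode() and jerasure_bitmatrix_decode(). The include paths, buffer sizes and the row_k_ones value are assumptions chosen to satisfy the checks visible in the added sources (k <= w, packetsize a multiple of sizeof(long), size a multiple of packetsize*w); they are not taken from the patch itself.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "jerasure.h"     /* assumed include paths; the patch places these   */
#include "liberation.h"   /* under src/erasure-code/jerasure/jerasure/include */

int main(void)
{
  int k = 6, m = 2, w = 7;                 /* liberation needs k <= w, m == 2       */
  int packetsize = 8 * (int) sizeof(long); /* must be a multiple of sizeof(long)    */
  int size = packetsize * w;               /* must be a multiple of packetsize * w  */
  char *data[6], *coding[2];
  int erasures[3] = { 0, 4, -1 };          /* pretend data devices 0 and 4 are lost */
  int *bitmatrix, i;

  bitmatrix = liberation_coding_bitmatrix(k, w);
  if (bitmatrix == NULL) return 1;

  for (i = 0; i < k; i++) { data[i] = malloc(size); memset(data[i], 'A' + i, size); }
  for (i = 0; i < m; i++) coding[i] = malloc(size);

  jerasure_bitmatrix_encode(k, m, w, bitmatrix, data, coding, size, packetsize);

  /* Clobber the "erased" devices, then rebuild them from the survivors.
     row_k_ones is 1 here because the first coding device of the liberation
     bit-matrix is plain parity (the identity blocks set up above). */
  memset(data[0], 0, size);
  memset(data[4], 0, size);
  if (jerasure_bitmatrix_decode(k, m, w, bitmatrix, 1, erasures,
                                data, coding, size, packetsize) < 0) {
    fprintf(stderr, "decode failed\n");
    return 1;
  }

  printf("recovered byte from device 0: %c\n", data[0][0]);   /* expect 'A' */

  for (i = 0; i < k; i++) free(data[i]);
  for (i = 0; i < m; i++) free(coding[i]);
  free(bitmatrix);
  return 0;
}

With m fixed at 2, the same bit-matrix could instead feed jerasure_generate_schedule_cache() so that repeated single- and double-failure decodes reuse precomputed schedules via jerasure_schedule_decode_cache(), at the cost of the up-front schedule generation shown earlier in the patch.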