From aa76d203ca89686faf819f591ed51eb238bdd0f9 Mon Sep 17 00:00:00 2001 From: ArnaudBienner Date: Mon, 20 Aug 2018 17:19:16 +0200 Subject: [PATCH 01/73] Add syntax coloring to README.md examples --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 30318a9f..6a0ad191 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ they modify xxhash behavior. They are all disabled by default. Calling xxhash 64-bit variant from a C program : -``` +```c #include "xxhash.h" unsigned long long calcul_hash(const void* buffer, size_t length) @@ -112,7 +112,7 @@ unsigned long long calcul_hash(const void* buffer, size_t length) ``` Using streaming variant is more involved, but makes it possible to provide data in multiple rounds : -``` +```c #include "stdlib.h" /* abort() */ #include "xxhash.h" From 52a97a1a081a0dc2a7b4d5fe56e4c0063e2fb053 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 17 Sep 2018 11:17:41 -0700 Subject: [PATCH 02/73] minor Makefile improvements - more warnings enabled (inspired by zstd list) - -fPIC is a CFLAGS rather than an LDFLAGS flag (though it doesn't change the outcome, since everything is compiled in a single command line) --- Makefile | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 6dd738f2..5fb3e24d 100644 --- a/Makefile +++ b/Makefile @@ -41,13 +41,15 @@ else NOSSE4 := endif -CFLAGS ?= -O2 $(NOSSE4) # disables potential auto-vectorization -CFLAGS += -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ - -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ - -Wstrict-prototypes -Wundef - -FLAGS = $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MOREFLAGS) -XXHSUM_VERSION=$(LIBVER) +CFLAGS ?= -O2 $(NOSSE4) # disables potential auto-vectorization +DEBUGFLAGS+=-Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ + -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ + -Wstrict-prototypes -Wundef 
-Wpointer-arith -Wformat-security \ + -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ + -Wredundant-decls -Wstrict-overflow=5 +CFLAGS += $(DEBUGFLAGS) +FLAGS = $(CFLAGS) $(CPPFLAGS) $(MOREFLAGS) +XXHSUM_VERSION = $(LIBVER) MD2ROFF = ronn MD2ROFF_FLAGS = --roff --warnings --manual="User Commands" --organization="xxhsum $(XXHSUM_VERSION)" @@ -76,6 +78,7 @@ LIBXXH = libxxhash.$(SHARED_EXT_VER) .PHONY: default +default: DEBUGFLAGS= default: lib xxhsum_and_links .PHONY: all @@ -83,12 +86,13 @@ all: lib xxhsum xxhsum_inlinedXXH xxhsum32: CFLAGS += -m32 xxhsum xxhsum32: xxhash.c xxhsum.c - $(CC) $(FLAGS) $^ -o $@$(EXT) + $(CC) $(FLAGS) $^ $(LDFLAGS) -o $@$(EXT) .PHONY: xxhsum_and_links -xxhsum_and_links: xxhsum - ln -sf xxhsum xxh32sum - ln -sf xxhsum xxh64sum +xxhsum_and_links: xxhsum xxh32sum xxh64sum + +xxh32sum xxh64sum: xxhsum + ln -sf $^ $@ xxhsum_inlinedXXH: xxhsum.c $(CC) $(FLAGS) -DXXH_PRIVATE_API $^ -o $@$(EXT) @@ -103,11 +107,11 @@ libxxhash.a: xxhash.o $(LIBXXH): LDFLAGS += -shared ifeq (,$(filter Windows%,$(OS))) -$(LIBXXH): LDFLAGS += -fPIC +$(LIBXXH): CFLAGS += -fPIC endif $(LIBXXH): xxhash.c @echo compiling dynamic library $(LIBVER) - @$(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@ + $(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@ @echo creating versioned links @ln -sf $@ libxxhash.$(SHARED_EXT_MAJOR) @ln -sf $@ libxxhash.$(SHARED_EXT) From fbd68c5f09a9da4662c4b58f1520cfda70e455a9 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 17 Sep 2018 11:40:06 -0700 Subject: [PATCH 03/73] updated C++ test --- Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 5fb3e24d..857906cf 100644 --- a/Makefile +++ b/Makefile @@ -170,15 +170,15 @@ test-xxhsum-c: xxhsum armtest: clean @echo ---- test ARM compilation ---- - $(MAKE) xxhsum CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror -static" + CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror -static" $(MAKE) xxhsum clangtest: clean @echo ---- test 
clang compilation ---- - $(MAKE) all CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion" + CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion" $(MAKE) all -gpptest: clean +cxxtest: clean @echo ---- test g++ compilation ---- - $(MAKE) all CC=g++ CFLAGS="-O3 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Werror" + CC="$(CXX) -Wno-deprecated" $(MAKE) all CFLAGS="-O3 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Werror -fPIC" c90test: clean @echo ---- test strict C90 compilation [xxh32 only] ---- @@ -213,7 +213,7 @@ preview-man: clean-man man test: all namespaceTest check test-xxhsum-c c90test -test-all: test test32 armtest clangtest gpptest usan listL120 trailingWhitespace staticAnalyze +test-all: test test32 armtest clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze .PHONY: listL120 listL120: # extract lines >= 120 characters in *.{c,h}, by Takayuki Matsuoka (note : $$, for Makefile compatibility) From bce5f457b0bd05b30c4ae5ffdb975c27feb8c718 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 17 Sep 2018 12:10:03 -0700 Subject: [PATCH 04/73] fixed pointer arithmetic on NULL --- xxhash.c | 170 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 86 insertions(+), 84 deletions(-) diff --git a/xxhash.c b/xxhash.c index da06ea72..05be9b29 100644 --- a/xxhash.c +++ b/xxhash.c @@ -448,12 +448,9 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int s } -FORCE_INLINE -XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) +FORCE_INLINE XXH_errorcode +XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) { - const BYTE* p = (const BYTE*)input; - const BYTE* const bEnd = p + len; - if (input==NULL) #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) return XXH_OK; @@ -461,50 +458,54 @@ XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size return 
XXH_ERROR; #endif - state->total_len_32 += (unsigned)len; - state->large_len |= (len>=16) | (state->total_len_32>=16); + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; - if (state->memsize + len < 16) { /* fill in tmp buffer */ - XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); - state->memsize += (unsigned)len; - return XXH_OK; - } + state->total_len_32 += (unsigned)len; + state->large_len |= (len>=16) | (state->total_len_32>=16); - if (state->memsize) { /* some data left from previous update */ - XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); - { const U32* p32 = state->mem32; - state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++; - state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++; - state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++; - state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); + state->memsize += (unsigned)len; + return XXH_OK; } - p += 16-state->memsize; - state->memsize = 0; - } - - if (p <= bEnd-16) { - const BYTE* const limit = bEnd - 16; - U32 v1 = state->v1; - U32 v2 = state->v2; - U32 v3 = state->v3; - U32 v4 = state->v4; - do { - v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4; - v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4; - v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4; - v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4; - } while (p<=limit); + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const U32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++; + state->v4 = 
XXH32_round(state->v4, XXH_readLE32(p32, endian)); + } + p += 16-state->memsize; + state->memsize = 0; + } - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } + if (p <= bEnd-16) { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } - if (p < bEnd) { - XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } } return XXH_OK; @@ -908,12 +909,9 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long return XXH_OK; } -FORCE_INLINE -XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) +FORCE_INLINE XXH_errorcode +XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) { - const BYTE* p = (const BYTE*)input; - const BYTE* const bEnd = p + len; - if (input==NULL) #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) return XXH_OK; @@ -921,47 +919,51 @@ XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size return XXH_ERROR; #endif - state->total_len += len; + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; - if (state->memsize + len < 32) { /* fill in tmp buffer */ - XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); - state->memsize += (U32)len; - return XXH_OK; - } + state->total_len += len; - if (state->memsize) { /* tmp buffer is full */ - XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 
32-state->memsize); - state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian)); - state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian)); - state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian)); - state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian)); - p += 32-state->memsize; - state->memsize = 0; - } - - if (p+32 <= bEnd) { - const BYTE* const limit = bEnd - 32; - U64 v1 = state->v1; - U64 v2 = state->v2; - U64 v3 = state->v3; - U64 v4 = state->v4; + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } - do { - v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8; - v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8; - v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8; - v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8; - } while (p<=limit); + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian)); + p += 32-state->memsize; + state->memsize = 0; + } - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } + if (p+32 <= bEnd) { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } - if (p 
< bEnd) { - XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } } return XXH_OK; From df35d637c4b64127b78c9f821fb3c0abaecedb82 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 17 Sep 2018 12:12:44 -0700 Subject: [PATCH 05/73] fixed minor printf formatting --- xxhsum.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xxhsum.c b/xxhsum.c index 69931f72..5704f7db 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -227,7 +227,7 @@ static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, U32 r=0; clock_t cStart; - DISPLAYLEVEL(2, "%1i-%-17.17s : %10u ->\r", iterationNb, hName, (U32)bufferSize); + DISPLAYLEVEL(2, "%1u-%-17.17s : %10u ->\r", iterationNb, hName, (U32)bufferSize); cStart = clock(); while (clock() == cStart); /* starts clock() at its exact beginning */ cStart = clock(); @@ -239,7 +239,7 @@ static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, if (r==0) DISPLAYLEVEL(3,".\r"); /* do something with r to avoid compiler "optimizing" away hash function */ { double const timeS = ((double)BMK_clockSpan(cStart) / CLOCKS_PER_SEC) / nbh_perIteration; if (timeS < fastestH) fastestH = timeS; - DISPLAYLEVEL(2, "%1i-%-17.17s : %10u -> %8.0f it/s (%7.1f MB/s) \r", + DISPLAYLEVEL(2, "%1u-%-17.17s : %10u -> %8.0f it/s (%7.1f MB/s) \r", iterationNb, hName, (U32)bufferSize, (double)1 / fastestH, ((double)bufferSize / (1<<20)) / fastestH ); @@ -1140,7 +1140,7 @@ static int usage_advanced(const char* exename) DISPLAY( " -V, --version : display version\n"); DISPLAY( " -h, --help : display long help and exit\n"); DISPLAY( " -b : benchmark mode \n"); - DISPLAY( " -i# : number of iterations (benchmark mode; default %i)\n", g_nbIterations); + DISPLAY( " -i# : number of iterations (benchmark mode; default %u)\n", g_nbIterations); DISPLAY( "\n"); DISPLAY( "The following four 
options are useful only when verifying checksums (-c):\n"); DISPLAY( "--strict : don't print OK for each successfully verified file\n"); From 68652df700793b86dcbfa945f9ecdeb4c45159b5 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 17 Sep 2018 12:28:59 -0700 Subject: [PATCH 06/73] fixed ptr arithmetic on NULL --- xxhsum.c | 139 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 73 insertions(+), 66 deletions(-) diff --git a/xxhsum.c b/xxhsum.c index 5704f7db..7790458f 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -306,37 +306,40 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific for (fileIdx=0; fileIdx 10 KB) { - DISPLAYLEVEL(1, "%u KB", (U32)(keySize >> 10)); - } else { - DISPLAYLEVEL(1, "%u bytes", (U32)keySize); - } - DISPLAYLEVEL(1, "... \n"); + { void* const alignedBuffer = ((char*)buffer+15) - (((size_t)((char*)buffer+15)) & 0xF); /* align on next 16 bytes */ - { int const result = BMK_benchMem(alignedBuffer, keySize, specificTest); - free(buffer); - return result; + /* bench */ + DISPLAYLEVEL(1, "Sample of "); + if (keySize > 10 KB) { + DISPLAYLEVEL(1, "%u KB", (U32)(keySize >> 10)); + } else { + DISPLAYLEVEL(1, "%u bytes", (U32)keySize); + } + DISPLAYLEVEL(1, "... 
\n"); + + { int const result = BMK_benchMem(alignedBuffer, keySize, specificTest); + free(buffer); + return result; + } } } @@ -813,41 +818,43 @@ static CanonicalFromStringResult canonicalFromString(unsigned char* dst, static ParseLineResult parseLine(ParsedLine* parsedLine, const char* line) { const char* const firstSpace = strchr(line, ' '); - const char* const secondSpace = firstSpace + 1; + if (firstSpace == NULL) return ParseLine_invalidFormat; - parsedLine->filename = NULL; - parsedLine->xxhBits = 0; + { const char* const secondSpace = firstSpace + 1; + if (*secondSpace != ' ') return ParseLine_invalidFormat; - if (firstSpace == NULL || *secondSpace != ' ') return ParseLine_invalidFormat; + parsedLine->filename = NULL; + parsedLine->xxhBits = 0; - switch (firstSpace - line) - { - case 8: - { XXH32_canonical_t* xxh32c = &parsedLine->canonical.xxh32; - if (canonicalFromString(xxh32c->digest, sizeof(xxh32c->digest), line) - != CanonicalFromString_ok) { - return ParseLine_invalidFormat; + switch (firstSpace - line) + { + case 8: + { XXH32_canonical_t* xxh32c = &parsedLine->canonical.xxh32; + if (canonicalFromString(xxh32c->digest, sizeof(xxh32c->digest), line) + != CanonicalFromString_ok) { + return ParseLine_invalidFormat; + } + parsedLine->xxhBits = 32; + break; } - parsedLine->xxhBits = 32; - break; - } - case 16: - { XXH64_canonical_t* xxh64c = &parsedLine->canonical.xxh64; - if (canonicalFromString(xxh64c->digest, sizeof(xxh64c->digest), line) - != CanonicalFromString_ok) { - return ParseLine_invalidFormat; + case 16: + { XXH64_canonical_t* xxh64c = &parsedLine->canonical.xxh64; + if (canonicalFromString(xxh64c->digest, sizeof(xxh64c->digest), line) + != CanonicalFromString_ok) { + return ParseLine_invalidFormat; + } + parsedLine->xxhBits = 64; + break; } - parsedLine->xxhBits = 64; - break; + + default: + return ParseLine_invalidFormat; + break; } - default: - return ParseLine_invalidFormat; - break; + parsedLine->filename = secondSpace + 1; } - - 
parsedLine->filename = secondSpace + 1; return ParseLine_ok; } From 3eb9d18ddb6250b67c77db58e82a4e283e126129 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 17 Sep 2018 12:39:18 -0700 Subject: [PATCH 07/73] explicitly states when not checking a return value although, cppcheck seems to overdo this warning, as it also warns for function with `void` return type (??) --- xxhsum.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/xxhsum.c b/xxhsum.c index 7790458f..38aaceb4 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -408,14 +408,14 @@ static void BMK_testSequence64(void* sentence, size_t len, U64 seed, U64 Nresult Dresult = XXH64(sentence, len, seed); BMK_checkResult64(Dresult, Nresult); - XXH64_reset(&state, seed); - XXH64_update(&state, sentence, len); + (void)XXH64_reset(&state, seed); + (void)XXH64_update(&state, sentence, len); Dresult = XXH64_digest(&state); BMK_checkResult64(Dresult, Nresult); - XXH64_reset(&state, seed); + (void)XXH64_reset(&state, seed); for (pos=0; pos Date: Mon, 17 Sep 2018 12:51:12 -0700 Subject: [PATCH 08/73] minor unused last pointer change style warning --- xxhash.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/xxhash.c b/xxhash.c index 05be9b29..ff28749e 100644 --- a/xxhash.c +++ b/xxhash.c @@ -293,9 +293,9 @@ XXH32_finalize(U32 h32, const void* ptr, size_t len, { const BYTE* p = (const BYTE*)ptr; -#define PROCESS1 \ - h32 += (*p) * PRIME32_5; \ - p++; \ + +#define PROCESS1 \ + h32 += (*p++) * PRIME32_5; \ h32 = XXH_rotl32(h32, 11) * PRIME32_1 ; #define PROCESS4 \ @@ -704,9 +704,8 @@ XXH64_finalize(U64 h64, const void* ptr, size_t len, { const BYTE* p = (const BYTE*)ptr; -#define PROCESS1_64 \ - h64 ^= (*p) * PRIME64_5; \ - p++; \ +#define PROCESS1_64 \ + h64 ^= (*p++) * PRIME64_5; \ h64 = XXH_rotl64(h64, 11) * PRIME64_1; #define PROCESS4_64 \ From 79b52d94ba00f286b8ef2ae8ade1e840dc33d31c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 17 Sep 2018 
13:47:54 -0700 Subject: [PATCH 09/73] added cppcheck test to Makefile and transitively to .travis.yml --- .travis.yml | 1 + Makefile | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 895da855..3c37a826 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,3 +7,4 @@ before_install: - sudo apt-get install -qq clang - sudo apt-get install -qq g++-multilib - sudo apt-get install -qq gcc-multilib + - sudo apt-get install -qq cppcheck diff --git a/Makefile b/Makefile index 857906cf..32657417 100644 --- a/Makefile +++ b/Makefile @@ -190,10 +190,16 @@ usan: clean @echo ---- check undefined behavior - sanitize ---- $(MAKE) clean test CC=$(CC) MOREFLAGS="-g -fsanitize=undefined -fno-sanitize-recover=all" +.PHONY: staticAnalyze staticAnalyze: clean @echo ---- static analyzer - scan-build ---- CFLAGS="-g -Werror" scan-build --status-bugs -v $(MAKE) all +.PHONY: cppcheck +cppcheck: + @echo ---- static analyzer - cppcheck ---- + cppcheck . --force --enable=warning,portability,performance,style --error-exitcode=1 > /dev/null + namespaceTest: $(CC) -c xxhash.c $(CC) -DXXH_NAMESPACE=TEST_ -c xxhash.c -o xxhash2.o @@ -213,7 +219,7 @@ preview-man: clean-man man test: all namespaceTest check test-xxhsum-c c90test -test-all: test test32 armtest clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze +test-all: test test32 armtest clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze cppcheck .PHONY: listL120 listL120: # extract lines >= 120 characters in *.{c,h}, by Takayuki Matsuoka (note : $$, for Makefile compatibility) From 542430e0ec256b681640719580ef0e02b3dc93ba Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sat, 29 Sep 2018 23:09:23 -0700 Subject: [PATCH 10/73] fixed compilation issues under msys2/mingw64 --- .gitignore | 1 + Makefile | 3 ++- xxhsum.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 59 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 
36639c6e..d1c970d7 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ libxxhash.* xxh32sum xxh64sum xxhsum +xxhsum.exe xxhsum32 xxhsum_privateXXH xxhsum_inlinedXXH diff --git a/Makefile b/Makefile index 32657417..83db5c3c 100644 --- a/Makefile +++ b/Makefile @@ -94,8 +94,9 @@ xxhsum_and_links: xxhsum xxh32sum xxh64sum xxh32sum xxh64sum: xxhsum ln -sf $^ $@ +xxhsum_inlinedXXH: CPPFLAGS += -DXXH_INLINE_ALL xxhsum_inlinedXXH: xxhsum.c - $(CC) $(FLAGS) -DXXH_PRIVATE_API $^ -o $@$(EXT) + $(CC) $(FLAGS) $^ -o $@$(EXT) # library diff --git a/xxhsum.c b/xxhsum.c index 38aaceb4..7336e1f6 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -63,15 +63,65 @@ /* ************************************ * OS-Specific Includes **************************************/ -#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__) -# include /* _O_BINARY */ -# include /* _setmode, _isatty */ -# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY) +#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__)) /* UNIX-like OS */ \ + || defined(__midipix__) || defined(__VMS)) +# if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1-2001 (SUSv3) conformant */ \ + || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) /* BSD distros */ +# define PLATFORM_POSIX_VERSION 200112L +# else +# if defined(__linux__) || defined(__linux) +# ifndef _POSIX_C_SOURCE +# define _POSIX_C_SOURCE 200112L /* use feature test macro */ +# endif +# endif +# include /* declares _POSIX_VERSION */ +# if defined(_POSIX_VERSION) /* POSIX compliant */ +# define PLATFORM_POSIX_VERSION _POSIX_VERSION +# else +# define PLATFORM_POSIX_VERSION 0 +# endif +# endif +#endif +#if !defined(PLATFORM_POSIX_VERSION) +# define PLATFORM_POSIX_VERSION -1 +#endif + +#if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 1)) \ + || (PLATFORM_POSIX_VERSION >= 
200112L) \ + || defined(__DJGPP__) \ + || defined(__MSYS__) +# include /* isatty */ +# define IS_CONSOLE(stdStream) isatty(fileno(stdStream)) +#elif defined(MSDOS) || defined(OS2) || defined(__CYGWIN__) +# include /* _isatty */ # define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream)) +#elif defined(WIN32) || defined(_WIN32) +# include /* _isatty */ +# include /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */ +# include /* FILE */ +static __inline int IS_CONSOLE(FILE* stdStream) { + DWORD dummy; + return _isatty(_fileno(stdStream)) && GetConsoleMode((HANDLE)_get_osfhandle(_fileno(stdStream)), &dummy); +} +#else +# define IS_CONSOLE(stdStream) 0 +#endif + +#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) +# include /* _O_BINARY */ +# include /* _setmode, _fileno, _get_osfhandle */ +# if !defined(__DJGPP__) +# include /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */ +# include /* FSCTL_SET_SPARSE */ +# define SET_BINARY_MODE(file) { int const unused=_setmode(_fileno(file), _O_BINARY); (void)unused; } +# define SET_SPARSE_FILE_MODE(file) { DWORD dw; DeviceIoControl((HANDLE) _get_osfhandle(_fileno(file)), FSCTL_SET_SPARSE, 0, 0, 0, 0, &dw, 0); } +# else +# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) +# define SET_SPARSE_FILE_MODE(file) +# endif #else -# include /* isatty, STDIN_FILENO */ # define SET_BINARY_MODE(file) -# define IS_CONSOLE(stdStream) isatty(STDIN_FILENO) +# define SET_SPARSE_FILE_MODE(file) #endif #if !defined(S_ISREG) From 7a407f64f987731c5ec3472caa9754d55ca5be3c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sat, 29 Sep 2018 23:13:01 -0700 Subject: [PATCH 11/73] show script lines during lib compilation --- .gitignore | 1 + Makefile | 9 +++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index d1c970d7..9a49cfd1 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ xxhsum.exe xxhsum32 xxhsum_privateXXH xxhsum_inlinedXXH +xxhsum_inlinedXXH.exe # Mac OS-X artefacts 
*.dSYM diff --git a/Makefile b/Makefile index 83db5c3c..55eb82f3 100644 --- a/Makefile +++ b/Makefile @@ -103,19 +103,16 @@ xxhsum_inlinedXXH: xxhsum.c libxxhash.a: ARFLAGS = rcs libxxhash.a: xxhash.o - @echo compiling static library - @$(AR) $(ARFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(LIBXXH): LDFLAGS += -shared ifeq (,$(filter Windows%,$(OS))) $(LIBXXH): CFLAGS += -fPIC endif $(LIBXXH): xxhash.c - @echo compiling dynamic library $(LIBVER) $(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@ - @echo creating versioned links - @ln -sf $@ libxxhash.$(SHARED_EXT_MAJOR) - @ln -sf $@ libxxhash.$(SHARED_EXT) + ln -sf $@ libxxhash.$(SHARED_EXT_MAJOR) + ln -sf $@ libxxhash.$(SHARED_EXT) libxxhash : $(LIBXXH) From 2ec7fddf1c1c7d19212726f3d3dfaaaa6f12d4a8 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sat, 29 Sep 2018 23:54:24 -0700 Subject: [PATCH 12/73] minor optimization : shared xxhash.o compilation --- Makefile | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 55eb82f3..e88013b6 100644 --- a/Makefile +++ b/Makefile @@ -84,8 +84,10 @@ default: lib xxhsum_and_links .PHONY: all all: lib xxhsum xxhsum_inlinedXXH +xxhsum : xxhash.o xxhsum.o + xxhsum32: CFLAGS += -m32 -xxhsum xxhsum32: xxhash.c xxhsum.c +xxhsum32: xxhash.c xxhsum.c $(CC) $(FLAGS) $^ $(LDFLAGS) -o $@$(EXT) .PHONY: xxhsum_and_links @@ -116,6 +118,7 @@ $(LIBXXH): xxhash.c libxxhash : $(LIBXXH) +.PHONY: lib lib: libxxhash.a libxxhash @@ -175,12 +178,16 @@ clangtest: clean CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion" $(MAKE) all cxxtest: clean - @echo ---- test g++ compilation ---- + @echo ---- test C++ compilation ---- CC="$(CXX) -Wno-deprecated" $(MAKE) all CFLAGS="-O3 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Werror -fPIC" -c90test: clean +.PHONY: c90test +c90test: CPPFLAGS += -DXXH_NO_LONG_LONG +c90test: CFLAGS += -std=c90 -Werror -pedantic +c90test: xxhash.c @echo ---- test strict C90 compilation [xxh32 only] ---- - $(CC) 
-std=c90 -Werror -pedantic -DXXH_NO_LONG_LONG -c xxhash.c + $(RM) xxhash.o + $(CC) $(FLAGS) $^ $(LDFLAGS) -c $(RM) xxhash.o usan: CC=clang @@ -198,6 +205,7 @@ cppcheck: @echo ---- static analyzer - cppcheck ---- cppcheck . --force --enable=warning,portability,performance,style --error-exitcode=1 > /dev/null +.PHONY: namespaceTest namespaceTest: $(CC) -c xxhash.c $(CC) -DXXH_NAMESPACE=TEST_ -c xxhash.c -o xxhash2.o @@ -207,6 +215,7 @@ namespaceTest: xxhsum.1: xxhsum.1.md cat $^ | $(MD2ROFF) $(MD2ROFF_FLAGS) | sed -n '/^\.\\\".*/!p' > $@ +.PHONY: man man: xxhsum.1 clean-man: From eec5700f4d62113b47ee548edbc4746f61ffb098 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 11 Oct 2018 17:07:38 -0700 Subject: [PATCH 13/73] added some notes of constant selection as suggested in #151. --- doc/xxhash_spec.md | 29 +++++++++++++++++------------ xxhash.c | 20 ++++++++++---------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/doc/xxhash_spec.md b/doc/xxhash_spec.md index e673334b..7e634e03 100644 --- a/doc/xxhash_spec.md +++ b/doc/xxhash_spec.md @@ -16,7 +16,7 @@ Distribution of this document is unlimited. ### Version -0.1.0 (15/01/18) +0.1.1 (10/10/18) Table of Contents @@ -63,13 +63,15 @@ The algorithm collect and transform input in _stripes_ of 16 bytes. The transfor The algorithm uses 32-bits addition, multiplication, rotate, shift and xor operations. 
Many operations require some 32-bits prime number constants, all defined below : - static const u32 PRIME32_1 = 2654435761U; - static const u32 PRIME32_2 = 2246822519U; - static const u32 PRIME32_3 = 3266489917U; - static const u32 PRIME32_4 = 668265263U; - static const u32 PRIME32_5 = 374761393U; + static const u32 PRIME32_1 = 2654435761U; // 0b10011110001101110111100110110001 + static const u32 PRIME32_2 = 2246822519U; // 0b10000101111010111100101001110111 + static const u32 PRIME32_3 = 3266489917U; // 0b11000010101100101010111000111101 + static const u32 PRIME32_4 = 668265263U; // 0b00100111110101001110101100101111 + static const u32 PRIME32_5 = 374761393U; // 0b00010110010101100110011110110001 -### Step 1. Initialise internal accumulators +These constants are prime numbers, and feature a good mix of bits 1 and 0, neither too regular, nor too dissymmetric. These properties help dispersion capabilities. + +### Step 1. Initialize internal accumulators Each accumulator gets an initial value based on optional `seed` input. Since the `seed` is optional, it can be `0`. @@ -170,11 +172,13 @@ The algorithm collects and transforms input in _stripes_ of 32 bytes. The transf The algorithm uses 64-bit addition, multiplication, rotate, shift and xor operations. 
Many operations require some 64-bit prime number constants, all defined below : - static const u64 PRIME64_1 = 11400714785074694791ULL; - static const u64 PRIME64_2 = 14029467366897019727ULL; - static const u64 PRIME64_3 = 1609587929392839161ULL; - static const u64 PRIME64_4 = 9650029242287828579ULL; - static const u64 PRIME64_5 = 2870177450012600261ULL; + static const u64 PRIME64_1 = 11400714785074694791ULL; // 0b1001111000110111011110011011000110000101111010111100101010000111 + static const u64 PRIME64_2 = 14029467366897019727ULL; // 0b1100001010110010101011100011110100100111110101001110101101001111 + static const u64 PRIME64_3 = 1609587929392839161ULL; // 0b0001011001010110011001111011000110011110001101110111100111111001 + static const u64 PRIME64_4 = 9650029242287828579ULL; // 0b1000010111101011110010100111011111000010101100101010111001100011 + static const u64 PRIME64_5 = 2870177450012600261ULL; // 0b0010011111010100111010110010111100010110010101100110011111000101 + +These constants are prime numbers, and feature a good mix of bits 1 and 0, neither too regular, nor too dissymmetric. These properties help dispersion capabilities. ### Step 1. 
Initialise internal accumulators @@ -308,4 +312,5 @@ It links to the [github project page](https://github.com/Cyan4973/xxHash) where Version changes -------------------- +v0.1.1 : added a note on rationale for selection of constants v0.1.0 : initial release diff --git a/xxhash.c b/xxhash.c index ff28749e..13669b2a 100644 --- a/xxhash.c +++ b/xxhash.c @@ -260,11 +260,11 @@ XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } /* ******************************************************************* * 32-bit hash functions *********************************************************************/ -static const U32 PRIME32_1 = 2654435761U; -static const U32 PRIME32_2 = 2246822519U; -static const U32 PRIME32_3 = 3266489917U; -static const U32 PRIME32_4 = 668265263U; -static const U32 PRIME32_5 = 374761393U; +static const U32 PRIME32_1 = 2654435761U; /* 0b10011110001101110111100110110001 */ +static const U32 PRIME32_2 = 2246822519U; /* 0b10000101111010111100101001110111 */ +static const U32 PRIME32_3 = 3266489917U; /* 0b11000010101100101010111000111101 */ +static const U32 PRIME32_4 = 668265263U; /* 0b00100111110101001110101100101111 */ +static const U32 PRIME32_5 = 374761393U; /* 0b00010110010101100110011110110001 */ static U32 XXH32_round(U32 seed, U32 input) { @@ -663,11 +663,11 @@ static U64 XXH_readBE64(const void* ptr) /*====== xxh64 ======*/ -static const U64 PRIME64_1 = 11400714785074694791ULL; -static const U64 PRIME64_2 = 14029467366897019727ULL; -static const U64 PRIME64_3 = 1609587929392839161ULL; -static const U64 PRIME64_4 = 9650029242287828579ULL; -static const U64 PRIME64_5 = 2870177450012600261ULL; +static const U64 PRIME64_1 = 11400714785074694791ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const U64 PRIME64_2 = 14029467366897019727ULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const U64 PRIME64_3 = 1609587929392839161ULL; /* 
0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const U64 PRIME64_4 = 9650029242287828579ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const U64 PRIME64_5 = 2870177450012600261ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ static U64 XXH64_round(U64 acc, U64 input) { From 0f2dd4a1cb103e3fc8c55c855b821eb24c6d82c3 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 16 Oct 2018 21:47:53 -0700 Subject: [PATCH 14/73] fixed minor cast warning fix #139 --- xxhsum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xxhsum.c b/xxhsum.c index 7336e1f6..d9f5be2d 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -267,7 +267,7 @@ static U32 localXXH64(const void* buffer, size_t bufferSize, U32 seed) { return static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, size_t bufferSize) { - U32 nbh_perIteration = ((300 MB) / (bufferSize+1)) + 1; /* first loop conservatively aims for 300 MB/s */ + U32 nbh_perIteration = (U32)((300 MB) / (bufferSize+1)) + 1; /* first loop conservatively aims for 300 MB/s */ U32 iterationNb; double fastestH = 100000000.; From c99e0c1c700f6cee34a8cd31fc4222c1420cd0df Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 4 Feb 2019 13:57:55 -0800 Subject: [PATCH 15/73] ensure rotl macro arguments are in parenthesis to support non-singleton arguments --- xxhash.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xxhash.c b/xxhash.c index 13669b2a..2cb986f1 100644 --- a/xxhash.c +++ b/xxhash.c @@ -191,8 +191,8 @@ static U32 XXH_read32(const void* memPtr) # define XXH_rotl32(x,r) _rotl(x,r) # define XXH_rotl64(x,r) _rotl64(x,r) #else -# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) -# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) #endif #if defined(_MSC_VER) /* Visual 
Studio */ From d6f83c47f9160c849ea7a8bd958cd9e6d6ed0261 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 4 Feb 2019 14:05:34 -0800 Subject: [PATCH 16/73] renamed FORCE_INLINE into XXH_FORCE_INLINE to reduce risks of symbol duplication when XXH_INLINE_ALL is used --- xxhash.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/xxhash.c b/xxhash.c index 2cb986f1..ce38f71b 100644 --- a/xxhash.c +++ b/xxhash.c @@ -122,16 +122,16 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcp ***************************************/ #ifdef _MSC_VER /* Visual Studio */ # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -# define FORCE_INLINE static __forceinline +# define XXH_FORCE_INLINE static __forceinline #else # if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ # ifdef __GNUC__ -# define FORCE_INLINE static inline __attribute__((always_inline)) +# define XXH_FORCE_INLINE static inline __attribute__((always_inline)) # else -# define FORCE_INLINE static inline +# define XXH_FORCE_INLINE static inline # endif # else -# define FORCE_INLINE static +# define XXH_FORCE_INLINE static # endif /* __STDC_VERSION__ */ #endif @@ -231,7 +231,8 @@ static int XXH_isLittleEndian(void) *****************************/ typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; -FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +XXH_FORCE_INLINE U32 +XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) { if (align==XXH_unaligned) return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); @@ -239,7 +240,7 @@ FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_a return endian==XXH_littleEndian ? 
*(const U32*)ptr : XXH_swap32(*(const U32*)ptr); } -FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) +XXH_FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) { return XXH_readLE32_align(ptr, endian, XXH_unaligned); } @@ -348,7 +349,7 @@ XXH32_finalize(U32 h32, const void* ptr, size_t len, } -FORCE_INLINE U32 +XXH_FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) { @@ -448,7 +449,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int s } -FORCE_INLINE XXH_errorcode +XXH_FORCE_INLINE XXH_errorcode XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) { if (input==NULL) @@ -523,7 +524,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* } -FORCE_INLINE U32 +XXH_FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) { U32 h32; @@ -642,7 +643,8 @@ static U64 XXH_swap64 (U64 x) } #endif -FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +XXH_FORCE_INLINE U64 +XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) { if (align==XXH_unaligned) return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); @@ -650,7 +652,7 @@ FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_a return endian==XXH_littleEndian ? 
*(const U64*)ptr : XXH_swap64(*(const U64*)ptr); } -FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) +XXH_FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) { return XXH_readLE64_align(ptr, endian, XXH_unaligned); } @@ -807,7 +809,7 @@ XXH64_finalize(U64 h64, const void* ptr, size_t len, return 0; /* unreachable, but some compilers complain without it */ } -FORCE_INLINE U64 +XXH_FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align) { @@ -908,7 +910,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long return XXH_OK; } -FORCE_INLINE XXH_errorcode +XXH_FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) { if (input==NULL) @@ -978,7 +980,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* return XXH64_update_endian(state_in, input, len, XXH_bigEndian); } -FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) +XXH_FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) { U64 h64; From 1b0f7b371d63c9b1997fdfb95b926d614ba3d920 Mon Sep 17 00:00:00 2001 From: LambdAurora Date: Tue, 12 Feb 2019 22:15:58 +0100 Subject: [PATCH 17/73] Added export of public symbols on Windows. 
--- cmake_unofficial/CMakeLists.txt | 1 + xxhash.h | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cmake_unofficial/CMakeLists.txt b/cmake_unofficial/CMakeLists.txt index 1ca7a06d..bfdd5481 100644 --- a/cmake_unofficial/CMakeLists.txt +++ b/cmake_unofficial/CMakeLists.txt @@ -57,6 +57,7 @@ include_directories("${XXHASH_DIR}") # libxxhash add_library(xxhash "${XXHASH_DIR}/xxhash.c") +target_compile_definitions(xxhash PUBLIC XXH_EXPORT) set_target_properties(xxhash PROPERTIES SOVERSION "${XXHASH_VERSION_STRING}" VERSION "${XXHASH_VERSION_STRING}") diff --git a/xxhash.h b/xxhash.h index d6bad943..486a6d81 100644 --- a/xxhash.h +++ b/xxhash.h @@ -107,7 +107,15 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; # define XXH_PUBLIC_API static # endif #else -# define XXH_PUBLIC_API /* do nothing */ +# ifdef WIN32 +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# else +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif #endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ /*! XXH_NAMESPACE, aka Namespace Emulation : From 3c2844854655207191932d1095f40d53c2d039ac Mon Sep 17 00:00:00 2001 From: LambdAurora Date: Tue, 12 Feb 2019 22:29:57 +0100 Subject: [PATCH 18/73] Added missing condition to export symbols on Windows with CMake. 
--- cmake_unofficial/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake_unofficial/CMakeLists.txt b/cmake_unofficial/CMakeLists.txt index bfdd5481..f66e72ef 100644 --- a/cmake_unofficial/CMakeLists.txt +++ b/cmake_unofficial/CMakeLists.txt @@ -57,7 +57,9 @@ include_directories("${XXHASH_DIR}") # libxxhash add_library(xxhash "${XXHASH_DIR}/xxhash.c") -target_compile_definitions(xxhash PUBLIC XXH_EXPORT) +if (BUILD_SHARED_LIBS) + target_compile_definitions(xxhash PUBLIC XXH_EXPORT) +endif () set_target_properties(xxhash PROPERTIES SOVERSION "${XXHASH_VERSION_STRING}" VERSION "${XXHASH_VERSION_STRING}") From c56b856e5850fe984fedf1d98645f294748a0192 Mon Sep 17 00:00:00 2001 From: LambdAurora Date: Tue, 12 Feb 2019 22:59:45 +0100 Subject: [PATCH 19/73] Fixed undefined reference when building with MinGW. --- xxhash.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xxhash.h b/xxhash.h index 486a6d81..84942e8e 100644 --- a/xxhash.h +++ b/xxhash.h @@ -107,7 +107,7 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; # define XXH_PUBLIC_API static # endif #else -# ifdef WIN32 +# if defined(WIN32) && !defined(__GNUC__) # ifdef XXH_EXPORT # define XXH_PUBLIC_API __declspec(dllexport) # else From 45f39e6d34c776956e37f33e66592ea1a8bb2524 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 26 Feb 2019 12:36:23 -0800 Subject: [PATCH 20/73] first implementation of XXH3_64b currently can only be used for benchmarking (`-b`) --- xxh3.h | 400 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ xxhash.c | 10 ++ xxhsum.c | 84 ++++++++---- 3 files changed, 471 insertions(+), 23 deletions(-) create mode 100644 xxh3.h diff --git a/xxh3.h b/xxh3.h new file mode 100644 index 00000000..f425545f --- /dev/null +++ b/xxh3.h @@ -0,0 +1,400 @@ +#ifndef XXH3_H +#define XXH3_H + + +#define XXH_INLINE_ALL +#include "xxhash.h" + +#define NDEBUG +#include <assert.h> + +//#include <stdio.h> +#define TRACE(...) 
//printf(__VA_ARGS__) + + +// ========================================== +// Vectorization detection +// ========================================== + +// macro enums +#define XXH_SCALAR 0 +#define XXH_SSE2 1 +#define XXH_AVX2 2 + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) +# define XXH_VECTOR XXH_SSE2 +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + + +// ========================================== +// Short keys +// ========================================== + +static U64 XXH3_mixHigh(U64 val) { + return val ^ (val >> 47); +} + +static U64 XXH3_finalMerge_2u64(U64 ll1, U64 ll2, U64 mul) +{ + U64 const llcomb1 = XXH3_mixHigh((ll1 ^ ll2) * mul); + U64 const llcomb2 = XXH3_mixHigh((ll2 ^ llcomb1) * mul); + return llcomb2 * mul; +} + +static U64 XXH3_finalMerge_4u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, U64 mul) +{ + U64 const llcomb1 = XXH_rotl64(ll1 + ll2, 43) + XXH_rotl64(ll3, 30) + ll4; + U64 const llcomb2 = ll1 + XXH_rotl64(ll2 + PRIME64_3, 18) + ll3; + + return XXH3_finalMerge_2u64(llcomb1, llcomb2, mul); +} + +static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, + U64 ll5, U64 ll6, U64 ll7, U64 ll8, + U64 mul) +{ + U64 const ll11 = XXH_rotl64(ll1 + ll7, 21) + (XXH_rotl64(ll2, 34) + ll3) * 9; + U64 const ll12 = ((ll1 + ll2) ^ ll4) + ll6 + 1; + U64 const ll13 = XXH_rotl64(ll5 + ll6, 22) + ll3; + U64 const ll14 = ll5 + XXH_rotl64(ll8, 11) + ll3; + + U64 const ll21 = XXH_swap64((ll11 + ll12) * mul) + ll8; + U64 const ll31 = (XXH_swap64((ll12 + ll21) * mul) + ll7) * mul; + U64 const ll41 = XXH_swap64((ll13 + ll14) * mul + ll31) + ll2; + U64 const ll51 = XXH3_mixHigh((ll14 + ll41) * mul + ll4 + ll8) * mul; + + return ll51 + ll13; +} + + +static inline U64 XXH3_len_1to3_64b(const void* data, size_t len) +{ + assert(data != NULL); + assert(len > 0 && len <= 3); + BYTE const c1 = ((const BYTE*)data)[0]; + BYTE const c2 = ((const BYTE*)data)[len >> 1]; + 
BYTE const c3 = ((const BYTE*)data)[len - 1]; + U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); + U32 const l2 = (U32)(len) + ((U32)(c3) << 2); + U64 const ll3 = (l1 * PRIME64_2) ^ (l2 * PRIME64_1); + return XXH3_mixHigh(ll3) * PRIME64_3; +} + +static inline U64 XXH3_len_4to8_64b(const void* data, size_t len) +{ + assert(data != NULL); + assert(len >= 4 && len <= 8); + U64 const mul = PRIME64_2 + (len * 2); /* keep it odd */ + U64 const ll1 = XXH_read32(data); + U64 const ll2 = XXH_read32((const BYTE*)data + len - 4) + PRIME64_1; + return XXH3_finalMerge_2u64((len-1) + (ll1 << 3), ll2, mul); +} + +static inline U64 XXH3_len_9to16_64b(const void* data, size_t len) +{ + assert(data != NULL); + assert(len >= 9 && len <= 16); + U64 const ll1 = XXH_read64(data) + PRIME64_1; + U64 const ll2 = XXH_read64((const BYTE*)data + len - 8); + U64 const mul = PRIME64_2 + len * 2; /* keep it odd */ + U64 const llcomb3 = ll1 * mul + XXH_rotl64(ll2, 23); + U64 const llcomb4 = ll2 * mul + XXH_rotl64(ll1, 37); + return XXH3_finalMerge_2u64(llcomb3, llcomb4, mul); +} + +static inline U64 XXH3_len_1to16_64b(const void* data, size_t len) +{ + assert(data != NULL); + assert(len > 0 && len <= 16); + if (len > 8) return XXH3_len_9to16_64b(data, len); + if (len >= 4) return XXH3_len_4to8_64b(data, len); + return XXH3_len_1to3_64b(data, len); +} + + +static U64 XXH3_len_17to32_64b(const void* data, size_t len) +{ + assert(data != NULL); + assert(len > 16 && len <= 32); + const BYTE* const p = (const BYTE*)data; + + U64 const mul = PRIME64_3 + len * 2; /* keep it odd */ + U64 const ll1 = XXH_read64(p) * PRIME64_1; + U64 const ll2 = XXH_read64(p + 8); + U64 const ll3 = XXH_read64(p + len - 8) * mul; + U64 const ll4 = XXH_read64(p + len - 16) * PRIME64_2; + + return XXH3_finalMerge_4u64(ll1, ll2, ll3, ll4, mul); +} + + +static U64 XXH3_len_33to64_64b(const void* data, size_t len) +{ + assert(data != NULL); + assert(len > 33 && len <= 64); + const BYTE* const p = (const BYTE*)data; + + U64 const mul 
= PRIME64_2 + len * 2; /* keep it odd */ + + U64 const ll1 = XXH_read64(p); + U64 const ll2 = XXH_read64(p + 8); + U64 const ll3 = XXH_read64(p + 16); + U64 const ll4 = XXH_read64(p + 24); + U64 const ll5 = XXH_read64(p + len - 32); + U64 const ll6 = XXH_read64(p + len - 24); + U64 const ll7 = XXH_read64(p + len - 16); + U64 const ll8 = XXH_read64(p + len - 8); + + return XXH3_finalMerge_8u64(ll1, ll2, ll3, ll4, ll5, ll6, ll7, ll8, mul); +} + + +static U64 XXH3_len_65to96_64b(const void* data, size_t len) +{ + assert(data != NULL); + assert(len > 64 && len <= 96); + const BYTE* const p = (const BYTE*)data; + + U64 const ll1 = XXH3_len_33to64_64b(data, 64); + U64 const ll2 = XXH3_len_17to32_64b(p + len - 32, 32); + return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len); +} + +static U64 XXH3_len_97to128_64b(const void* data, size_t len) +{ + assert(data != NULL); + assert(len > 96 && len <= 128); + const BYTE* const p = (const BYTE*)data; + + U64 const ll1 = XXH3_len_33to64_64b(data, 64); + U64 const ll2 = XXH3_len_33to64_64b(p + 64, len - 64); + return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len); +} + + +// ========================================== +// Long keys +// ========================================== + +#if __GNUC__ +#include +#define ALIGN(n) __attribute__ ((aligned(n))) +#elif _MSC_VER +#include +#define ALIGN(n) __declspec(align(n)) +#else +#define ALIGN(n) +#endif + +#define STRIPE_LEN 64 +#define STRIPE_ELTS (STRIPE_LEN / sizeof(U32)) +#define KEYSET_DEFAULT_SIZE 48 // minimum 32 + + +ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = { + 0xb8fe6c39,0x23a44bbe,0x7c01812c,0xf721ad1c, + 0xded46de9,0x839097db,0x7240a4a4,0xb7b3671f, + 0xcb79e64e,0xccc0e578,0x825ad07d,0xccff7221, + 0xb8084674,0xf743248e,0xe03590e6,0x813a264c, + 0x3c2852bb,0x91c300cb,0x88d0658b,0x1b532ea3, + 0x71644897,0xa20df94e,0x3819ef46,0xa9deacd8, + 0xa8fa763f,0xe39c343f,0xf9dcbbc7,0xc70b4f1d, + 0x8a51e04b,0xcdb45931,0xc89f7ec9,0xd9787364, + + 
0xeac5ac83,0x34d3ebc3,0xc581a0ff,0xfa1363eb, + 0x170ddd51,0xb7f0da49,0xd3165526,0x29d4689e, + 0x2b16be58,0x7d47a1fc,0x8ff8b8d1,0x7ad031ce, + 0x45cb3a8f,0x95160428,0xafd7fbca,0xbb4b407e, +}; + +#define ACC_NB (STRIPE_LEN / sizeof(U64)) + +inline static void +XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict key) +{ + +#if (XXH_VECTOR == XXH_AVX2) + + assert(((size_t)acc) & 31 == 0); + + __m256i* const xacc = (__m256i *) acc; + const __m256i* const xdata = (const __m256i *) data; + ALIGN(32) const __m256i* const xkey = (const __m256i *) key; + + for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + __m256i const d = _mm256_loadu_si256 (xdata+i); + __m256i const k = _mm256_loadu_si256 (xkey+i); + __m256i const dk = _mm256_add_epi32 (d,k); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */ + __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk,0x31)); /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */ + xacc[i] = _mm256_add_epi64(res, xacc[i]); /* xacc must be aligned on 32 bytes boundaries */ + } + +#elif (XXH_VECTOR == XXH_SSE2) + + assert(((size_t)acc) & 15 == 0); + + __m128i* const xacc = (__m128i *) acc; + const __m128i* const xdata = (const __m128i *) data; + ALIGN(16) const __m128i* const xkey = (const __m128i *) key; + + for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + __m128i const d = _mm_loadu_si128 (xdata+i); + __m128i const k = _mm_loadu_si128 (xkey+i); + __m128i const dk = _mm_add_epi32 (d,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk,0x31)); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ + xacc[i] = _mm_add_epi64(res, xacc[i]); /* xacc must be aligned on 16 bytes boundaries */ + } + +#else // scalar variant + + U64* const xacc = (U64*) acc; + const U32* const xdata = (const U32*) data; + const U32* const xkey = (const U32*) key; + + int i; + for (i=0; i < (int)ACC_NB; i++) { + int const left = 2*i; + int const right= 2*i + 1; + xacc[i] 
+= (xdata[left] + xkey[left]) * (U64)(xdata[right] + xkey[right]); + } + +#endif +} + +static void XXH3_scrambleAcc(void* acc, const void* key) +{ +#if (XXH_VECTOR == XXH_AVX2) + + __m256i const xor_p5 = _mm256_set1_epi64x(PRIME64_5); + + assert(((size_t)acc) & 31 == 0); + __m256i* const xacc = (__m256i*) acc; + const __m256i* const xkey = (const __m256i *) key; + + for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + __m256i data = xacc[i]; + __m256i const shifted = _mm256_srli_epi64(data, 47); + data = _mm256_xor_si256(data, shifted); + data = _mm256_xor_si256(data, xor_p5); + + __m256i const k = _mm256_loadu_si256 (xkey+i); + __m256i const dk = _mm256_mul_epu32 (data,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + + __m256i const d2 = _mm256_shuffle_epi32 (data,0x31); + __m256i const k2 = _mm256_shuffle_epi32 (k,0x31); + __m256i const dk2 = _mm256_mul_epu32 (d2,k2); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + + xacc[i] = _mm256_xor_si256(dk, dk2); + } + +#elif (XXH_VECTOR == XXH_SSE2) + + __m128i const xor_p5 = _mm_set1_epi64((__m64)PRIME64_5); + + assert(((size_t)acc) & 15 == 0); + __m128i* const xacc = (__m128i*) acc; + const __m128i* const xkey = (const __m128i *) key; + + for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + __m128i data = xacc[i]; + __m128i const shifted = _mm_srli_epi64(data, 47); + data = _mm_xor_si128(data, shifted); + data = _mm_xor_si128(data, xor_p5); + + __m128i const k = _mm_loadu_si128 (xkey+i); + __m128i const dk = _mm_mul_epu32 (data,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + + __m128i const d2 = _mm_shuffle_epi32 (data,0x31); + __m128i const k2 = _mm_shuffle_epi32 (k,0x31); + __m128i const dk2 = _mm_mul_epu32 (d2,k2); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + + xacc[i] = _mm_xor_si128(dk, dk2); + } + +#else /* scalar variant */ + + U64* const xacc = (U64*) acc + const U32* const xkey = (const U32*) key; + + int i; + for (i=0; i < (int)ACC_NB; i++) { + int const left = 2*i; + int 
const right= 2*i + 1; + xacc[i] ^= xacc[i] >> 47; + xacc[i] ^= PRIME64_5; + + U64 p1 = (xacc[i] >> 32) * xkey[left]; + U64 p2 = (xacc[i] & 0xFFFFFFFF) * xkey[right]; + xacc[i] = p1 ^ p2; + } + +#endif +} + +static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* restrict key, size_t nbStripes) +{ + for (size_t n = 0; n < nbStripes; n++ ) { + XXH3_accumulate_512(acc, (const BYTE*)data + n*STRIPE_LEN, key); + key += 2; + } +} + + +__attribute__((noinline)) static U64 // it seems better for XXH3_64b that hashLong is not inlined : may mess up the switch case ? +XXH3_hashLong(const void* data, size_t len) +{ + ALIGN(64) U64 acc[ACC_NB] = { len, PRIME64_1, PRIME64_2, PRIME64_3, -len }; + + #define NB_KEYS ((KEYSET_DEFAULT_SIZE - STRIPE_ELTS) / 2) + + size_t const block_len = STRIPE_LEN * NB_KEYS; + size_t const nb_blocks = len / block_len; + + for (size_t n = 0; n < nb_blocks; n++) { + XXH3_accumulate(acc, (const BYTE*)data + n*block_len, kKey, NB_KEYS); + XXH3_scrambleAcc(acc, kKey + (KEYSET_DEFAULT_SIZE - STRIPE_ELTS)); + } + + /* last partial block */ + assert(len > STRIPE_LEN); + size_t const nbStripes = (len % block_len) / STRIPE_LEN; + assert(nbStripes < NB_KEYS); + XXH3_accumulate(acc, (const BYTE*)data + nb_blocks*block_len, kKey, nbStripes); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const BYTE* const p = (const BYTE*) data + len - STRIPE_LEN; + XXH3_accumulate_512(acc, p, kKey + nbStripes*2); + } + + /* converge into final hash */ + return XXH3_finalMerge_8u64(acc[0], acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7], PRIME64_2); +} + + +// ========================================== +// Public prototype +// ========================================== + +XXH_PUBLIC_API U64 XXH3_64b(const void* data, size_t len) +{ + switch ((len-1) / 16) { /* intentional underflow */ + case 0: return XXH3_len_1to16_64b(data, len); + case 1: return XXH3_len_17to32_64b(data, len); + case 2: + case 3: return XXH3_len_33to64_64b(data, len); /* 
33-64 */ + default:; + } + if (len==0) return 0; + if (len <= 96) return XXH3_len_65to96_64b(data, len); + if (len <= 128) return XXH3_len_97to128_64b(data, len); + return XXH3_hashLong(data, len); +} + +#endif /* XXH3_H */ diff --git a/xxhash.c b/xxhash.c index ce38f71b..5b678313 100644 --- a/xxhash.c +++ b/xxhash.c @@ -1029,4 +1029,14 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src return XXH_readBE64(src); } + + +/* ******************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +*********************************************************************/ + +#include "xxh3.h" + + #endif /* XXH_NO_LONG_LONG */ diff --git a/xxhsum.c b/xxhsum.c index d9f5be2d..da5a566d 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -265,6 +265,9 @@ static U32 localXXH32(const void* buffer, size_t bufferSize, U32 seed) { return static U32 localXXH64(const void* buffer, size_t bufferSize, U32 seed) { return (U32)XXH64(buffer, bufferSize, seed); } +U64 XXH3_64b(const void* data, size_t len); +static U32 localXXH3_64b(const void* buffer, size_t bufferSize, U32 seed) { (void)seed; return (U32)XXH3_64b(buffer, bufferSize); } + static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, size_t bufferSize) { U32 nbh_perIteration = (U32)((300 MB) / (bufferSize+1)) + 1; /* first loop conservatively aims for 300 MB/s */ @@ -330,7 +333,15 @@ static int BMK_benchMem(const void* buffer, size_t bufferSize, U32 specificTest) if ((specificTest==0) | (specificTest==4)) BMK_benchHash(localXXH64, "XXH64 unaligned", ((const char*)buffer)+3, bufferSize); - if (specificTest > 4) { + /* Bench XXH3 */ + if ((specificTest==0) | (specificTest==5)) + BMK_benchHash(localXXH3_64b, "XXH3_64bits", buffer, bufferSize); + + /* Bench XXH3 on Unaligned input */ + if ((specificTest==0) | (specificTest==6)) + BMK_benchHash(localXXH3_64b, "XXH3_64b unaligned", ((const char*)buffer)+3, 
bufferSize); + + if (specificTest > 6) { DISPLAY("benchmark mode invalid \n"); return 1; } @@ -397,15 +408,15 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific -static int BMK_benchInternal(size_t keySize, int specificTest) +static int BMK_benchInternal(size_t keySize, U32 specificTest) { void* const buffer = calloc(keySize+16+3, 1); - if(!buffer) { + if (!buffer) { DISPLAY("\nError: not enough memory!\n"); return 12; } - { void* const alignedBuffer = ((char*)buffer+15) - (((size_t)((char*)buffer+15)) & 0xF); /* align on next 16 bytes */ + { const void* const alignedBuffer = ((char*)buffer+15) - (((size_t)((char*)buffer+15)) & 0xF); /* align on next 16 bytes */ /* bench */ DISPLAYLEVEL(1, "Sample of "); @@ -749,10 +760,10 @@ typedef struct { char* lineBuf; size_t blockSize; char* blockBuf; - int strictMode; - int statusOnly; - int warn; - int quiet; + U32 strictMode; + U32 statusOnly; + U32 warn; + U32 quiet; ParseFileReport report; } ParseFileArg; @@ -766,7 +777,7 @@ typedef struct { static GetLineResult getLine(char** lineBuf, int* lineMax, FILE* inFile) { GetLineResult result = GetLine_ok; - int len = 0; + size_t len = 0; if ((*lineBuf == NULL) || (*lineMax<1)) { free(*lineBuf); /* in case it's != NULL */ @@ -787,9 +798,9 @@ static GetLineResult getLine(char** lineBuf, int* lineMax, FILE* inFile) } /* Make enough space for len+1 (for final NUL) bytes. 
*/ - if (len+1 >= *lineMax) { + if (len+1 >= (size_t)*lineMax) { char* newLineBuf = NULL; - int newBufSize = *lineMax; + size_t newBufSize = (size_t)*lineMax; newBufSize += (newBufSize/2) + 1; /* x 1.5 */ if (newBufSize > MAX_LINE_LENGTH) newBufSize = MAX_LINE_LENGTH; @@ -799,7 +810,7 @@ static GetLineResult getLine(char** lineBuf, int* lineMax, FILE* inFile) if (newLineBuf == NULL) return GetLine_outOfMemory; *lineBuf = newLineBuf; - *lineMax = newBufSize; + *lineMax = (int)newBufSize; } if (c == '\n') break; @@ -1214,24 +1225,51 @@ static int badusage(const char* exename) return 1; } -/*! readU32FromChar() : - @return : unsigned integer value read from input in `char` format, - 0 is no figure at *stringPtr position. - Interprets K, KB, KiB, M, MB and MiB suffix. - Modifies `*stringPtr`, advancing it to position where reading stopped. - Note : function result can overflow if digit string > MAX_UINT */ -static unsigned readU32FromChar(const char** stringPtr) +static void errorOut(const char* msg) +{ + DISPLAY("%s \n", msg); exit(1); +} + +/*! readU32FromCharChecked() : + * @return 0 if success, and store the result in *value. + * allows and interprets K, KB, KiB, M, MB and MiB suffix. + * Will also modify `*stringPtr`, advancing it to position where it stopped reading. 
+ * @return 1 if an overflow error occurs */ +static int readU32FromCharChecked(const char** stringPtr, unsigned* value) { + static unsigned const max = (((unsigned)(-1)) / 10) - 1; unsigned result = 0; - while ((**stringPtr >='0') && (**stringPtr <='9')) - result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; + while ((**stringPtr >='0') && (**stringPtr <='9')) { + if (result > max) return 1; // overflow error + result *= 10; + result += (unsigned)(**stringPtr - '0'); + (*stringPtr)++ ; + } if ((**stringPtr=='K') || (**stringPtr=='M')) { + unsigned const maxK = ((unsigned)(-1)) >> 10; + if (result > maxK) return 1; // overflow error result <<= 10; - if (**stringPtr=='M') result <<= 10; - (*stringPtr)++ ; + if (**stringPtr=='M') { + if (result > maxK) return 1; // overflow error + result <<= 10; + } + (*stringPtr)++; /* skip `K` or `M` */ if (**stringPtr=='i') (*stringPtr)++; if (**stringPtr=='B') (*stringPtr)++; } + *value = result; + return 0; +} + +/*! readU32FromChar() : + * @return : unsigned integer value read from input in `char` format. + * allows and interprets K, KB, KiB, M, MB and MiB suffix. + * Will also modify `*stringPtr`, advancing it to position where it stopped reading. 
+ * Note : function will exit() program if digit sequence overflows */ +static unsigned readU32FromChar(const char** stringPtr) { + static const char errorMsg[] = "error: numeric value too large"; + unsigned result; + if (readU32FromCharChecked(stringPtr, &result)) { errorOut(errorMsg); } return result; } From 43c10239c97b7519589516cbfb2dae1087dcc8e4 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 26 Feb 2019 13:45:56 -0800 Subject: [PATCH 21/73] minor C90 adaptation fixes added -Wconversion flag --- Makefile | 2 +- xxh3.h | 281 +++++++++++++++++++++++++++++-------------------------- xxhash.c | 6 +- xxhash.h | 48 ++++++---- xxhsum.c | 1 - 5 files changed, 179 insertions(+), 159 deletions(-) diff --git a/Makefile b/Makefile index e88013b6..4c426ab0 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ NOSSE4 := endif CFLAGS ?= -O2 $(NOSSE4) # disables potential auto-vectorization -DEBUGFLAGS+=-Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ +DEBUGFLAGS+=-Wall -Wextra -Wconversion -Wcast-qual -Wcast-align -Wshadow \ -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \ -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ diff --git a/xxh3.h b/xxh3.h index f425545f..c8d70880 100644 --- a/xxh3.h +++ b/xxh3.h @@ -2,21 +2,17 @@ #define XXH3_H +#undef XXH_INLINE_ALL /* in case it's already defined */ #define XXH_INLINE_ALL #include "xxhash.h" #define NDEBUG #include <assert.h> -//#include <stdio.h> -#define TRACE(...) 
//printf(__VA_ARGS__) - -// ========================================== -// Vectorization detection -// ========================================== - -// macro enums +/* ========================================== + * Vectorization detection + * ========================================== */ #define XXH_SCALAR 0 #define XXH_SSE2 1 #define XXH_AVX2 2 @@ -32,9 +28,9 @@ #endif -// ========================================== -// Short keys -// ========================================== +/* ========================================== + * Short keys + * ========================================== */ static U64 XXH3_mixHigh(U64 val) { return val ^ (val >> 47); @@ -42,17 +38,17 @@ static U64 XXH3_mixHigh(U64 val) { static U64 XXH3_finalMerge_2u64(U64 ll1, U64 ll2, U64 mul) { - U64 const llcomb1 = XXH3_mixHigh((ll1 ^ ll2) * mul); - U64 const llcomb2 = XXH3_mixHigh((ll2 ^ llcomb1) * mul); - return llcomb2 * mul; + U64 const ll11 = XXH3_mixHigh((ll1 ^ ll2) * mul); + U64 const ll21 = XXH3_mixHigh((ll2 ^ ll11) * mul); + return ll21 * mul; } static U64 XXH3_finalMerge_4u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, U64 mul) { - U64 const llcomb1 = XXH_rotl64(ll1 + ll2, 43) + XXH_rotl64(ll3, 30) + ll4; - U64 const llcomb2 = ll1 + XXH_rotl64(ll2 + PRIME64_3, 18) + ll3; + U64 const ll11 = XXH_rotl64(ll1 + ll2, 43) + XXH_rotl64(ll3, 30) + ll4; + U64 const ll12 = ll1 + XXH_rotl64(ll2 + PRIME64_3, 18) + ll3; - return XXH3_finalMerge_2u64(llcomb1, llcomb2, mul); + return XXH3_finalMerge_2u64(ll11, ll12, mul); } static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, @@ -77,44 +73,49 @@ static inline U64 XXH3_len_1to3_64b(const void* data, size_t len) { assert(data != NULL); assert(len > 0 && len <= 3); - BYTE const c1 = ((const BYTE*)data)[0]; - BYTE const c2 = ((const BYTE*)data)[len >> 1]; - BYTE const c3 = ((const BYTE*)data)[len - 1]; - U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); - U32 const l2 = (U32)(len) + ((U32)(c3) << 2); - U64 const ll3 = (l1 * PRIME64_2) ^ (l2 * PRIME64_1); - 
return XXH3_mixHigh(ll3) * PRIME64_3; + { BYTE const c1 = ((const BYTE*)data)[0]; + BYTE const c2 = ((const BYTE*)data)[len >> 1]; + BYTE const c3 = ((const BYTE*)data)[len - 1]; + U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); + U32 const l2 = (U32)(len) + ((U32)(c3) << 2); + U64 const ll3 = (l1 * PRIME64_2) ^ (l2 * PRIME64_1); + return XXH3_mixHigh(ll3) * PRIME64_3; + } } + static inline U64 XXH3_len_4to8_64b(const void* data, size_t len) { assert(data != NULL); assert(len >= 4 && len <= 8); - U64 const mul = PRIME64_2 + (len * 2); /* keep it odd */ - U64 const ll1 = XXH_read32(data); - U64 const ll2 = XXH_read32((const BYTE*)data + len - 4) + PRIME64_1; - return XXH3_finalMerge_2u64((len-1) + (ll1 << 3), ll2, mul); + { U64 const mul = PRIME64_2 + (len * 2); /* keep it odd */ + U64 const ll1 = XXH_read32(data); + U64 const ll2 = XXH_read32((const BYTE*)data + len - 4) + PRIME64_1; + return XXH3_finalMerge_2u64((len-1) + (ll1 << 3), ll2, mul); + } } static inline U64 XXH3_len_9to16_64b(const void* data, size_t len) { assert(data != NULL); assert(len >= 9 && len <= 16); - U64 const ll1 = XXH_read64(data) + PRIME64_1; - U64 const ll2 = XXH_read64((const BYTE*)data + len - 8); - U64 const mul = PRIME64_2 + len * 2; /* keep it odd */ - U64 const llcomb3 = ll1 * mul + XXH_rotl64(ll2, 23); - U64 const llcomb4 = ll2 * mul + XXH_rotl64(ll1, 37); - return XXH3_finalMerge_2u64(llcomb3, llcomb4, mul); + { U64 const ll1 = XXH_read64(data) + PRIME64_1; + U64 const ll2 = XXH_read64((const BYTE*)data + len - 8); + U64 const mul = PRIME64_2 + len * 2; /* keep it odd */ + U64 const llcomb3 = ll1 * mul + XXH_rotl64(ll2, 23); + U64 const llcomb4 = ll2 * mul + XXH_rotl64(ll1, 37); + return XXH3_finalMerge_2u64(llcomb3, llcomb4, mul); + } } static inline U64 XXH3_len_1to16_64b(const void* data, size_t len) { assert(data != NULL); assert(len > 0 && len <= 16); - if (len > 8) return XXH3_len_9to16_64b(data, len); - if (len >= 4) return XXH3_len_4to8_64b(data, len); - return 
XXH3_len_1to3_64b(data, len); + { if (len > 8) return XXH3_len_9to16_64b(data, len); + if (len >= 4) return XXH3_len_4to8_64b(data, len); + return XXH3_len_1to3_64b(data, len); + } } @@ -122,15 +123,17 @@ static U64 XXH3_len_17to32_64b(const void* data, size_t len) { assert(data != NULL); assert(len > 16 && len <= 32); - const BYTE* const p = (const BYTE*)data; - U64 const mul = PRIME64_3 + len * 2; /* keep it odd */ - U64 const ll1 = XXH_read64(p) * PRIME64_1; - U64 const ll2 = XXH_read64(p + 8); - U64 const ll3 = XXH_read64(p + len - 8) * mul; - U64 const ll4 = XXH_read64(p + len - 16) * PRIME64_2; + { const BYTE* const p = (const BYTE*)data; - return XXH3_finalMerge_4u64(ll1, ll2, ll3, ll4, mul); + U64 const mul = PRIME64_3 + len * 2; /* keep it odd */ + U64 const ll1 = XXH_read64(p) * PRIME64_1; + U64 const ll2 = XXH_read64(p + 8); + U64 const ll3 = XXH_read64(p + len - 8) * mul; + U64 const ll4 = XXH_read64(p + len - 16) * PRIME64_2; + + return XXH3_finalMerge_4u64(ll1, ll2, ll3, ll4, mul); + } } @@ -138,20 +141,22 @@ static U64 XXH3_len_33to64_64b(const void* data, size_t len) { assert(data != NULL); assert(len > 33 && len <= 64); - const BYTE* const p = (const BYTE*)data; - U64 const mul = PRIME64_2 + len * 2; /* keep it odd */ + { const BYTE* const p = (const BYTE*)data; - U64 const ll1 = XXH_read64(p); - U64 const ll2 = XXH_read64(p + 8); - U64 const ll3 = XXH_read64(p + 16); - U64 const ll4 = XXH_read64(p + 24); - U64 const ll5 = XXH_read64(p + len - 32); - U64 const ll6 = XXH_read64(p + len - 24); - U64 const ll7 = XXH_read64(p + len - 16); - U64 const ll8 = XXH_read64(p + len - 8); + U64 const mul = PRIME64_2 + len * 2; /* keep it odd */ - return XXH3_finalMerge_8u64(ll1, ll2, ll3, ll4, ll5, ll6, ll7, ll8, mul); + U64 const ll1 = XXH_read64(p); + U64 const ll2 = XXH_read64(p + 8); + U64 const ll3 = XXH_read64(p + 16); + U64 const ll4 = XXH_read64(p + 24); + U64 const ll5 = XXH_read64(p + len - 32); + U64 const ll6 = XXH_read64(p + len - 24); + U64 const 
ll7 = XXH_read64(p + len - 16); + U64 const ll8 = XXH_read64(p + len - 8); + + return XXH3_finalMerge_8u64(ll1, ll2, ll3, ll4, ll5, ll6, ll7, ll8, mul); + } } @@ -159,28 +164,32 @@ static U64 XXH3_len_65to96_64b(const void* data, size_t len) { assert(data != NULL); assert(len > 64 && len <= 96); - const BYTE* const p = (const BYTE*)data; - U64 const ll1 = XXH3_len_33to64_64b(data, 64); - U64 const ll2 = XXH3_len_17to32_64b(p + len - 32, 32); - return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len); + { const BYTE* const p = (const BYTE*)data; + + U64 const ll1 = XXH3_len_33to64_64b(data, 64); + U64 const ll2 = XXH3_len_17to32_64b(p + len - 32, 32); + return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len); + } } static U64 XXH3_len_97to128_64b(const void* data, size_t len) { assert(data != NULL); assert(len > 96 && len <= 128); - const BYTE* const p = (const BYTE*)data; - U64 const ll1 = XXH3_len_33to64_64b(data, 64); - U64 const ll2 = XXH3_len_33to64_64b(p + 64, len - 64); - return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len); + { const BYTE* const p = (const BYTE*)data; + + U64 const ll1 = XXH3_len_33to64_64b(data, 64); + U64 const ll2 = XXH3_len_33to64_64b(p + 64, len - 64); + return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len); + } } -// ========================================== -// Long keys -// ========================================== +/* ========================================== + * Long keys + * ========================================== */ #if __GNUC__ #include @@ -194,7 +203,7 @@ static U64 XXH3_len_97to128_64b(const void* data, size_t len) #define STRIPE_LEN 64 #define STRIPE_ELTS (STRIPE_LEN / sizeof(U32)) -#define KEYSET_DEFAULT_SIZE 48 // minimum 32 +#define KEYSET_DEFAULT_SIZE 48 /* minimum 32 */ ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = { @@ -218,40 +227,39 @@ ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = { inline static void XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict key) { 
- #if (XXH_VECTOR == XXH_AVX2) assert(((size_t)acc) & 31 == 0); - - __m256i* const xacc = (__m256i *) acc; - const __m256i* const xdata = (const __m256i *) data; - ALIGN(32) const __m256i* const xkey = (const __m256i *) key; - - for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { - __m256i const d = _mm256_loadu_si256 (xdata+i); - __m256i const k = _mm256_loadu_si256 (xkey+i); - __m256i const dk = _mm256_add_epi32 (d,k); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */ - __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk,0x31)); /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */ - xacc[i] = _mm256_add_epi64(res, xacc[i]); /* xacc must be aligned on 32 bytes boundaries */ + { __m256i* const xacc = (__m256i *) acc; + const __m256i* const xdata = (const __m256i *) data; + ALIGN(32) const __m256i* const xkey = (const __m256i *) key; + + for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + __m256i const d = _mm256_loadu_si256 (xdata+i); + __m256i const k = _mm256_loadu_si256 (xkey+i); + __m256i const dk = _mm256_add_epi32 (d,k); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */ + __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk,0x31)); /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */ + xacc[i] = _mm256_add_epi64(res, xacc[i]); /* xacc must be aligned on 32 bytes boundaries */ + } } #elif (XXH_VECTOR == XXH_SSE2) assert(((size_t)acc) & 15 == 0); - - __m128i* const xacc = (__m128i *) acc; - const __m128i* const xdata = (const __m128i *) data; - ALIGN(16) const __m128i* const xkey = (const __m128i *) key; - - for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { - __m128i const d = _mm_loadu_si128 (xdata+i); - __m128i const k = _mm_loadu_si128 (xkey+i); - __m128i const dk = _mm_add_epi32 (d,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ - __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk,0x31)); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ - xacc[i] = _mm_add_epi64(res, xacc[i]); /* xacc must be aligned on 16 
bytes boundaries */ + { __m128i* const xacc = (__m128i *) acc; + const __m128i* const xdata = (const __m128i *) data; + ALIGN(16) const __m128i* const xkey = (const __m128i *) key; + + for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + __m128i const d = _mm_loadu_si128 (xdata+i); + __m128i const k = _mm_loadu_si128 (xkey+i); + __m128i const dk = _mm_add_epi32 (d,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk,0x31)); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ + xacc[i] = _mm_add_epi64(res, xacc[i]); /* xacc must be aligned on 16 bytes boundaries */ + } } -#else // scalar variant +#else /* scalar variant */ U64* const xacc = (U64*) acc; const U32* const xdata = (const U32*) data; @@ -271,55 +279,56 @@ static void XXH3_scrambleAcc(void* acc, const void* key) { #if (XXH_VECTOR == XXH_AVX2) - __m256i const xor_p5 = _mm256_set1_epi64x(PRIME64_5); - assert(((size_t)acc) & 31 == 0); - __m256i* const xacc = (__m256i*) acc; - const __m256i* const xkey = (const __m256i *) key; + { __m256i* const xacc = (__m256i*) acc; + const __m256i* const xkey = (const __m256i *) key; + + __m256i const xor_p5 = _mm256_set1_epi64x(PRIME64_5); - for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { - __m256i data = xacc[i]; - __m256i const shifted = _mm256_srli_epi64(data, 47); - data = _mm256_xor_si256(data, shifted); - data = _mm256_xor_si256(data, xor_p5); + for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + __m256i data = xacc[i]; + __m256i const shifted = _mm256_srli_epi64(data, 47); + data = _mm256_xor_si256(data, shifted); + data = _mm256_xor_si256(data, xor_p5); - __m256i const k = _mm256_loadu_si256 (xkey+i); - __m256i const dk = _mm256_mul_epu32 (data,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + { __m256i const k = _mm256_loadu_si256 (xkey+i); + __m256i const dk = _mm256_mul_epu32 (data,k); /* U32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ - __m256i const d2 = _mm256_shuffle_epi32 (data,0x31); 
- __m256i const k2 = _mm256_shuffle_epi32 (k,0x31); - __m256i const dk2 = _mm256_mul_epu32 (d2,k2); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + __m256i const d2 = _mm256_shuffle_epi32 (data,0x31); + __m256i const k2 = _mm256_shuffle_epi32 (k,0x31); + __m256i const dk2 = _mm256_mul_epu32 (d2,k2); /* U32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ - xacc[i] = _mm256_xor_si256(dk, dk2); + xacc[i] = _mm256_xor_si256(dk, dk2); + } } } #elif (XXH_VECTOR == XXH_SSE2) - __m128i const xor_p5 = _mm_set1_epi64((__m64)PRIME64_5); - assert(((size_t)acc) & 15 == 0); - __m128i* const xacc = (__m128i*) acc; - const __m128i* const xkey = (const __m128i *) key; + { __m128i* const xacc = (__m128i*) acc; + const __m128i* const xkey = (const __m128i *) key; + __m128i const xor_p5 = _mm_set1_epi64((__m64)PRIME64_5); - for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { - __m128i data = xacc[i]; - __m128i const shifted = _mm_srli_epi64(data, 47); - data = _mm_xor_si128(data, shifted); - data = _mm_xor_si128(data, xor_p5); + for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + __m128i data = xacc[i]; + __m128i const shifted = _mm_srli_epi64(data, 47); + data = _mm_xor_si128(data, shifted); + data = _mm_xor_si128(data, xor_p5); - __m128i const k = _mm_loadu_si128 (xkey+i); - __m128i const dk = _mm_mul_epu32 (data,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + { __m128i const k = _mm_loadu_si128 (xkey+i); + __m128i const dk = _mm_mul_epu32 (data,k); /* U32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ - __m128i const d2 = _mm_shuffle_epi32 (data,0x31); - __m128i const k2 = _mm_shuffle_epi32 (k,0x31); - __m128i const dk2 = _mm_mul_epu32 (d2,k2); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + __m128i const d2 = _mm_shuffle_epi32 (data,0x31); + __m128i const k2 = _mm_shuffle_epi32 (k,0x31); + __m128i const dk2 = _mm_mul_epu32 (d2,k2); /* U32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ - xacc[i] = _mm_xor_si128(dk, dk2); + xacc[i] = _mm_xor_si128(dk, dk2); + } } } #else /* 
scalar variant */ - U64* const xacc = (U64*) acc + U64* const xacc = (U64*) acc; const U32* const xkey = (const U32*) key; int i; @@ -346,7 +355,7 @@ static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* rest } -__attribute__((noinline)) static U64 // it seems better for XXH3_64b that hashLong is not inlined : may mess up the switch case ? +__attribute__((noinline)) static U64 /* It seems better for XXH3_64b to have hashLong not inlined : may mess up the switch case ? */ XXH3_hashLong(const void* data, size_t len) { ALIGN(64) U64 acc[ACC_NB] = { len, PRIME64_1, PRIME64_2, PRIME64_3, -len }; @@ -363,24 +372,24 @@ XXH3_hashLong(const void* data, size_t len) /* last partial block */ assert(len > STRIPE_LEN); - size_t const nbStripes = (len % block_len) / STRIPE_LEN; - assert(nbStripes < NB_KEYS); - XXH3_accumulate(acc, (const BYTE*)data + nb_blocks*block_len, kKey, nbStripes); - - /* last stripe */ - if (len & (STRIPE_LEN - 1)) { - const BYTE* const p = (const BYTE*) data + len - STRIPE_LEN; - XXH3_accumulate_512(acc, p, kKey + nbStripes*2); - } + { size_t const nbStripes = (len % block_len) / STRIPE_LEN; + assert(nbStripes < NB_KEYS); + XXH3_accumulate(acc, (const BYTE*)data + nb_blocks*block_len, kKey, nbStripes); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const BYTE* const p = (const BYTE*) data + len - STRIPE_LEN; + XXH3_accumulate_512(acc, p, kKey + nbStripes*2); + } } /* converge into final hash */ return XXH3_finalMerge_8u64(acc[0], acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7], PRIME64_2); } -// ========================================== -// Public prototype -// ========================================== +/* ========================================== + * Public prototype + * ========================================== */ XXH_PUBLIC_API U64 XXH3_64b(const void* data, size_t len) { @@ -397,4 +406,6 @@ XXH_PUBLIC_API U64 XXH3_64b(const void* data, size_t len) return XXH3_hashLong(data, len); } + + #endif /* XXH3_H */ 
diff --git a/xxhash.c b/xxhash.c index 5b678313..9e598c5e 100644 --- a/xxhash.c +++ b/xxhash.c @@ -462,12 +462,12 @@ XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_end { const BYTE* p = (const BYTE*)input; const BYTE* const bEnd = p + len; - state->total_len_32 += (unsigned)len; - state->large_len |= (len>=16) | (state->total_len_32>=16); + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); if (state->memsize + len < 16) { /* fill in tmp buffer */ XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); - state->memsize += (unsigned)len; + state->memsize += (XXH32_hash_t)len; return XXH_OK; } diff --git a/xxhash.h b/xxhash.h index 84942e8e..56b6aa55 100644 --- a/xxhash.h +++ b/xxhash.h @@ -159,7 +159,7 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; ***************************************/ #define XXH_VERSION_MAJOR 0 #define XXH_VERSION_MINOR 6 -#define XXH_VERSION_RELEASE 5 +#define XXH_VERSION_RELEASE 6 #define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) XXH_PUBLIC_API unsigned XXH_versionNumber (void); @@ -247,6 +247,16 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); typedef struct { unsigned char digest[8]; } XXH64_canonical_t; XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + + +/*-********************************************************************** +* XXH3 +* New experimental hash +************************************************************************/ + +XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len); + + #endif /* XXH_NO_LONG_LONG */ @@ -289,33 +299,33 @@ struct XXH64_state_s { uint64_t v4; uint64_t mem64[4]; uint32_t memsize; - uint32_t reserved[2]; /* never read nor write, might be removed in a future version */ + uint32_t 
reserved[2]; /* never read nor write, might be removed in a future version */ }; /* typedef'd to XXH64_state_t */ # else struct XXH32_state_s { - unsigned total_len_32; - unsigned large_len; - unsigned v1; - unsigned v2; - unsigned v3; - unsigned v4; - unsigned mem32[4]; - unsigned memsize; - unsigned reserved; /* never read nor write, might be removed in a future version */ + XXH32_hash_t total_len_32; + XXH32_hash_t large_len; + XXH32_hash_t v1; + XXH32_hash_t v2; + XXH32_hash_t v3; + XXH32_hash_t v4; + XXH32_hash_t mem32[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ }; /* typedef'd to XXH32_state_t */ # ifndef XXH_NO_LONG_LONG /* remove 64-bit support */ struct XXH64_state_s { - unsigned long long total_len; - unsigned long long v1; - unsigned long long v2; - unsigned long long v3; - unsigned long long v4; - unsigned long long mem64[4]; - unsigned memsize; - unsigned reserved[2]; /* never read nor write, might be removed in a future version */ + XXH64_hash_t total_len; + XXH64_hash_t v1; + XXH64_hash_t v2; + XXH64_hash_t v3; + XXH64_hash_t v4; + XXH64_hash_t mem64[4]; + XXH32_hash_t memsize; + XXH32_hash_t reserved[2]; /* never read nor write, might be removed in a future version */ }; /* typedef'd to XXH64_state_t */ # endif diff --git a/xxhsum.c b/xxhsum.c index da5a566d..af5e46f4 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -265,7 +265,6 @@ static U32 localXXH32(const void* buffer, size_t bufferSize, U32 seed) { return static U32 localXXH64(const void* buffer, size_t bufferSize, U32 seed) { return (U32)XXH64(buffer, bufferSize, seed); } -U64 XXH3_64b(const void* data, size_t len); static U32 localXXH3_64b(const void* buffer, size_t bufferSize, U32 seed) { (void)seed; return (U32)XXH3_64b(buffer, bufferSize); } static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, size_t bufferSize) From e0c6a9e8809b8a9f851e0f656aada697d7798c4d Mon Sep 17 00:00:00 2001 From: Yann 
Collet Date: Tue, 26 Feb 2019 15:14:05 -0800 Subject: [PATCH 22/73] fixed xxh3 namespace issue --- xxhash.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xxhash.h b/xxhash.h index 56b6aa55..1782789e 100644 --- a/xxhash.h +++ b/xxhash.h @@ -254,6 +254,10 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src * New experimental hash ************************************************************************/ +#ifdef XXH_NAMESPACE +# define XXH3_64b XXH_NAME2(XXH_NAMESPACE, XXH3_64b) +#endif + XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len); From 94bebd5b86ba76c52036376e603bbb6c103a476a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 26 Feb 2019 15:24:59 -0800 Subject: [PATCH 23/73] xxh3: more c90 compatibility --- xxh3.h | 35 ++++++++++++++++++++++++++--------- xxhsum.c | 6 +++--- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/xxh3.h b/xxh3.h index c8d70880..8239677a 100644 --- a/xxh3.h +++ b/xxh3.h @@ -2,6 +2,8 @@ #define XXH3_H +/* === Dependencies === */ + #undef XXH_INLINE_ALL /* in case it's already defined */ #define XXH_INLINE_ALL #include "xxhash.h" @@ -10,6 +12,14 @@ #include +/* === Compiler versions === */ + +#if !(defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) /* C99+ */ +# define restrict /* disable */ +#endif + + + /* ========================================== * Vectorization detection * ========================================== */ @@ -28,6 +38,7 @@ #endif + /* ========================================== * Short keys * ========================================== */ @@ -69,7 +80,7 @@ static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, } -static inline U64 XXH3_len_1to3_64b(const void* data, size_t len) +XXH_FORCE_INLINE U64 XXH3_len_1to3_64b(const void* data, size_t len) { assert(data != NULL); assert(len > 0 && len <= 3); @@ -84,7 +95,7 @@ static inline U64 XXH3_len_1to3_64b(const void* data, size_t len) } -static inline U64 XXH3_len_4to8_64b(const 
void* data, size_t len) +XXH_FORCE_INLINE U64 XXH3_len_4to8_64b(const void* data, size_t len) { assert(data != NULL); assert(len >= 4 && len <= 8); @@ -95,7 +106,7 @@ static inline U64 XXH3_len_4to8_64b(const void* data, size_t len) } } -static inline U64 XXH3_len_9to16_64b(const void* data, size_t len) +XXH_FORCE_INLINE U64 XXH3_len_9to16_64b(const void* data, size_t len) { assert(data != NULL); assert(len >= 9 && len <= 16); @@ -108,7 +119,7 @@ static inline U64 XXH3_len_9to16_64b(const void* data, size_t len) } } -static inline U64 XXH3_len_1to16_64b(const void* data, size_t len) +XXH_FORCE_INLINE U64 XXH3_len_1to16_64b(const void* data, size_t len) { assert(data != NULL); assert(len > 0 && len <= 16); @@ -187,6 +198,7 @@ static U64 XXH3_len_97to128_64b(const void* data, size_t len) } + /* ========================================== * Long keys * ========================================== */ @@ -224,7 +236,7 @@ ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = { #define ACC_NB (STRIPE_LEN / sizeof(U64)) -inline static void +XXH_FORCE_INLINE void XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict key) { #if (XXH_VECTOR == XXH_AVX2) @@ -250,7 +262,8 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k const __m128i* const xdata = (const __m128i *) data; ALIGN(16) const __m128i* const xkey = (const __m128i *) key; - for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { __m128i const d = _mm_loadu_si128 (xdata+i); __m128i const k = _mm_loadu_si128 (xkey+i); __m128i const dk = _mm_add_epi32 (d,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ @@ -309,7 +322,8 @@ static void XXH3_scrambleAcc(void* acc, const void* key) const __m128i* const xkey = (const __m128i *) key; __m128i const xor_p5 = _mm_set1_epi64((__m64)PRIME64_5); - for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); 
i++) { __m128i data = xacc[i]; __m128i const shifted = _mm_srli_epi64(data, 47); data = _mm_xor_si128(data, shifted); @@ -348,7 +362,8 @@ static void XXH3_scrambleAcc(void* acc, const void* key) static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* restrict key, size_t nbStripes) { - for (size_t n = 0; n < nbStripes; n++ ) { + size_t n; + for (n = 0; n < nbStripes; n++ ) { XXH3_accumulate_512(acc, (const BYTE*)data + n*STRIPE_LEN, key); key += 2; } @@ -365,7 +380,8 @@ XXH3_hashLong(const void* data, size_t len) size_t const block_len = STRIPE_LEN * NB_KEYS; size_t const nb_blocks = len / block_len; - for (size_t n = 0; n < nb_blocks; n++) { + size_t n; + for (n = 0; n < nb_blocks; n++) { XXH3_accumulate(acc, (const BYTE*)data + n*block_len, kKey, NB_KEYS); XXH3_scrambleAcc(acc, kKey + (KEYSET_DEFAULT_SIZE - STRIPE_ELTS)); } @@ -387,6 +403,7 @@ XXH3_hashLong(const void* data, size_t len) } + /* ========================================== * Public prototype * ========================================== */ diff --git a/xxhsum.c b/xxhsum.c index af5e46f4..55ef4301 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -1239,17 +1239,17 @@ static int readU32FromCharChecked(const char** stringPtr, unsigned* value) static unsigned const max = (((unsigned)(-1)) / 10) - 1; unsigned result = 0; while ((**stringPtr >='0') && (**stringPtr <='9')) { - if (result > max) return 1; // overflow error + if (result > max) return 1; /* overflow error */ result *= 10; result += (unsigned)(**stringPtr - '0'); (*stringPtr)++ ; } if ((**stringPtr=='K') || (**stringPtr=='M')) { unsigned const maxK = ((unsigned)(-1)) >> 10; - if (result > maxK) return 1; // overflow error + if (result > maxK) return 1; /* overflow error */ result <<= 10; if (**stringPtr=='M') { - if (result > maxK) return 1; // overflow error + if (result > maxK) return 1; /* overflow error */ result <<= 10; } (*stringPtr)++; /* skip `K` or `M` */ From 7784d41ce3345fc2e40e81991651555e62d1167e Mon Sep 17 00:00:00 2001 
From: Yann Collet Date: Tue, 26 Feb 2019 16:36:03 -0800 Subject: [PATCH 24/73] fixed ARM compilation error --- xxh3.h | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/xxh3.h b/xxh3.h index 8239677a..8133dde3 100644 --- a/xxh3.h +++ b/xxh3.h @@ -18,6 +18,18 @@ # define restrict /* disable */ #endif +#if defined(__GNUC__) +# if defined(__SSE2__) +# include +# endif +# define ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# include +# define ALIGN(n) __declspec(align(n)) +#else +# define ALIGN(n) // disabled +#endif + /* ========================================== @@ -203,16 +215,6 @@ static U64 XXH3_len_97to128_64b(const void* data, size_t len) * Long keys * ========================================== */ -#if __GNUC__ -#include -#define ALIGN(n) __attribute__ ((aligned(n))) -#elif _MSC_VER -#include -#define ALIGN(n) __declspec(align(n)) -#else -#define ALIGN(n) -#endif - #define STRIPE_LEN 64 #define STRIPE_ELTS (STRIPE_LEN / sizeof(U32)) #define KEYSET_DEFAULT_SIZE 48 /* minimum 32 */ @@ -408,7 +410,7 @@ XXH3_hashLong(const void* data, size_t len) * Public prototype * ========================================== */ -XXH_PUBLIC_API U64 XXH3_64b(const void* data, size_t len) +XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len) { switch ((len-1) / 16) { /* intentional underflow */ case 0: return XXH3_len_1to16_64b(data, len); From 2be95459cde521061573f7bd5b6df60d5b678769 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 26 Feb 2019 16:42:50 -0800 Subject: [PATCH 25/73] fixed minor c90 warning --- xxh3.h | 8 ++++---- xxhsum.c | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/xxh3.h b/xxh3.h index 8133dde3..52fcef4b 100644 --- a/xxh3.h +++ b/xxh3.h @@ -354,10 +354,10 @@ static void XXH3_scrambleAcc(void* acc, const void* key) xacc[i] ^= xacc[i] >> 47; xacc[i] ^= PRIME64_5; - U64 p1 = (xacc[i] >> 32) * xkey[left]; - U64 p2 = (xacc[i] & 0xFFFFFFFF) * xkey[right]; - xacc[i] = 
p1 ^ p2; - } + { U64 p1 = (xacc[i] >> 32) * xkey[left]; + U64 p2 = (xacc[i] & 0xFFFFFFFF) * xkey[right]; + xacc[i] = p1 ^ p2; + } } #endif } diff --git a/xxhsum.c b/xxhsum.c index 55ef4301..657cf783 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -1266,9 +1266,11 @@ static int readU32FromCharChecked(const char** stringPtr, unsigned* value) * Will also modify `*stringPtr`, advancing it to position where it stopped reading. * Note : function will exit() program if digit sequence overflows */ static unsigned readU32FromChar(const char** stringPtr) { - static const char errorMsg[] = "error: numeric value too large"; unsigned result; - if (readU32FromCharChecked(stringPtr, &result)) { errorOut(errorMsg); } + if (readU32FromCharChecked(stringPtr, &result)) { + static const char errorMsg[] = "error: numeric value too large"; + errorOut(errorMsg); + } return result; } From c6c39030fba6acbb737b2622bdbae9912ad1cc7a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 26 Feb 2019 16:49:23 -0800 Subject: [PATCH 26/73] ensure warnings are blocking during tests added -Werror flag to target test-all --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 4c426ab0..99996db6 100644 --- a/Makefile +++ b/Makefile @@ -226,6 +226,7 @@ preview-man: clean-man man test: all namespaceTest check test-xxhsum-c c90test +test-all: CFLAGS += -Werror test-all: test test32 armtest clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze cppcheck .PHONY: listL120 From 5b827f538c5903c8200b22b098b90d95fe7c4c33 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 26 Feb 2019 18:38:20 -0800 Subject: [PATCH 27/73] improved 8-ways mixer --- xxh3.h | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/xxh3.h b/xxh3.h index 52fcef4b..daebdbc9 100644 --- a/xxh3.h +++ b/xxh3.h @@ -27,7 +27,7 @@ # include # define ALIGN(n) __declspec(align(n)) #else -# define ALIGN(n) // disabled +# define ALIGN(n) /* disabled */ #endif @@ -79,16 
+79,14 @@ static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, U64 mul) { U64 const ll11 = XXH_rotl64(ll1 + ll7, 21) + (XXH_rotl64(ll2, 34) + ll3) * 9; - U64 const ll12 = ((ll1 + ll2) ^ ll4) + ll6 + 1; + U64 const ll12 = XXH_rotl64(((ll1 + ll2) ^ ll4), 17) + ll6 + 1; U64 const ll13 = XXH_rotl64(ll5 + ll6, 22) + ll3; - U64 const ll14 = ll5 + XXH_rotl64(ll8, 11) + ll3; + U64 const ll14 = ll5 + XXH_rotl64(ll8, 23) + ll7; - U64 const ll21 = XXH_swap64((ll11 + ll12) * mul) + ll8; - U64 const ll31 = (XXH_swap64((ll12 + ll21) * mul) + ll7) * mul; - U64 const ll41 = XXH_swap64((ll13 + ll14) * mul + ll31) + ll2; - U64 const ll51 = XXH3_mixHigh((ll14 + ll41) * mul + ll4 + ll8) * mul; + U64 const ll21 = (XXH_swap64((ll11 + ll12) * mul) + ll13) * mul + ll8; + U64 const ll22 = (XXH_swap64((ll12 + ll14) * mul) + ll4) * mul; - return ll51 + ll13; + return XXH3_finalMerge_2u64(ll21, ll22, mul); } @@ -124,10 +122,10 @@ XXH_FORCE_INLINE U64 XXH3_len_9to16_64b(const void* data, size_t len) assert(len >= 9 && len <= 16); { U64 const ll1 = XXH_read64(data) + PRIME64_1; U64 const ll2 = XXH_read64((const BYTE*)data + len - 8); - U64 const mul = PRIME64_2 + len * 2; /* keep it odd */ - U64 const llcomb3 = ll1 * mul + XXH_rotl64(ll2, 23); - U64 const llcomb4 = ll2 * mul + XXH_rotl64(ll1, 37); - return XXH3_finalMerge_2u64(llcomb3, llcomb4, mul); + U64 const mul = PRIME64_2 + (len * 2); /* keep it odd */ + U64 const ll11 = (ll1 * mul) + XXH_rotl64(ll2, 23); + U64 const ll12 = (ll2 * mul) + XXH_rotl64(ll1, 37); + return XXH3_finalMerge_2u64(ll11, ll12, mul); } } @@ -407,7 +405,7 @@ XXH3_hashLong(const void* data, size_t len) /* ========================================== - * Public prototype + * Public entry point * ========================================== */ XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len) From fa31d0b02f492a80cf9d972fde7464f19b6230a1 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 27 Feb 2019 15:03:23 -0800 Subject: [PATCH 28/73] 
xxh3: fixed last minor quality metric in extended tests --- xxh3.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/xxh3.h b/xxh3.h index daebdbc9..826f4c2f 100644 --- a/xxh3.h +++ b/xxh3.h @@ -69,7 +69,7 @@ static U64 XXH3_finalMerge_2u64(U64 ll1, U64 ll2, U64 mul) static U64 XXH3_finalMerge_4u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, U64 mul) { U64 const ll11 = XXH_rotl64(ll1 + ll2, 43) + XXH_rotl64(ll3, 30) + ll4; - U64 const ll12 = ll1 + XXH_rotl64(ll2 + PRIME64_3, 18) + ll3; + U64 const ll12 = ll1 + XXH_rotl64(ll2, 18) + ll3 + PRIME64_3; return XXH3_finalMerge_2u64(ll11, ll12, mul); } @@ -79,12 +79,12 @@ static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, U64 mul) { U64 const ll11 = XXH_rotl64(ll1 + ll7, 21) + (XXH_rotl64(ll2, 34) + ll3) * 9; - U64 const ll12 = XXH_rotl64(((ll1 + ll2) ^ ll4), 17) + ll6 + 1; - U64 const ll13 = XXH_rotl64(ll5 + ll6, 22) + ll3; - U64 const ll14 = ll5 + XXH_rotl64(ll8, 23) + ll7; + U64 const ll12 = XXH_rotl64(((ll1 + ll2) ^ ll4), 17) + ll6 + PRIME64_5; + U64 const ll13 = XXH_rotl64(ll5 * PRIME64_4 + ll6, 46) + ll3; + U64 const ll14 = XXH_rotl64(ll8, 23) + XXH_rotl64(ll5 + ll7, 12); - U64 const ll21 = (XXH_swap64((ll11 + ll12) * mul) + ll13) * mul + ll8; - U64 const ll22 = (XXH_swap64((ll12 + ll14) * mul) + ll4) * mul; + U64 const ll21 = (XXH_swap64((ll11 + ll12) * PRIME64_1) + ll13) * PRIME64_3 + ll8; + U64 const ll22 = (XXH_swap64((ll12 + ll14) * PRIME64_2) + ll4) * mul; return XXH3_finalMerge_2u64(ll21, ll22, mul); } @@ -373,7 +373,7 @@ static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* rest __attribute__((noinline)) static U64 /* It seems better for XXH3_64b to have hashLong not inlined : may mess up the switch case ? 
*/ XXH3_hashLong(const void* data, size_t len) { - ALIGN(64) U64 acc[ACC_NB] = { len, PRIME64_1, PRIME64_2, PRIME64_3, -len }; + ALIGN(64) U64 acc[ACC_NB] = { 0, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5 }; #define NB_KEYS ((KEYSET_DEFAULT_SIZE - STRIPE_ELTS) / 2) @@ -399,7 +399,7 @@ XXH3_hashLong(const void* data, size_t len) } } /* converge into final hash */ - return XXH3_finalMerge_8u64(acc[0], acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7], PRIME64_2); + return XXH3_finalMerge_8u64(acc[0] + len, acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7] - len, PRIME64_2 + len*2); } From cd626c344b2fc571825e2b08dc974226b920dded Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 27 Feb 2019 16:05:20 -0800 Subject: [PATCH 29/73] Makefile : switch default optimization to -O3 because gcc is pretty bad at vectorization with -O2. Also : documented the clang problem with XXH32 auto-vectorization which must be prevented for better performance. --- Makefile | 2 +- xxhash.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 99996db6..25a5dfdc 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,7 @@ else NOSSE4 := endif -CFLAGS ?= -O2 $(NOSSE4) # disables potential auto-vectorization +CFLAGS ?= -O3 $(NOSSE4) # disables potential auto-vectorization DEBUGFLAGS+=-Wall -Wextra -Wconversion -Wcast-qual -Wcast-align -Wshadow \ -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \ diff --git a/xxhash.c b/xxhash.c index 9e598c5e..02f5cd53 100644 --- a/xxhash.c +++ b/xxhash.c @@ -348,7 +348,6 @@ XXH32_finalize(U32 h32, const void* ptr, size_t len, return h32; /* reaching this point is deemed impossible */ } - XXH_FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) @@ -371,6 +370,17 @@ XXH32_endian_align(const void* input, size_t len, U32 seed, U32 v3 = seed + 0; U32 
v4 = seed - PRIME32_1; + /* note : clang will try to vectorize this loop, using pmulld instruction. + * This is a bad idea, and will result in substantial performance reduction. + * To prevent clang from "optimizing" this loop, + * it's necessary to disable SSE4 on command line (-mno-sse4). + * However, this is a build instruction, so it's outside of source code. + * Whenever xxhash.c is used in a different code base, build flags don't follow. + * It would be better to ensure vectorization is disabled from within the source code. + * Alas, so far, I've not found a working method. + * I tried both `#pragma` and `__attribute__`, but clang still vectorizes. + * Help welcomed. + * In the meantime, vectorization is prevented by the `Makefile` */ do { v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4; v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4; From b348fa896a6a5fc30b67fa0ba1b2f184ea25d742 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 28 Feb 2019 16:43:44 -0800 Subject: [PATCH 30/73] restored 8-way mixer --- xxh3.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/xxh3.h b/xxh3.h index 826f4c2f..5e9f11ae 100644 --- a/xxh3.h +++ b/xxh3.h @@ -79,14 +79,16 @@ static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, U64 mul) { U64 const ll11 = XXH_rotl64(ll1 + ll7, 21) + (XXH_rotl64(ll2, 34) + ll3) * 9; - U64 const ll12 = XXH_rotl64(((ll1 + ll2) ^ ll4), 17) + ll6 + PRIME64_5; - U64 const ll13 = XXH_rotl64(ll5 * PRIME64_4 + ll6, 46) + ll3; - U64 const ll14 = XXH_rotl64(ll8, 23) + XXH_rotl64(ll5 + ll7, 12); + U64 const ll12 = ((ll1 + ll2) ^ ll4) + ll6 + 1; + U64 const ll13 = XXH_rotl64(ll5 + ll6, 22) + ll3; + U64 const ll14 = ll5 + XXH_rotl64(ll8, 11) + ll3; - U64 const ll21 = (XXH_swap64((ll11 + ll12) * PRIME64_1) + ll13) * PRIME64_3 + ll8; - U64 const ll22 = (XXH_swap64((ll12 + ll14) * PRIME64_2) + ll4) * mul; + U64 const ll21 = XXH_swap64((ll11 + ll12) * mul) + ll8; + U64 const ll31 = (XXH_swap64((ll12 + ll21) * mul) + ll7) * 
mul; + U64 const ll41 = XXH_swap64((ll13 + ll14) * mul + ll31) + ll2; + U64 const ll51 = XXH3_mixHigh((ll14 + ll41) * mul + ll4 + ll8) * mul; - return XXH3_finalMerge_2u64(ll21, ll22, mul); + return ll51 + ll13; } From 8d345470e6ef8d6ee80c91305a3f463021a27582 Mon Sep 17 00:00:00 2001 From: "easyaspi314 (Devin)" Date: Thu, 28 Feb 2019 20:28:29 -0500 Subject: [PATCH 31/73] xxh3: add NEON support Signed-off-by: easyaspi314 (Devin) --- xxh3.h | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/xxh3.h b/xxh3.h index 826f4c2f..28fab7ab 100644 --- a/xxh3.h +++ b/xxh3.h @@ -21,6 +21,10 @@ #if defined(__GNUC__) # if defined(__SSE2__) # include +# elif defined(__ARM_NEON__) || defined(__ARM_NEON) +# define inline __inline__ /* clang bug */ +# include +# undef inline # endif # define ALIGN(n) __attribute__ ((aligned(n))) #elif defined(_MSC_VER) @@ -38,12 +42,16 @@ #define XXH_SCALAR 0 #define XXH_SSE2 1 #define XXH_AVX2 2 +#define XXH_NEON 3 #ifndef XXH_VECTOR /* can be defined on command line */ # if defined(__AVX2__) # define XXH_VECTOR XXH_AVX2 # elif defined(__SSE2__) # define XXH_VECTOR XXH_SSE2 +/* msvc support maybe later */ +# elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__ARM_NEON)) +# define XXH_VECTOR XXH_NEON # else # define XXH_VECTOR XXH_SCALAR # endif @@ -272,6 +280,59 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k } } +#elif (XXH_VECTOR == XXH_NEON) + + assert(((size_t)acc) & 15 == 0); + { uint64x2_t* const xacc = (uint64x2_t *)acc; + const uint32_t* const xdata = (const uint32_t *)data; + ALIGN(16) const uint32_t* const xkey = (const uint32_t *)key; + + size_t i; + for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { +#if !defined(__aarch64__) && !defined(__arm64__) && !defined(XXH_NO_ARM32_HACK) + /* On 32-bit ARM, we can take advantage of the packed registers. + * This is not portable to aarch64! 
+ * Basically, on 32-bit NEON, registers are stored like so: + * .----------------------------------. + * | q8 | // uint32x4_t + * |-----------------.----------------| + * | d16 (.val[0]) | d17 (.val[1]) | // uint32x2x2_t + * '-----------------'----------------' + * vld2.32 will store its values into two double registers, returning + * a uint32x2_t. In NEON, this will be stored in, for example, d16 and d17. + * Reinterpret cast it to a uint32x4_t and you get q8 for free + * + * On aarch64, this was changed completely. + * + * aarch64 gave us 16 more quad registers, but they also removed this behavior, + * instead matching smaller registers to the lower sections of the higher + * registers and zeroing the rest. + * .----------------------------------..---------------------------------. + * | v8.4s | v9.4s | + * |-----------------.----------------|-----------------.-----------------| + * | v8.2s (.val[0]) | | v9.2s (.val[1]) | | + * '-----------------'----------------'-----------------'-----------------' + * On aarch64, ld2 will put it into v8.2s and v9.2s. Reinterpreting + * is not going to help us here, as half of it will end up being zero. */ + + uint32x2x2_t d = vld2_u32(xdata + i * 4); /* load and swap */ + uint32x2x2_t k = vld2_u32(xkey + i * 4); + /* Not sorry about breaking the strict aliasing rule. + * Using a union causes GCC to spit out nonsense, but an alias cast + * does not. 
*/ + uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k); + xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk)); +#else + /* Portable, but slightly slower version */ + uint32x2x2_t const d = vld2_u32(xdata + i * 4); + uint32x2x2_t const k = vld2_u32(xkey + i * 4); + uint32x2_t const dkL = vadd_u32(d.val[0], k.val[0]); + uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + /* xacc must be aligned on 16 bytes boundaries */ + xacc[i] = vmlal_u32(xacc[i], dkL, dkH); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ +#endif + } + } #else /* scalar variant */ U64* const xacc = (U64*) acc; @@ -340,6 +401,35 @@ static void XXH3_scrambleAcc(void* acc, const void* key) } } } +#elif (XXH_VECTOR == XXH_NEON) + + assert(((size_t)acc) & 15 == 0); + { uint64x2_t* const xacc = (uint64x2_t*) acc; + const uint32_t* const xkey = (const uint32_t *) key; + uint64x2_t xor_p5 = vdupq_n_u64(PRIME64_5); + size_t i; + /* Clang and GCC like to put NEON constant loads into the loop. 
*/ + __asm__("" : "+w" (xor_p5)); + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + uint64x2_t data = xacc[i]; + uint64x2_t const shifted = vshrq_n_u64(data, 47); + data = veorq_u64(data, shifted); + data = veorq_u64(data, xor_p5); + + { + /* shuffle: 0, 1, 2, 3 -> 0, 2, 1, 3 */ + uint32x2x2_t const d = + vzip_u32( + vget_low_u32(vreinterpretq_u32_u64(data)), + vget_high_u32(vreinterpretq_u32_u64(data)) + ); + uint32x2x2_t const k = vld2_u32 (xkey+i*4); /* load and swap */ + uint64x2_t const dk = vmull_u32(d.val[0],k.val[0]); /* U64 dk[2] = {d0 * k0, d2 * k2} */ + uint64x2_t const dk2 = vmull_u32(d.val[1],k.val[1]); /* U64 dk2[2] = {d1 * k1, d3 * k3} */ + xacc[i] = veorq_u64(dk, dk2); /* xacc[i] = dk ^ dk2; */ + } } + } + #else /* scalar variant */ U64* const xacc = (U64*) acc; From d034ce8269f1bea127354685762bc0000a3ff134 Mon Sep 17 00:00:00 2001 From: "easyaspi314 (Devin)" Date: Sat, 2 Mar 2019 01:00:28 -0500 Subject: [PATCH 32/73] Automatically warn + disable NEON implementation for aarch64 GCC < 7. It generates code that runs at about 1.8 GB/s when Clang 3.8 generates code that runs at 5 GB/s on the same machine with the same C source code. --- xxh3.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/xxh3.h b/xxh3.h index e0d7c8e9..6d0c7dd3 100644 --- a/xxh3.h +++ b/xxh3.h @@ -49,6 +49,11 @@ # define XXH_VECTOR XXH_AVX2 # elif defined(__SSE2__) # define XXH_VECTOR XXH_SSE2 +/* GCC < 7 for aarch64 generates unreasonably slow code for the NEON + * implementation. We fall back to the scalar version and emit a warning. */ +# elif defined(__aarch64__) && !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 7 +# warning Your GCC version has broken NEON support. Please use GCC 7+ or Clang. 
+# define XXH_VECTOR XXH_SCALAR /* msvc support maybe later */ # elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__ARM_NEON)) # define XXH_VECTOR XXH_NEON From 8a08cbc10ce363b13ce2df4b3b430e2ea34a4660 Mon Sep 17 00:00:00 2001 From: "easyaspi314 (Devin)" Date: Sat, 2 Mar 2019 19:25:31 -0500 Subject: [PATCH 33/73] Improve aarch64 code. There is no longer need to disable NEON on GCC 6 This new code is faster and vectorizes properly on GCC 6. Apparently, aarch64 really hates shuffling. --- xxh3.h | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/xxh3.h b/xxh3.h index 6d0c7dd3..42c6e244 100644 --- a/xxh3.h +++ b/xxh3.h @@ -49,11 +49,6 @@ # define XXH_VECTOR XXH_AVX2 # elif defined(__SSE2__) # define XXH_VECTOR XXH_SSE2 -/* GCC < 7 for aarch64 generates unreasonably slow code for the NEON - * implementation. We fall back to the scalar version and emit a warning. */ -# elif defined(__aarch64__) && !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 7 -# warning Your GCC version has broken NEON support. Please use GCC 7+ or Clang. -# define XXH_VECTOR XXH_SCALAR /* msvc support maybe later */ # elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__ARM_NEON)) # define XXH_VECTOR XXH_NEON @@ -320,23 +315,26 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k * | v8.2s (.val[0]) | | v9.2s (.val[1]) | | * '-----------------'----------------'-----------------'-----------------' * On aarch64, ld2 will put it into v8.2s and v9.2s. Reinterpreting - * is not going to help us here, as half of it will end up being zero. */ + * is not going to help us here, as half of it will end up being zero. + * + * Even if it did, aarch64 apparently does really bad with shuffling, so + * we use a different method. */ uint32x2x2_t d = vld2_u32(xdata + i * 4); /* load and swap */ uint32x2x2_t k = vld2_u32(xkey + i * 4); /* Not sorry about breaking the strict aliasing rule. 
* Using a union causes GCC to spit out nonsense, but an alias cast * does not. */ - uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k); - xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk)); + uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k); /* dk = d + k */ + xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk)); /* xacc[i] += (U64)dkLo * (U64)dkHi; */ #else - /* Portable, but slightly slower version */ - uint32x2x2_t const d = vld2_u32(xdata + i * 4); - uint32x2x2_t const k = vld2_u32(xkey + i * 4); - uint32x2_t const dkL = vadd_u32(d.val[0], k.val[0]); - uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ - /* xacc must be aligned on 16 bytes boundaries */ - xacc[i] = vmlal_u32(xacc[i], dkL, dkH); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ + /* A portable and aarch64-friendly version. It is slower on ARMv7a, though. */ + uint32x4_t d = vld1q_u32(xdata + i * 4); + uint32x4_t k = vld1q_u32(xkey + i * 4); + /* Add d and k, then reinterpret to a uint64x2_t. This is not a long add. */ + uint64x2_t dk = vreinterpretq_u64_u32(vaddq_u32(d, k)); /* dk = (U64)(d[1] + k[1]) << 32) | (d[0] + k[0]); */ + /* Long multiply high and low bits. */ + xacc[i] = vmlal_u32(xacc[i], vmovn_u64(dk), vshrn_n_u64(dk, 32)); /* xacc[i] += (dk & 0xFFFFFFFF) * (dk >> 32); */ #endif } } @@ -424,6 +422,12 @@ static void XXH3_scrambleAcc(void* acc, const void* key) data = veorq_u64(data, xor_p5); { +#ifdef __aarch64__ + /* aarch64 prefers this method, ARMv7a prefers the other. 
*/ + uint64x2_t k = *(uint64x2_t *)(xkey + i * 4); + uint64x2_t const dk = vmull_u32(vmovn_u64(data), vmovn_u64(k)); + uint64x2_t const dk2 = vmull_u32(vshrn_n_u64(data, 32), vshrn_n_u64(k, 32)); +#else /* shuffle: 0, 1, 2, 3 -> 0, 2, 1, 3 */ uint32x2x2_t const d = vzip_u32( @@ -433,6 +437,7 @@ static void XXH3_scrambleAcc(void* acc, const void* key) uint32x2x2_t const k = vld2_u32 (xkey+i*4); /* load and swap */ uint64x2_t const dk = vmull_u32(d.val[0],k.val[0]); /* U64 dk[2] = {d0 * k0, d2 * k2} */ uint64x2_t const dk2 = vmull_u32(d.val[1],k.val[1]); /* U64 dk2[2] = {d1 * k1, d3 * k3} */ +#endif xacc[i] = veorq_u64(dk, dk2); /* xacc[i] = dk ^ dk2; */ } } } From 982a3ab59dbe04b8241e0ce2e8043a2e699330f3 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sat, 2 Mar 2019 18:26:20 -0800 Subject: [PATCH 34/73] Revert "Improve aarch64 performance" --- xxh3.h | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/xxh3.h b/xxh3.h index 42c6e244..e0d7c8e9 100644 --- a/xxh3.h +++ b/xxh3.h @@ -315,26 +315,23 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k * | v8.2s (.val[0]) | | v9.2s (.val[1]) | | * '-----------------'----------------'-----------------'-----------------' * On aarch64, ld2 will put it into v8.2s and v9.2s. Reinterpreting - * is not going to help us here, as half of it will end up being zero. - * - * Even if it did, aarch64 apparently does really bad with shuffling, so - * we use a different method. */ + * is not going to help us here, as half of it will end up being zero. */ uint32x2x2_t d = vld2_u32(xdata + i * 4); /* load and swap */ uint32x2x2_t k = vld2_u32(xkey + i * 4); /* Not sorry about breaking the strict aliasing rule. * Using a union causes GCC to spit out nonsense, but an alias cast * does not. 
*/ - uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k); /* dk = d + k */ - xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk)); /* xacc[i] += (U64)dkLo * (U64)dkHi; */ + uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k); + xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk)); #else - /* A portable and aarch64-friendly version. It is slower on ARMv7a, though. */ - uint32x4_t d = vld1q_u32(xdata + i * 4); - uint32x4_t k = vld1q_u32(xkey + i * 4); - /* Add d and k, then reinterpret to a uint64x2_t. This is not a long add. */ - uint64x2_t dk = vreinterpretq_u64_u32(vaddq_u32(d, k)); /* dk = (U64)(d[1] + k[1]) << 32) | (d[0] + k[0]); */ - /* Long multiply high and low bits. */ - xacc[i] = vmlal_u32(xacc[i], vmovn_u64(dk), vshrn_n_u64(dk, 32)); /* xacc[i] += (dk & 0xFFFFFFFF) * (dk >> 32); */ + /* Portable, but slightly slower version */ + uint32x2x2_t const d = vld2_u32(xdata + i * 4); + uint32x2x2_t const k = vld2_u32(xkey + i * 4); + uint32x2_t const dkL = vadd_u32(d.val[0], k.val[0]); + uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + /* xacc must be aligned on 16 bytes boundaries */ + xacc[i] = vmlal_u32(xacc[i], dkL, dkH); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ #endif } } @@ -422,12 +419,6 @@ static void XXH3_scrambleAcc(void* acc, const void* key) data = veorq_u64(data, xor_p5); { -#ifdef __aarch64__ - /* aarch64 prefers this method, ARMv7a prefers the other. 
*/ - uint64x2_t k = *(uint64x2_t *)(xkey + i * 4); - uint64x2_t const dk = vmull_u32(vmovn_u64(data), vmovn_u64(k)); - uint64x2_t const dk2 = vmull_u32(vshrn_n_u64(data, 32), vshrn_n_u64(k, 32)); -#else /* shuffle: 0, 1, 2, 3 -> 0, 2, 1, 3 */ uint32x2x2_t const d = vzip_u32( @@ -437,7 +428,6 @@ static void XXH3_scrambleAcc(void* acc, const void* key) uint32x2x2_t const k = vld2_u32 (xkey+i*4); /* load and swap */ uint64x2_t const dk = vmull_u32(d.val[0],k.val[0]); /* U64 dk[2] = {d0 * k0, d2 * k2} */ uint64x2_t const dk2 = vmull_u32(d.val[1],k.val[1]); /* U64 dk2[2] = {d1 * k1, d3 * k3} */ -#endif xacc[i] = veorq_u64(dk, dk2); /* xacc[i] = dk ^ dk2; */ } } } From 48e3d724d17877506fef39e331b80fa73b5c8679 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 6 Mar 2019 11:55:48 -0500 Subject: [PATCH 35/73] updated xxh3 --- xxh3.h | 261 ++++++++++++++++++++++++--------------------------------- 1 file changed, 110 insertions(+), 151 deletions(-) diff --git a/xxh3.h b/xxh3.h index e0d7c8e9..32846a63 100644 --- a/xxh3.h +++ b/xxh3.h @@ -59,191 +59,113 @@ + /* ========================================== - * Short keys + * XXH3 default settings * ========================================== */ -static U64 XXH3_mixHigh(U64 val) { - return val ^ (val >> 47); -} +#define KEYSET_DEFAULT_SIZE 48 /* minimum 32 */ -static U64 XXH3_finalMerge_2u64(U64 ll1, U64 ll2, U64 mul) -{ - U64 const ll11 = XXH3_mixHigh((ll1 ^ ll2) * mul); - U64 const ll21 = XXH3_mixHigh((ll2 ^ ll11) * mul); - return ll21 * mul; -} -static U64 XXH3_finalMerge_4u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, U64 mul) -{ - U64 const ll11 = XXH_rotl64(ll1 + ll2, 43) + XXH_rotl64(ll3, 30) + ll4; - U64 const ll12 = ll1 + XXH_rotl64(ll2, 18) + ll3 + PRIME64_3; +ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = { + 0xb8fe6c39,0x23a44bbe,0x7c01812c,0xf721ad1c, + 0xded46de9,0x839097db,0x7240a4a4,0xb7b3671f, + 0xcb79e64e,0xccc0e578,0x825ad07d,0xccff7221, + 0xb8084674,0xf743248e,0xe03590e6,0x813a264c, + 
0x3c2852bb,0x91c300cb,0x88d0658b,0x1b532ea3, + 0x71644897,0xa20df94e,0x3819ef46,0xa9deacd8, + 0xa8fa763f,0xe39c343f,0xf9dcbbc7,0xc70b4f1d, + 0x8a51e04b,0xcdb45931,0xc89f7ec9,0xd9787364, - return XXH3_finalMerge_2u64(ll11, ll12, mul); -} + 0xeac5ac83,0x34d3ebc3,0xc581a0ff,0xfa1363eb, + 0x170ddd51,0xb7f0da49,0xd3165526,0x29d4689e, + 0x2b16be58,0x7d47a1fc,0x8ff8b8d1,0x7ad031ce, + 0x45cb3a8f,0x95160428,0xafd7fbca,0xbb4b407e, +}; -static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, - U64 ll5, U64 ll6, U64 ll7, U64 ll8, - U64 mul) +XXH_FORCE_INLINE U64 +XXH3_mul128(U64 ll1, U64 ll2) { - U64 const ll11 = XXH_rotl64(ll1 + ll7, 21) + (XXH_rotl64(ll2, 34) + ll3) * 9; - U64 const ll12 = ((ll1 + ll2) ^ ll4) + ll6 + 1; - U64 const ll13 = XXH_rotl64(ll5 + ll6, 22) + ll3; - U64 const ll14 = ll5 + XXH_rotl64(ll8, 11) + ll3; - - U64 const ll21 = XXH_swap64((ll11 + ll12) * mul) + ll8; - U64 const ll31 = (XXH_swap64((ll12 + ll21) * mul) + ll7) * mul; - U64 const ll41 = XXH_swap64((ll13 + ll14) * mul + ll31) + ll2; - U64 const ll51 = XXH3_mixHigh((ll14 + ll41) * mul + ll4 + ll8) * mul; - - return ll51 + ll13; + __uint128_t lll = (__uint128_t)ll1 * ll2; + return (U64)lll + (lll >> 64); } +static U64 XXH64_avalanche2(U64 h64) +{ + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} -XXH_FORCE_INLINE U64 XXH3_len_1to3_64b(const void* data, size_t len) +/* ========================================== + * Short keys + * ========================================== */ +XXH_FORCE_INLINE U64 +XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr) { assert(data != NULL); assert(len > 0 && len <= 3); - { BYTE const c1 = ((const BYTE*)data)[0]; + assert(keyPtr != NULL); + { const U32* const key32 = (const U32*) keyPtr; + BYTE const c1 = ((const BYTE*)data)[0]; BYTE const c2 = ((const BYTE*)data)[len >> 1]; BYTE const c3 = ((const BYTE*)data)[len - 1]; U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); U32 const l2 = (U32)(len) + ((U32)(c3) << 2); - U64 
const ll3 = (l1 * PRIME64_2) ^ (l2 * PRIME64_1); - return XXH3_mixHigh(ll3) * PRIME64_3; + U64 const ll3 = (U64)(l1 + key32[0]) * (l2 + key32[1]); + return XXH64_avalanche2(ll3); } } -XXH_FORCE_INLINE U64 XXH3_len_4to8_64b(const void* data, size_t len) +XXH_FORCE_INLINE U64 +XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr) { assert(data != NULL); assert(len >= 4 && len <= 8); - { U64 const mul = PRIME64_2 + (len * 2); /* keep it odd */ - U64 const ll1 = XXH_read32(data); - U64 const ll2 = XXH_read32((const BYTE*)data + len - 4) + PRIME64_1; - return XXH3_finalMerge_2u64((len-1) + (ll1 << 3), ll2, mul); + { const U32* const key32 = (const U32*) keyPtr; + U64 acc = PRIME64_1 * len; + U64 const l1 = XXH_read32(data) + key32[0]; + U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1]; + acc += (U64)l1 * l2; + return XXH64_avalanche2(acc); } } -XXH_FORCE_INLINE U64 XXH3_len_9to16_64b(const void* data, size_t len) +XXH_FORCE_INLINE U64 +XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr) { assert(data != NULL); + assert(key != NULL); assert(len >= 9 && len <= 16); - { U64 const ll1 = XXH_read64(data) + PRIME64_1; - U64 const ll2 = XXH_read64((const BYTE*)data + len - 8); - U64 const mul = PRIME64_2 + (len * 2); /* keep it odd */ - U64 const ll11 = (ll1 * mul) + XXH_rotl64(ll2, 23); - U64 const ll12 = (ll2 * mul) + XXH_rotl64(ll1, 37); - return XXH3_finalMerge_2u64(ll11, ll12, mul); + { const U64* const key64 = (const U64*) keyPtr; + U64 acc = PRIME64_1 * len; + U64 const ll1 = XXH_read64(data) + key64[0]; + U64 const ll2 = XXH_read64((const BYTE*)data + len - 8) + key64[1]; + acc += XXH3_mul128(ll1, ll2); + return XXH64_avalanche2(acc); } } -XXH_FORCE_INLINE U64 XXH3_len_1to16_64b(const void* data, size_t len) +XXH_FORCE_INLINE U64 XXH3_len_0to16_64b(const void* data, size_t len) { assert(data != NULL); - assert(len > 0 && len <= 16); - { if (len > 8) return XXH3_len_9to16_64b(data, len); - if (len >= 4) return 
XXH3_len_4to8_64b(data, len); - return XXH3_len_1to3_64b(data, len); - } -} - - -static U64 XXH3_len_17to32_64b(const void* data, size_t len) -{ - assert(data != NULL); - assert(len > 16 && len <= 32); - - { const BYTE* const p = (const BYTE*)data; - - U64 const mul = PRIME64_3 + len * 2; /* keep it odd */ - U64 const ll1 = XXH_read64(p) * PRIME64_1; - U64 const ll2 = XXH_read64(p + 8); - U64 const ll3 = XXH_read64(p + len - 8) * mul; - U64 const ll4 = XXH_read64(p + len - 16) * PRIME64_2; - - return XXH3_finalMerge_4u64(ll1, ll2, ll3, ll4, mul); + assert(len <= 16); + { if (len > 8) return XXH3_len_9to16_64b(data, len, kKey); + if (len >= 4) return XXH3_len_4to8_64b(data, len, kKey); + if (len) return XXH3_len_1to3_64b(data, len, kKey); + return 0; } } -static U64 XXH3_len_33to64_64b(const void* data, size_t len) -{ - assert(data != NULL); - assert(len > 33 && len <= 64); - - { const BYTE* const p = (const BYTE*)data; - - U64 const mul = PRIME64_2 + len * 2; /* keep it odd */ - - U64 const ll1 = XXH_read64(p); - U64 const ll2 = XXH_read64(p + 8); - U64 const ll3 = XXH_read64(p + 16); - U64 const ll4 = XXH_read64(p + 24); - U64 const ll5 = XXH_read64(p + len - 32); - U64 const ll6 = XXH_read64(p + len - 24); - U64 const ll7 = XXH_read64(p + len - 16); - U64 const ll8 = XXH_read64(p + len - 8); - - return XXH3_finalMerge_8u64(ll1, ll2, ll3, ll4, ll5, ll6, ll7, ll8, mul); - } -} - - -static U64 XXH3_len_65to96_64b(const void* data, size_t len) -{ - assert(data != NULL); - assert(len > 64 && len <= 96); - - { const BYTE* const p = (const BYTE*)data; - - U64 const ll1 = XXH3_len_33to64_64b(data, 64); - U64 const ll2 = XXH3_len_17to32_64b(p + len - 32, 32); - return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len); - } -} - -static U64 XXH3_len_97to128_64b(const void* data, size_t len) -{ - assert(data != NULL); - assert(len > 96 && len <= 128); - - { const BYTE* const p = (const BYTE*)data; - - U64 const ll1 = XXH3_len_33to64_64b(data, 64); - U64 const ll2 = 
XXH3_len_33to64_64b(p + 64, len - 64); - return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len); - } -} - - - /* ========================================== * Long keys * ========================================== */ #define STRIPE_LEN 64 #define STRIPE_ELTS (STRIPE_LEN / sizeof(U32)) -#define KEYSET_DEFAULT_SIZE 48 /* minimum 32 */ - - -ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = { - 0xb8fe6c39,0x23a44bbe,0x7c01812c,0xf721ad1c, - 0xded46de9,0x839097db,0x7240a4a4,0xb7b3671f, - 0xcb79e64e,0xccc0e578,0x825ad07d,0xccff7221, - 0xb8084674,0xf743248e,0xe03590e6,0x813a264c, - 0x3c2852bb,0x91c300cb,0x88d0658b,0x1b532ea3, - 0x71644897,0xa20df94e,0x3819ef46,0xa9deacd8, - 0xa8fa763f,0xe39c343f,0xf9dcbbc7,0xc70b4f1d, - 0x8a51e04b,0xcdb45931,0xc89f7ec9,0xd9787364, - - 0xeac5ac83,0x34d3ebc3,0xc581a0ff,0xfa1363eb, - 0x170ddd51,0xb7f0da49,0xd3165526,0x29d4689e, - 0x2b16be58,0x7d47a1fc,0x8ff8b8d1,0x7ad031ce, - 0x45cb3a8f,0x95160428,0xafd7fbca,0xbb4b407e, -}; - #define ACC_NB (STRIPE_LEN / sizeof(U64)) XXH_FORCE_INLINE void @@ -461,8 +383,25 @@ static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* rest } } +XXH_FORCE_INLINE U64 XXH3_mix16B(const void* data, const U64* key) +{ + return XXH3_mul128((XXH_read64(data) ^ key[0]), XXH_read64((const BYTE*)data+8) ^ key[1]); +} + +static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 len) +{ + const U64* const key = (const U64*)keyVoid; /* presumed aligned */ + + U64 acc = PRIME64_1 * len; + acc += XXH3_mix16B(data+0, key+0); + acc += XXH3_mix16B(data+2, key+2); + acc += XXH3_mix16B(data+4, key+4); + acc += XXH3_mix16B(data+6, key+6); + + return XXH64_avalanche2(acc); +} -__attribute__((noinline)) static U64 /* It seems better for XXH3_64b to have hashLong not inlined : may mess up the switch case ? */ +__attribute__((noinline)) static U64 /* It's important for performance that XXH3_hashLong is not inlined. 
Not sure why (uop cache maybe ?), but difference is large and easily measurable */ XXH3_hashLong(const void* data, size_t len) { ALIGN(64) U64 acc[ACC_NB] = { 0, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5 }; @@ -491,7 +430,9 @@ XXH3_hashLong(const void* data, size_t len) } } /* converge into final hash */ - return XXH3_finalMerge_8u64(acc[0] + len, acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7] - len, PRIME64_2 + len*2); + //return XXH3_finalMerge_8u64(acc[0] + len, acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7] - len, PRIME64_2 + len*2); + assert(sizeof(acc) == 64); + return XXH3_merge64B(acc, kKey, len); } @@ -502,17 +443,35 @@ XXH3_hashLong(const void* data, size_t len) XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len) { - switch ((len-1) / 16) { /* intentional underflow */ - case 0: return XXH3_len_1to16_64b(data, len); - case 1: return XXH3_len_17to32_64b(data, len); - case 2: - case 3: return XXH3_len_33to64_64b(data, len); /* 33-64 */ - default:; + const BYTE* const p = (const BYTE*)data; + const U64* const key = (const U64*)(const void*)kKey; + + if (len <= 16) return XXH3_len_0to16_64b(data, len); + + { U64 acc = PRIME64_1 * len; + if (len > 32) { + if (len > 64) { + if (len > 96) { + if (len > 128) return XXH3_hashLong(data, len); + + acc += XXH3_mix16B(p+48, key+12); + acc += XXH3_mix16B(p+len-64, key+14); + } + + acc += XXH3_mix16B(p+32, key+8); + acc += XXH3_mix16B(p+len-48, key+10); + } + + acc += XXH3_mix16B(p+16, key+4); + acc += XXH3_mix16B(p+len-32, key+6); + + } + + acc += XXH3_mix16B(p+0, key+0); + acc += XXH3_mix16B(p+len-16, key+2); + + return XXH64_avalanche2(acc); } - if (len==0) return 0; - if (len <= 96) return XXH3_len_65to96_64b(data, len); - if (len <= 128) return XXH3_len_97to128_64b(data, len); - return XXH3_hashLong(data, len); } From 8d96de3e1ca60c0d29243a9118d8f30ec09fb276 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 6 Mar 2019 17:46:42 -0500 Subject: [PATCH 36/73] added variant 
with seed --- xxh3.h | 55 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/xxh3.h b/xxh3.h index 32846a63..e81a525b 100644 --- a/xxh3.h +++ b/xxh3.h @@ -87,10 +87,10 @@ XXH_FORCE_INLINE U64 XXH3_mul128(U64 ll1, U64 ll2) { __uint128_t lll = (__uint128_t)ll1 * ll2; - return (U64)lll + (lll >> 64); + return (U64)lll + (U64)(lll >> 64); } -static U64 XXH64_avalanche2(U64 h64) +static XXH64_hash_t XXH64_avalanche2(U64 h64) { h64 ^= h64 >> 29; h64 *= PRIME64_3; @@ -98,11 +98,12 @@ static U64 XXH64_avalanche2(U64 h64) return h64; } + /* ========================================== * Short keys * ========================================== */ -XXH_FORCE_INLINE U64 -XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr) +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed) { assert(data != NULL); assert(len > 0 && len <= 3); @@ -113,19 +114,19 @@ XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr) BYTE const c3 = ((const BYTE*)data)[len - 1]; U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); U32 const l2 = (U32)(len) + ((U32)(c3) << 2); - U64 const ll3 = (U64)(l1 + key32[0]) * (l2 + key32[1]); + U64 const ll3 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]); return XXH64_avalanche2(ll3); } } -XXH_FORCE_INLINE U64 -XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr) +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed) { assert(data != NULL); assert(len >= 4 && len <= 8); { const U32* const key32 = (const U32*) keyPtr; - U64 acc = PRIME64_1 * len; + U64 acc = PRIME64_1 * (len + seed); U64 const l1 = XXH_read32(data) + key32[0]; U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1]; acc += (U64)l1 * l2; @@ -133,14 +134,14 @@ XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr) } } -XXH_FORCE_INLINE U64 
-XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr) +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed) { assert(data != NULL); assert(key != NULL); assert(len >= 9 && len <= 16); { const U64* const key64 = (const U64*) keyPtr; - U64 acc = PRIME64_1 * len; + U64 acc = PRIME64_1 * (len + seed); U64 const ll1 = XXH_read64(data) + key64[0]; U64 const ll2 = XXH_read64((const BYTE*)data + len - 8) + key64[1]; acc += XXH3_mul128(ll1, ll2); @@ -148,14 +149,15 @@ XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr) } } -XXH_FORCE_INLINE U64 XXH3_len_0to16_64b(const void* data, size_t len) +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_0to16_64b(const void* data, size_t len, XXH64_hash_t seed) { assert(data != NULL); assert(len <= 16); - { if (len > 8) return XXH3_len_9to16_64b(data, len, kKey); - if (len >= 4) return XXH3_len_4to8_64b(data, len, kKey); - if (len) return XXH3_len_1to3_64b(data, len, kKey); - return 0; + { if (len > 8) return XXH3_len_9to16_64b(data, len, kKey, seed); + if (len >= 4) return XXH3_len_4to8_64b(data, len, kKey, seed); + if (len) return XXH3_len_1to3_64b(data, len, kKey, seed); + return seed; } } @@ -401,10 +403,10 @@ static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 len) return XXH64_avalanche2(acc); } -__attribute__((noinline)) static U64 /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */ -XXH3_hashLong(const void* data, size_t len) +__attribute__((noinline)) static XXH64_hash_t /* It's important for performance that XXH3_hashLong is not inlined. 
Not sure why (uop cache maybe ?), but difference is large and easily measurable */ +XXH3_hashLong(const void* data, size_t len, XXH64_hash_t seed) { - ALIGN(64) U64 acc[ACC_NB] = { 0, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5 }; + ALIGN(64) U64 acc[ACC_NB] = { seed, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5, -seed }; #define NB_KEYS ((KEYSET_DEFAULT_SIZE - STRIPE_ELTS) / 2) @@ -441,18 +443,19 @@ XXH3_hashLong(const void* data, size_t len) * Public entry point * ========================================== */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len) +XXH_PUBLIC_API XXH64_hash_t +XXH3_64b_withSeed(const void* data, size_t len, XXH64_hash_t seed) { const BYTE* const p = (const BYTE*)data; const U64* const key = (const U64*)(const void*)kKey; - if (len <= 16) return XXH3_len_0to16_64b(data, len); + if (len <= 16) return XXH3_len_0to16_64b(data, len, seed); - { U64 acc = PRIME64_1 * len; + { U64 acc = PRIME64_1 * (len + seed); if (len > 32) { if (len > 64) { if (len > 96) { - if (len > 128) return XXH3_hashLong(data, len); + if (len > 128) return XXH3_hashLong(data, len, seed); acc += XXH3_mix16B(p+48, key+12); acc += XXH3_mix16B(p+len-64, key+14); @@ -475,5 +478,11 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len) } +XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len) +{ + return XXH3_64b_withSeed(data, len, 0); +} + + #endif /* XXH3_H */ From a951c0aebaa0b77aed43ce22b7e506b71f59c4e1 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 6 Mar 2019 23:42:04 -0500 Subject: [PATCH 37/73] xxh3: updated mul128 with a 32-bits backup path also: started XXH128 (not finished yet) --- xxh3.h | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 171 insertions(+), 2 deletions(-) diff --git a/xxh3.h b/xxh3.h index e81a525b..3b634e34 100644 --- a/xxh3.h +++ b/xxh3.h @@ -83,13 +83,57 @@ ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = { 
0x45cb3a8f,0x95160428,0xafd7fbca,0xbb4b407e, }; + + + XXH_FORCE_INLINE U64 XXH3_mul128(U64 ll1, U64 ll2) { - __uint128_t lll = (__uint128_t)ll1 * ll2; - return (U64)lll + (U64)(lll >> 64); +#if defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t lll = (__uint128_t)ll1 * ll2; + return (U64)lll + (U64)(lll >> 64); + +#elif defined(_M_X64) || defined(_M_IA64) + +# pragma intrinsic(_umul128) + U64 llhigh; + U64 const lllow = _umul128(ll1, ll2, &llhigh); + return lllow + llhigh; + +#elif defined(__aarch64__) + + U64 const llow = ll1 * ll2; + U64 llhigh; + asm ("umulh %0, %1, %2" : "=r" (llhigh) : "r" (ll1), "r" (ll2)); + return lllow + llhigh; + +#else + + /* emulate 64x64x->128b multiplication, using four 32x32->64 */ + U32 const h1 = ll1 >> 32; + U32 const h2 = ll2 >> 32; + U32 const l1 = (U32)ll1; + U32 const l2 = (U32)ll2; + + U64 const llh = (U64)h1 * h2; + U64 const llm1 = (U64)l1 * h2; + U64 const llm2 = (U64)l2 * h1; + U64 const lll = (U64)l1 * l2; + + U64 const t = lll + (llm1 << 32); + U64 const carry1 = t < lll; + + U64 const lllow = t + (llm2 << 32); + U64 const carry2 = lllow < t; + U64 const llhigh = llh + (llm1 >> 32) + (llm2 >> 32) + carry1 + carry2; + + return llhigh + lllow; + +#endif } + static XXH64_hash_t XXH64_avalanche2(U64 h64) { h64 ^= h64 >> 29; @@ -485,4 +529,129 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len) +/* ========================================== + * XXH3 128 bits + * Not ready yet ! 
+ * ========================================== */ + +typedef struct { + XXH64_hash_t ll1; + XXH64_hash_t ll2; +} XXH128_hash_t; + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed) +{ + assert(data != NULL); + assert(len > 0 && len <= 3); + assert(keyPtr != NULL); + { const U32* const key32 = (const U32*) keyPtr; + BYTE const c1 = ((const BYTE*)data)[0]; + BYTE const c2 = ((const BYTE*)data)[len >> 1]; + BYTE const c3 = ((const BYTE*)data)[len - 1]; + U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); + U32 const l2 = (U32)(len) + ((U32)(c3) << 2); + U64 const ll1 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]); + U64 const ll2 = (U64)(l1 - seed + key32[2]) * (l2 + key32[3]); + return (XXH128_hash_t) { XXH64_avalanche2(ll1), XXH64_avalanche2(ll2) }; + } +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed) +{ + assert(data != NULL); + assert(len >= 4 && len <= 8); + { const U32* const key32 = (const U32*) keyPtr; + U64 acc1 = PRIME64_1 * ((U64)len + seed); + U64 acc2 = PRIME64_2 * ((U64)len - seed); + U64 const l1 = XXH_read32(data) + key32[0]; + U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1]; + acc1 += (U64)(l1 + key32[0]) * (l2 + key32[1]); + acc2 += (U64)(l1 + key32[2]) * (l2 + key32[3]); + return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) }; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed) +{ + assert(data != NULL); + assert(key != NULL); + assert(len >= 9 && len <= 16); + { const U64* const key64 = (const U64*) keyPtr; + U64 acc1 = PRIME64_1 * ((U64)len + seed); + U64 acc2 = PRIME64_2 * ((U64)len - seed); + U64 const ll1 = XXH_read64(data); + U64 const ll2 = XXH_read64((const BYTE*)data + len - 8); + acc1 += XXH3_mul128(ll1 + key64[0], ll2 + key64[1]); + acc2 += XXH3_mul128(ll1 + key64[2], ll2 + 
key64[3]); + return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) }; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_0to16_128b(const void* data, size_t len, XXH64_hash_t seed) +{ + assert(data != NULL); + assert(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(data, len, kKey, seed); + if (len >= 4) return XXH3_len_4to8_128b(data, len, kKey, seed); + if (len) return XXH3_len_1to3_128b(data, len, kKey, seed); + return (XXH128_hash_t) { seed, -seed }; + } +} + +XXH_PUBLIC_API XXH128_hash_t +XXH3_128b_withSeed(const void* data, size_t len, XXH64_hash_t seed) +{ + if (len <= 16) return XXH3_len_0to16_128b(data, len, seed); + +#if 0 + + { U64 acc = PRIME64_1 * (len + seed); + const BYTE* const p = (const BYTE*)data; + const U64* const key = (const U64*)(const void*)kKey; + if (len > 32) { + if (len > 64) { + if (len > 96) { + if (len > 128) return XXH3_hashLong(data, len, seed); + + acc += XXH3_mix16B(p+48, key+12); + acc += XXH3_mix16B(p+len-64, key+14); + } + + acc += XXH3_mix16B(p+32, key+8); + acc += XXH3_mix16B(p+len-48, key+10); + } + + acc += XXH3_mix16B(p+16, key+4); + acc += XXH3_mix16B(p+len-32, key+6); + + } + + acc += XXH3_mix16B(p+0, key+0); + acc += XXH3_mix16B(p+len-16, key+2); + + return XXH64_avalanche2(acc); + } + +#else + return (XXH128_hash_t){ 0, 0 }; +#endif +} + + +XXH_PUBLIC_API XXH128_hash_t XXH3_128b(const void* data, size_t len) +{ + return XXH3_128b_withSeed(data, len, 0); +} + + +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed) +{ + return XXH3_128b_withSeed(data, len, seed); +} + #endif /* XXH3_H */ From 7558f18493484738ebcbbaa41fe0aa8d50038b51 Mon Sep 17 00:00:00 2001 From: "easyaspi314 (Devin)" Date: Thu, 7 Mar 2019 17:26:49 -0500 Subject: [PATCH 38/73] Add improved 128-bit multiply routine for 32-bit and use intrinsics long multiply --- xxh3.h | 147 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 109 insertions(+), 38 deletions(-) diff --git 
a/xxh3.h b/xxh3.h index 3b634e34..53785af7 100644 --- a/xxh3.h +++ b/xxh3.h @@ -57,7 +57,14 @@ # endif #endif - +/* U64 XXH_mult32to64(U32 a, U64 b) { return (U64)a * (U64)b; } */ +#ifdef _MSC_VER +# include <intrin.h> + /* MSVC doesn't do a good job with the mull detection. */ +# define XXH_mult32to64 __emulu +#else +# define XXH_mult32to64(x, y) ((U64)((x) & 0xFFFFFFFF) * (U64)((y) & 0xFFFFFFFF)) +#endif /* ========================================== @@ -84,12 +91,15 @@ ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = { }; - - -XXH_FORCE_INLINE U64 +#if defined(__GNUC__) && defined(__i386__) +/* GCC is stupid and tries to vectorize this. + * This tells GCC that it is wrong. */ +__attribute__((__target__("no-sse"))) +#endif +static U64 XXH3_mul128(U64 ll1, U64 ll2) { -#if defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) +#if 0 && defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) __uint128_t lll = (__uint128_t)ll1 * ll2; return (U64)lll + (U64)(lll >> 64); @@ -101,34 +111,95 @@ XXH3_mul128(U64 ll1, U64 ll2) U64 const lllow = _umul128(ll1, ll2, &llhigh); return lllow + llhigh; -#elif defined(__aarch64__) +#elif defined(__aarch64__) && defined(__GNUC__) - U64 const llow = ll1 * ll2; + U64 llow; U64 llhigh; - asm ("umulh %0, %1, %2" : "=r" (llhigh) : "r" (ll1), "r" (ll2)); - return lllow + llhigh; + __asm__("umulh %0, %1, %2" : "=r" (llhigh) : "r" (ll1), "r" (ll2)); + __asm__("madd %0, %1, %2, %3" : "=r" (llow) : "r" (ll1), "r" (ll2), "r" (llhigh)); + return lllow; #else - - /* emulate 64x64x->128b multiplication, using four 32x32->64 */ - U32 const h1 = ll1 >> 32; - U32 const h2 = ll2 >> 32; - U32 const l1 = (U32)ll1; - U32 const l2 = (U32)ll2; - - U64 const llh = (U64)h1 * h2; - U64 const llm1 = (U64)l1 * h2; - U64 const llm2 = (U64)l2 * h1; - U64 const lll = (U64)l1 * l2; - - U64 const t = lll + (llm1 << 32); - U64 const carry1 = t < lll; - - U64 const lllow = t + (llm2 << 32); - U64 const
carry2 = lllow < t; - U64 const llhigh = llh + (llm1 >> 32) + (llm2 >> 32) + carry1 + carry2; - - return llhigh + lllow; + /* Do it out manually on 32-bit. + * This is a modified, unrolled, widened, and optimized version of the + * mulqdu routine from Hacker's Delight. + * + * https://www.hackersdelight.org/hdcodetxt/mulqdu.c.txt + * + * This was modified to use U32->U64 multiplication instead + * of U16->U32, to add the high and low values in the end, + * be endian-independent, and I added a partial assembly + * implementation for ARM. */ + U64 t; + U32 w[4] = { 0 }; + U32 u[2] = { (U32)(ll1 >> 32), (U32)ll1 }; + U32 v[2] = { (U32)(ll2 >> 32), (U32)ll2 }; + U32 k; + /* An easy 128-bit folding multiply on ARMv6T2 and ARMv7-A/R can be done with + * the mighty umaal (Unsigned Multiply Accumulate Accumulate Long) which takes 4 cycles + * or less, doing a long multiply and adding two 32-bit integers: + * + * void umaal(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm) + * { + * U64 prodAcc = (U64)Rn * (U64)Rm; + * prodAcc += *RdLo; + * prodAcc += *RdHi; + * *RdLo = prodAcc & 0xFFFFFFFF; + * *RdHi = prodAcc >> 32; + * } + * + * This is compared to umlal which adds to a single 64-bit integer: + * + * void umlal(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm) + * { + * U64 prodAcc = (U64)Rn * (U64)Rm; + * prodAcc += (*RdLo | ((U64)*RdHi << 32); + * *RdLo = prodAcc & 0xFFFFFFFF; + * *RdHi = prodAcc >> 32; + * } + * + * Getting the compiler to emit them is like pulling teeth, and checking + * for it is annoying because ARMv7-M lacks this instruction. However, it + * is worth it, because this is an otherwise expensive operation. 
*/ + + /* GCC-compatible, ARMv6t2 or ARMv7+, non-M variant, and 32-bit */ +#if defined(__GNUC__) /* GCC-compatible */ \ + && defined(__ARM_ARCH) && !defined(__aarch64__) && !defined(__arm64__) /* 32-bit ARM */\ + && !defined(__ARM_ARCH_7M__) /* <- Not ARMv7-M vv*/ \ + && !(defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM == 0 && __TARGET_ARCH_THUMB == 4) \ + && (defined(__ARM_ARCH_6T2__) || __ARM_ARCH > 6) /* ARMv6T2 or later */ + __asm__("umull %0, %1, %2, %3" + : "=r" (w[3]), "=r" (k) + : "r" (u[1]), "r" (v[1])); + __asm__("umaal %0, %1, %2, %3" + : "+r" (w[2]), "+r" (k) + : "r" (u[0]), "r" (v[1])); + w[1] = k; + k = 0; + __asm__("umaal %0, %1, %2, %3" + : "+r" (w[2]), "+r" (k) + : "r" (u[1]), "r" (v[0])); + __asm__("umaal %0, %1, %2, %3" + : "+r" (w[1]), "+r" (k) + : "r" (u[0]), "r" (v[0])); + w[0] = k; +#else /* Portable scalar version */ + k = 0; + t = XXH_mult32to64(u[1], v[1]); + w[3] = t & 0xFFFFFFFF; + k = t >> 32; + t = XXH_mult32to64(u[0], v[1]) + w[2] + k; + w[2] = t & 0xFFFFFFFF; + w[1] = t >> 32; + + t = XXH_mult32to64(u[1], v[0]) + w[2]; + w[2] = t & 0xFFFFFFFF; + k = t >> 32; + t = XXH_mult32to64(u[0], v[0]) + w[1] + k; + w[1] = t & 0xFFFFFFFF; + w[0] = t >> 32; +#endif + return (w[1] | ((U64)w[0] << 32)) + (w[3] | ((U64)w[2] << 32)); #endif } @@ -158,7 +229,7 @@ XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t BYTE const c3 = ((const BYTE*)data)[len - 1]; U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); U32 const l2 = (U32)(len) + ((U32)(c3) << 2); - U64 const ll3 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]); + U64 const ll3 = XXH_mult32to64((l1 + seed + key32[0]), (l2 + key32[1])); return XXH64_avalanche2(ll3); } } @@ -173,7 +244,7 @@ XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t U64 acc = PRIME64_1 * (len + seed); U64 const l1 = XXH_read32(data) + key32[0]; U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1]; - acc += (U64)l1 * l2; + acc += XXH_mult32to64(l1, l2); 
return XXH64_avalanche2(acc); } } @@ -313,7 +384,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k for (i=0; i < (int)ACC_NB; i++) { int const left = 2*i; int const right= 2*i + 1; - xacc[i] += (xdata[left] + xkey[left]) * (U64)(xdata[right] + xkey[right]); + xacc[i] += XXH_mult32to64(xdata[left] + xkey[left], xdata[right] + xkey[right]); } #endif @@ -412,8 +483,8 @@ static void XXH3_scrambleAcc(void* acc, const void* key) xacc[i] ^= xacc[i] >> 47; xacc[i] ^= PRIME64_5; - { U64 p1 = (xacc[i] >> 32) * xkey[left]; - U64 p2 = (xacc[i] & 0xFFFFFFFF) * xkey[right]; + { U64 p1 = XXH_mult32to64(xacc[i] & 0xFFFFFFFF, xkey[left]); + U64 p2 = XXH_mult32to64(xacc[i] >> 32, xkey[right]); xacc[i] = p1 ^ p2; } } @@ -551,8 +622,8 @@ XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_ BYTE const c3 = ((const BYTE*)data)[len - 1]; U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); U32 const l2 = (U32)(len) + ((U32)(c3) << 2); - U64 const ll1 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]); - U64 const ll2 = (U64)(l1 - seed + key32[2]) * (l2 + key32[3]); + U64 const ll1 = XXH_mult32to64(l1 + seed + key32[0], l2 + key32[1]); + U64 const ll2 = XXH_mult32to64(l1 - seed + key32[2], l2 + key32[3]); return (XXH128_hash_t) { XXH64_avalanche2(ll1), XXH64_avalanche2(ll2) }; } } @@ -568,8 +639,8 @@ XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_ U64 acc2 = PRIME64_2 * ((U64)len - seed); U64 const l1 = XXH_read32(data) + key32[0]; U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1]; - acc1 += (U64)(l1 + key32[0]) * (l2 + key32[1]); - acc2 += (U64)(l1 + key32[2]) * (l2 + key32[3]); + acc1 += XXH_mult32to64(l1 + key32[0], l2 + key32[1]); + acc2 += XXH_mult32to64(l1 + key32[2], l2 + key32[3]); return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) }; } } From 97952e90295885d9d8e127d91aa199a876c22981 Mon Sep 17 00:00:00 2001 From: "easyaspi314 (Devin)" Date: Thu, 7 Mar 2019 
17:29:26 -0500 Subject: [PATCH 39/73] Workaround for Clang vectorization bug Inline assembly fences are the only thing I have found that will prevent Clang from vectorizing XXH32. I explained it in a lot of detail. --- Makefile | 10 +--------- xxhash.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 25a5dfdc..ddb2bb96 100644 --- a/Makefile +++ b/Makefile @@ -33,15 +33,7 @@ LIBVER_MINOR := $(shell echo $(LIBVER_MINOR_SCRIPT)) LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT)) LIBVER := $(LIBVER_MAJOR).$(LIBVER_MINOR).$(LIBVER_PATCH) -# SSE4 detection -HAVE_SSE4 := $(shell $(CC) -dM -E - < /dev/null | grep "SSE4" > /dev/null && echo 1 || echo 0) -ifeq ($(HAVE_SSE4), 1) -NOSSE4 := -mno-sse4 -else -NOSSE4 := -endif - -CFLAGS ?= -O3 $(NOSSE4) # disables potential auto-vectorization +CFLAGS ?= -O3 DEBUGFLAGS+=-Wall -Wextra -Wconversion -Wcast-qual -Wcast-align -Wshadow \ -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \ -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \ diff --git a/xxhash.c b/xxhash.c index 02f5cd53..cba16f07 100644 --- a/xxhash.c +++ b/xxhash.c @@ -267,12 +267,56 @@ static const U32 PRIME32_3 = 3266489917U; /* 0b1100001010110010101011100011110 static const U32 PRIME32_4 = 668265263U; /* 0b00100111110101001110101100101111 */ static const U32 PRIME32_5 = 374761393U; /* 0b00010110010101100110011110110001 */ -static U32 XXH32_round(U32 seed, U32 input) +static U32 XXH32_round(U32 acc, U32 input) { - seed += input * PRIME32_2; - seed = XXH_rotl32(seed, 13); - seed *= PRIME32_1; - return seed; + acc += input * PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= PRIME32_1; +#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* UGLY HACK: + * This inline assembly hack forces acc into a normal register. 
This is the + * only thing that prevents GCC and Clang from autovectorizing the XXH32 loop + * (pragmas and attributes don't work for some resason) without globally + * disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on newer chips!) + * making it slightly slower to multiply four integers at once compared to four + * integers independently. Even when pmulld was fastest, Sandy/Ivy Bridge, it is + * still not worth it to go into SSE just to multiply unless doing a long operation. + * + * - Four instructions are required to rotate, + * movqda tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // x >>= 19 + * por v, tmp // x |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because the + * SIMD actually serializes this operation: While v1 is rotating, v2 can load data, + * while v3 can multiply. SSE forces them to operate together. + * + * How this hack works: + * __asm__("" // Declare an assembly block but don't declare any instructions + * : // However, as an Input/Output Operand, + * "+r" // constrain a read/write operand (+) as a general purpose register (r). + * (acc) // and set acc as the operand + * ); + * + * Because of the 'r', the compiler has promised that seed will be in a + * general purpose register and the '+' says that it will be 'read/write', + * so it has to assume it has changed. It is like volatile without all the + * loads and stores. + * + * Since the argument has to be in a normal register (not an SSE register), + * each time XXH32_round is called, it is impossible to vectorize. 
*/ + __asm__("" : "+r" (acc)); +#endif + return acc; } /* mix all bits */ From 1b78d030aa7ee847a4b4234cef53a733e2e5a276 Mon Sep 17 00:00:00 2001 From: "easyaspi314 (Devin)" Date: Thu, 7 Mar 2019 17:32:18 -0500 Subject: [PATCH 40/73] Remove comment about the bug because it is fixed now --- xxhash.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/xxhash.c b/xxhash.c index cba16f07..c5fec9b0 100644 --- a/xxhash.c +++ b/xxhash.c @@ -414,17 +414,6 @@ XXH32_endian_align(const void* input, size_t len, U32 seed, U32 v3 = seed + 0; U32 v4 = seed - PRIME32_1; - /* note : clang will try to vectorize this loop, using pmulld instruction. - * This is a bad idea, and will result in substantial performance reduction. - * To prevent clang from "optimizing" this loop, - * it's necessary to disable SSE4 on command line (-mno-sse4). - * However, this is a build instruction, so it's outside of source code. - * Whenever xxhash.c is used in a different code base, build flags don't follow. - * It would be better to ensure vectorization is disabled from within the source code. - * Alas, so far, I've not found a working method. - * I tried both `#pragma` and `__attribute__`, but clang still vectorizes. - * Help welcomed. - * In the meantime, vectorization is prevented by the `Makefile` */ do { v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4; v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4; From 02d0ba79a01384e887fe4d976c5a159b698b292a Mon Sep 17 00:00:00 2001 From: "easyaspi314 (Devin)" Date: Thu, 7 Mar 2019 19:51:39 -0500 Subject: [PATCH 41/73] Remove preprocessor statement leftover from testing What '0 &&' ? No idea what you are talking about... 
--- xxh3.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xxh3.h b/xxh3.h index 53785af7..fac97808 100644 --- a/xxh3.h +++ b/xxh3.h @@ -99,7 +99,7 @@ __attribute__((__target__("no-sse"))) static U64 XXH3_mul128(U64 ll1, U64 ll2) { -#if 0 && defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) +#if defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) __uint128_t lll = (__uint128_t)ll1 * ll2; return (U64)lll + (U64)(lll >> 64); From 4f4f63c73b86e57f49ed89da08b0943f288ade58 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 8 Mar 2019 15:37:06 -0500 Subject: [PATCH 42/73] modified xxh128 so that low part == xxh3_64b --- xxh3.h | 96 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/xxh3.h b/xxh3.h index 3b634e34..64fac8a4 100644 --- a/xxh3.h +++ b/xxh3.h @@ -110,7 +110,7 @@ XXH3_mul128(U64 ll1, U64 ll2) #else - /* emulate 64x64x->128b multiplication, using four 32x32->64 */ + /* emulate 64x64->128b multiplication, using four 32x32->64 */ U32 const h1 = ll1 >> 32; U32 const h2 = ll2 >> 32; U32 const l1 = (U32)ll1; @@ -158,8 +158,8 @@ XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t BYTE const c3 = ((const BYTE*)data)[len - 1]; U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); U32 const l2 = (U32)(len) + ((U32)(c3) << 2); - U64 const ll3 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]); - return XXH64_avalanche2(ll3); + U64 const ll11 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]); + return XXH64_avalanche2(ll11); } } @@ -171,8 +171,8 @@ XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t assert(len >= 4 && len <= 8); { const U32* const key32 = (const U32*) keyPtr; U64 acc = PRIME64_1 * (len + seed); - U64 const l1 = XXH_read32(data) + key32[0]; - U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1]; + U32 const l1 = XXH_read32(data) + 
key32[0]; + U32 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1]; acc += (U64)l1 * l2; return XXH64_avalanche2(acc); } @@ -434,11 +434,11 @@ XXH_FORCE_INLINE U64 XXH3_mix16B(const void* data, const U64* key) return XXH3_mul128((XXH_read64(data) ^ key[0]), XXH_read64((const BYTE*)data+8) ^ key[1]); } -static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 len) +static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 start) { const U64* const key = (const U64*)keyVoid; /* presumed aligned */ - U64 acc = PRIME64_1 * len; + U64 acc = start; acc += XXH3_mix16B(data+0, key+0); acc += XXH3_mix16B(data+2, key+2); acc += XXH3_mix16B(data+4, key+4); @@ -447,11 +447,9 @@ static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 len) return XXH64_avalanche2(acc); } -__attribute__((noinline)) static XXH64_hash_t /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */ -XXH3_hashLong(const void* data, size_t len, XXH64_hash_t seed) +static void +XXH3_hashLong(U64* acc, const void* data, size_t len) { - ALIGN(64) U64 acc[ACC_NB] = { seed, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5, -seed }; - #define NB_KEYS ((KEYSET_DEFAULT_SIZE - STRIPE_ELTS) / 2) size_t const block_len = STRIPE_LEN * NB_KEYS; @@ -474,15 +472,21 @@ XXH3_hashLong(const void* data, size_t len, XXH64_hash_t seed) const BYTE* const p = (const BYTE*) data + len - STRIPE_LEN; XXH3_accumulate_512(acc, p, kKey + nbStripes*2); } } +} + +__attribute__((noinline)) static XXH64_hash_t /* It's important for performance that XXH3_hashLong is not inlined. 
Not sure why (uop cache maybe ?), but difference is large and easily measurable */ +XXH3_hashLong_64b(const void* data, size_t len, XXH64_hash_t seed) +{ + ALIGN(64) U64 acc[ACC_NB] = { seed, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5, -seed, 0 }; + + XXH3_hashLong(acc, data, len); /* converge into final hash */ - //return XXH3_finalMerge_8u64(acc[0] + len, acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7] - len, PRIME64_2 + len*2); assert(sizeof(acc) == 64); - return XXH3_merge64B(acc, kKey, len); + return XXH3_merge64B(acc, kKey, (U64)len * PRIME64_1); } - /* ========================================== * Public entry point * ========================================== */ @@ -499,7 +503,7 @@ XXH3_64b_withSeed(const void* data, size_t len, XXH64_hash_t seed) if (len > 32) { if (len > 64) { if (len > 96) { - if (len > 128) return XXH3_hashLong(data, len, seed); + if (len > 128) return XXH3_hashLong_64b(data, len, seed); acc += XXH3_mix16B(p+48, key+12); acc += XXH3_mix16B(p+len-64, key+14); @@ -530,10 +534,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len) /* ========================================== - * XXH3 128 bits - * Not ready yet ! 
+ * XXH3 128 bits (=> XXH128) * ========================================== */ - typedef struct { XXH64_hash_t ll1; XXH64_hash_t ll2; @@ -551,9 +553,9 @@ XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_ BYTE const c3 = ((const BYTE*)data)[len - 1]; U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); U32 const l2 = (U32)(len) + ((U32)(c3) << 2); - U64 const ll1 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]); - U64 const ll2 = (U64)(l1 - seed + key32[2]) * (l2 + key32[3]); - return (XXH128_hash_t) { XXH64_avalanche2(ll1), XXH64_avalanche2(ll2) }; + U64 const ll11 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]); + U64 const ll12 = (U64)(l1 - seed + key32[2]) * (l2 + key32[3]); + return (XXH128_hash_t) { XXH64_avalanche2(ll11), XXH64_avalanche2(ll12) }; } } @@ -566,8 +568,8 @@ XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_ { const U32* const key32 = (const U32*) keyPtr; U64 acc1 = PRIME64_1 * ((U64)len + seed); U64 acc2 = PRIME64_2 * ((U64)len - seed); - U64 const l1 = XXH_read32(data) + key32[0]; - U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1]; + U32 const l1 = XXH_read32(data); + U32 const l2 = XXH_read32((const BYTE*)data + len - 4); acc1 += (U64)(l1 + key32[0]) * (l2 + key32[1]); acc2 += (U64)(l1 + key32[2]) * (l2 + key32[3]); return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) }; @@ -603,43 +605,57 @@ XXH3_len_0to16_128b(const void* data, size_t len, XXH64_hash_t seed) } } +__attribute__((noinline)) static XXH128_hash_t /* It's important for performance that XXH3_hashLong is not inlined. 
Not sure why (uop cache maybe ?), but difference is large and easily measurable */ +XXH3_hashLong_128b(const void* data, size_t len, XXH64_hash_t seed) +{ + ALIGN(64) U64 acc[ACC_NB] = { seed, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5, -seed, 0 }; + assert(len > 128); + + XXH3_hashLong(acc, data, len); + + /* converge into final hash */ + assert(sizeof(acc) == 64); + { U64 const part1 = XXH3_merge64B(acc, kKey, (U64)len * PRIME64_1); + U64 const part2 = XXH3_merge64B(acc, kKey+16, ((U64)len+1) * PRIME64_2); + return (XXH128_hash_t) { part1, part2 }; + } +} + XXH_PUBLIC_API XXH128_hash_t XXH3_128b_withSeed(const void* data, size_t len, XXH64_hash_t seed) { if (len <= 16) return XXH3_len_0to16_128b(data, len, seed); -#if 0 - - { U64 acc = PRIME64_1 * (len + seed); + { U64 acc1 = PRIME64_1 * (len + seed); + U64 acc2 = 0; const BYTE* const p = (const BYTE*)data; const U64* const key = (const U64*)(const void*)kKey; if (len > 32) { if (len > 64) { if (len > 96) { - if (len > 128) return XXH3_hashLong(data, len, seed); + if (len > 128) return XXH3_hashLong_128b(data, len, seed); - acc += XXH3_mix16B(p+48, key+12); - acc += XXH3_mix16B(p+len-64, key+14); + acc1 += XXH3_mix16B(p+48, key+12); + acc2 += XXH3_mix16B(p+len-64, key+14); } - acc += XXH3_mix16B(p+32, key+8); - acc += XXH3_mix16B(p+len-48, key+10); + acc1 += XXH3_mix16B(p+32, key+8); + acc2 += XXH3_mix16B(p+len-48, key+10); } - acc += XXH3_mix16B(p+16, key+4); - acc += XXH3_mix16B(p+len-32, key+6); + acc1 += XXH3_mix16B(p+16, key+4); + acc2 += XXH3_mix16B(p+len-32, key+6); } - acc += XXH3_mix16B(p+0, key+0); - acc += XXH3_mix16B(p+len-16, key+2); + acc1 += XXH3_mix16B(p+0, key+0); + acc2 += XXH3_mix16B(p+len-16, key+2); - return XXH64_avalanche2(acc); + { U64 const part1 = acc1 + acc2; + U64 const part2 = (acc1 * PRIME64_3) + (acc2 * PRIME64_4) + ((len - seed) * PRIME64_2); + return (XXH128_hash_t) { XXH64_avalanche2(part1), -XXH64_avalanche2(part2) }; + } } - -#else - return (XXH128_hash_t){ 0, 0 }; 
-#endif } From 2afd24d8bb564eba44543f617118a5e85137b1a9 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 8 Mar 2019 16:03:24 -0500 Subject: [PATCH 43/73] xxh128: minor modifications to improve bias 1.4% => 0.6% --- xxh3.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xxh3.h b/xxh3.h index 55bc12a0..40c1f78e 100644 --- a/xxh3.h +++ b/xxh3.h @@ -634,7 +634,7 @@ XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_ U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); U32 const l2 = (U32)(len) + ((U32)(c3) << 2); U64 const ll11 = XXH_mult32to64(l1 + seed + key32[0], l2 + key32[1]); - U64 const ll12 = XXH_mult32to64(l1 - seed + key32[2], l2 + key32[3]); + U64 const ll12 = XXH_mult32to64(l1 + key32[2], l2 - seed + key32[3]); return (XXH128_hash_t) { XXH64_avalanche2(ll11), XXH64_avalanche2(ll12) }; } } @@ -651,7 +651,7 @@ XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_ U32 const l1 = XXH_read32(data); U32 const l2 = XXH_read32((const BYTE*)data + len - 4); acc1 += XXH_mult32to64(l1 + key32[0], l2 + key32[1]); - acc2 += XXH_mult32to64(l1 + key32[2], l2 + key32[3]); + acc2 += XXH_mult32to64(l1 - key32[2], l2 + key32[3]); return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) }; } } From c5953f132c548a0413a4635c8ecc6d801f04f75a Mon Sep 17 00:00:00 2001 From: "easyaspi314 (Devin)" Date: Fri, 8 Mar 2019 22:07:08 -0500 Subject: [PATCH 44/73] Add unroll pragma for Clang in XXH3_accumulate. Clang doesn't unroll the XXH3_accumulate loop for some reason. Using `#pragma clang loop unroll(enable)` to hint to Clang that it should unroll results in a huge 1.4-1.5x speedup. 
Before: 15 GB/s After: 21 GB/s --- xxh3.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/xxh3.h b/xxh3.h index 40c1f78e..ee636305 100644 --- a/xxh3.h +++ b/xxh3.h @@ -503,6 +503,11 @@ static void XXH3_scrambleAcc(void* acc, const void* key) static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* restrict key, size_t nbStripes) { size_t n; + +/* Clang doesn't unroll this loop without the pragma. Unrolling results in code that is about 1.4x faster. */ +#if defined(__clang__) && !defined(__OPTIMIZE_SIZE__) +# pragma clang loop unroll(enable) +#endif for (n = 0; n < nbStripes; n++ ) { XXH3_accumulate_512(acc, (const BYTE*)data + n*STRIPE_LEN, key); key += 2; From 60215c5bfb81518eec16afa122af457681b68d03 Mon Sep 17 00:00:00 2001 From: "easyaspi314 (Devin)" Date: Fri, 8 Mar 2019 22:26:25 -0500 Subject: [PATCH 45/73] Fix typo causing build failure on 32-bit --- xxh3.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xxh3.h b/xxh3.h index ee636305..724cb1a3 100644 --- a/xxh3.h +++ b/xxh3.h @@ -197,9 +197,9 @@ XXH3_mul128(U64 ll1, U64 ll2) U32 const l2 = (U32)ll2; U64 const llh = XXH_mult32to64(h1, h2); - U64 const llm1 = XXH_mult32to64(l1, h2; - U64 const llm2 = XXH_mult32to64(h1, l2; - U64 const lll = XXH_mult32to64(l1, l2; + U64 const llm1 = XXH_mult32to64(l1, h2); + U64 const llm2 = XXH_mult32to64(h1, l2); + U64 const lll = XXH_mult32to64(l1, l2); U64 const t = lll + (llm1 << 32); U64 const carry1 = t < lll; From a5d5bf778f631412fd14a466457878f1a53eace2 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 8 Mar 2019 22:32:11 -0500 Subject: [PATCH 46/73] improve algorithm by compensating UMAC deficiency no longer possible to nullify one member through another --- xxh3.h | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/xxh3.h b/xxh3.h index 40c1f78e..50d7dbb5 100644 --- a/xxh3.h +++ b/xxh3.h @@ -197,9 +197,9 @@ XXH3_mul128(U64 ll1, U64 ll2) U32 const l2 =
(U32)ll2; U64 const llh = XXH_mult32to64(h1, h2); - U64 const llm1 = XXH_mult32to64(l1, h2; - U64 const llm2 = XXH_mult32to64(h1, l2; - U64 const lll = XXH_mult32to64(l1, l2; + U64 const llm1 = XXH_mult32to64(l1, h2); + U64 const llm2 = XXH_mult32to64(h1, l2); + U64 const lll = XXH_mult32to64(l1, l2); U64 const t = lll + (llm1 << 32); U64 const carry1 = t < lll; @@ -308,8 +308,9 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k __m256i const d = _mm256_loadu_si256 (xdata+i); __m256i const k = _mm256_loadu_si256 (xkey+i); __m256i const dk = _mm256_add_epi32 (d,k); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */ - __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk,0x31)); /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */ - xacc[i] = _mm256_add_epi64(res, xacc[i]); /* xacc must be aligned on 32 bytes boundaries */ + __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31)); /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */ + xacc[i] = _mm256_add_epi64(res, xacc[i]); + xacc[i] = _mm256_add_epi32(d, xacc[i]); } } @@ -324,13 +325,14 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { __m128i const d = _mm_loadu_si128 (xdata+i); __m128i const k = _mm_loadu_si128 (xkey+i); - __m128i const dk = _mm_add_epi32 (d,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ - __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk,0x31)); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ - xacc[i] = _mm_add_epi64(res, xacc[i]); /* xacc must be aligned on 16 bytes boundaries */ + __m128i const dk = _mm_add_epi32 (d,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk, 0x31)); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ + xacc[i] = _mm_add_epi64(res, xacc[i]); + xacc[i] = _mm_add_epi32(d, xacc[i]); } } -#elif (XXH_VECTOR == XXH_NEON) +#elif (XXH_VECTOR == XXH_NEON) /* note : no longer 
correct, must be updated to match new formula */ assert(((size_t)acc) & 15 == 0); { uint64x2_t* const xacc = (uint64x2_t *)acc; @@ -394,6 +396,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k int const left = 2*i; int const right= 2*i + 1; xacc[i] += XXH_mult32to64(xdata[left] + xkey[left], xdata[right] + xkey[right]); + xacc[i] += xdata[left] + ((U64)xdata[right] << 32); } #endif @@ -407,13 +410,10 @@ static void XXH3_scrambleAcc(void* acc, const void* key) { __m256i* const xacc = (__m256i*) acc; const __m256i* const xkey = (const __m256i *) key; - __m256i const xor_p5 = _mm256_set1_epi64x(PRIME64_5); - for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { __m256i data = xacc[i]; __m256i const shifted = _mm256_srli_epi64(data, 47); data = _mm256_xor_si256(data, shifted); - data = _mm256_xor_si256(data, xor_p5); { __m256i const k = _mm256_loadu_si256 (xkey+i); __m256i const dk = _mm256_mul_epu32 (data,k); /* U32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ @@ -422,7 +422,7 @@ static void XXH3_scrambleAcc(void* acc, const void* key) __m256i const k2 = _mm256_shuffle_epi32 (k,0x31); __m256i const dk2 = _mm256_mul_epu32 (d2,k2); /* U32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ - xacc[i] = _mm256_xor_si256(dk, dk2); + xacc[i] = _mm256_xor_si256(dk, dk2); } } } @@ -431,27 +431,25 @@ static void XXH3_scrambleAcc(void* acc, const void* key) assert(((size_t)acc) & 15 == 0); { __m128i* const xacc = (__m128i*) acc; const __m128i* const xkey = (const __m128i *) key; - __m128i const xor_p5 = _mm_set1_epi64((__m64)PRIME64_5); size_t i; for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { __m128i data = xacc[i]; __m128i const shifted = _mm_srli_epi64(data, 47); data = _mm_xor_si128(data, shifted); - data = _mm_xor_si128(data, xor_p5); { __m128i const k = _mm_loadu_si128 (xkey+i); - __m128i const dk = _mm_mul_epu32 (data,k); /* U32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + __m128i const dk = _mm_mul_epu32 (data,k); - __m128i const d2 = 
_mm_shuffle_epi32 (data,0x31); - __m128i const k2 = _mm_shuffle_epi32 (k,0x31); - __m128i const dk2 = _mm_mul_epu32 (d2,k2); /* U32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ + __m128i const d2 = _mm_shuffle_epi32 (data, 0x31); + __m128i const k2 = _mm_shuffle_epi32 (k, 0x31); + __m128i const dk2 = _mm_mul_epu32 (d2,k2); - xacc[i] = _mm_xor_si128(dk, dk2); + xacc[i] = _mm_xor_si128(dk, dk2); } } } -#elif (XXH_VECTOR == XXH_NEON) +#elif (XXH_VECTOR == XXH_NEON) /* note : no longer correct, must be updated to match new formula */ assert(((size_t)acc) & 15 == 0); { uint64x2_t* const xacc = (uint64x2_t*) acc; From ed0dbb8fdd97c7ef1c51b5338ac8086dc9d789c4 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 8 Mar 2019 23:59:02 -0500 Subject: [PATCH 47/73] ensure xxhash.c and xxhsum.c are recompiled when their header change --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index ddb2bb96..1b5ee6a4 100644 --- a/Makefile +++ b/Makefile @@ -82,6 +82,10 @@ xxhsum32: CFLAGS += -m32 xxhsum32: xxhash.c xxhsum.c $(CC) $(FLAGS) $^ $(LDFLAGS) -o $@$(EXT) +xxhash.o: xxhash.h xxh3.h + +xxhsum.o: xxhash.h + .PHONY: xxhsum_and_links xxhsum_and_links: xxhsum xxh32sum xxh64sum From 2010b7e7de7a9ef540c43b0e9bdb8c5e773600aa Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sat, 9 Mar 2019 00:19:40 -0500 Subject: [PATCH 48/73] fixed addition discrepancy between scalar and vector code let's both have a 64-bit addition with carry --- xxh3.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/xxh3.h b/xxh3.h index 5dbec179..866d5131 100644 --- a/xxh3.h +++ b/xxh3.h @@ -310,7 +310,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k __m256i const dk = _mm256_add_epi32 (d,k); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */ __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31)); /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */ xacc[i] = _mm256_add_epi64(res, xacc[i]); - xacc[i] = 
_mm256_add_epi32(d, xacc[i]); + xacc[i] = _mm256_add_epi64(d, xacc[i]); } } @@ -328,7 +328,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k __m128i const dk = _mm_add_epi32 (d,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk, 0x31)); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ xacc[i] = _mm_add_epi64(res, xacc[i]); - xacc[i] = _mm_add_epi32(d, xacc[i]); + xacc[i] = _mm_add_epi64(d, xacc[i]); } } @@ -501,11 +501,6 @@ static void XXH3_scrambleAcc(void* acc, const void* key) static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* restrict key, size_t nbStripes) { size_t n; - -/* Clang doesn't unroll this loop without the pragma. Unrolling results in code that is about 1.4x faster. */ -#if defined(__clang__) && !defined(__OPTIMIZE_SIZE__) -# pragma clang loop unroll(enable) -#endif for (n = 0; n < nbStripes; n++ ) { XXH3_accumulate_512(acc, (const BYTE*)data + n*STRIPE_LEN, key); key += 2; From 638993f16b29c00346bfedf8e5b24d37dee50b12 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 11 Mar 2019 15:09:27 -0700 Subject: [PATCH 49/73] added consistency tests for XXH3_64b validated against SSE2 path --- xxh3.h | 14 +++++----- xxhash.h | 38 +++++++++++++++++---------- xxhsum.c | 78 ++++++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 90 insertions(+), 40 deletions(-) diff --git a/xxh3.h b/xxh3.h index 866d5131..63a279fb 100644 --- a/xxh3.h +++ b/xxh3.h @@ -570,7 +570,7 @@ XXH3_hashLong_64b(const void* data, size_t len, XXH64_hash_t seed) * ========================================== */ XXH_PUBLIC_API XXH64_hash_t -XXH3_64b_withSeed(const void* data, size_t len, XXH64_hash_t seed) +XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed) { const BYTE* const p = (const BYTE*)data; const U64* const key = (const U64*)(const void*)kKey; @@ -604,9 +604,9 @@ XXH3_64b_withSeed(const void* data, size_t len, XXH64_hash_t seed) } 
-XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len) +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len) { - return XXH3_64b_withSeed(data, len, 0); + return XXH3_64bits_withSeed(data, len, 0); } @@ -700,7 +700,7 @@ XXH3_hashLong_128b(const void* data, size_t len, XXH64_hash_t seed) } XXH_PUBLIC_API XXH128_hash_t -XXH3_128b_withSeed(const void* data, size_t len, XXH64_hash_t seed) +XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed) { if (len <= 16) return XXH3_len_0to16_128b(data, len, seed); @@ -737,15 +737,15 @@ XXH3_128b_withSeed(const void* data, size_t len, XXH64_hash_t seed) } -XXH_PUBLIC_API XXH128_hash_t XXH3_128b(const void* data, size_t len) +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len) { - return XXH3_128b_withSeed(data, len, 0); + return XXH3_128bits_withSeed(data, len, 0); } XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed) { - return XXH3_128b_withSeed(data, len, seed); + return XXH3_128bits_withSeed(data, len, seed); } #endif /* XXH3_H */ diff --git a/xxhash.h b/xxhash.h index 1782789e..5b887223 100644 --- a/xxhash.h +++ b/xxhash.h @@ -158,8 +158,8 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; * Version ***************************************/ #define XXH_VERSION_MAJOR 0 -#define XXH_VERSION_MINOR 6 -#define XXH_VERSION_RELEASE 6 +#define XXH_VERSION_MINOR 7 +#define XXH_VERSION_RELEASE 0 #define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) XXH_PUBLIC_API unsigned XXH_versionNumber (void); @@ -249,18 +249,6 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); -/*-********************************************************************** -* XXH3 -* New experimental hash -************************************************************************/ - -#ifdef XXH_NAMESPACE -# 
define XXH3_64b XXH_NAME2(XXH_NAMESPACE, XXH3_64b) -#endif - -XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len); - - #endif /* XXH_NO_LONG_LONG */ @@ -336,10 +324,32 @@ struct XXH64_state_s { # endif +/*-********************************************************************** +* XXH3 +* New experimental hash +************************************************************************/ + +#ifdef XXH_NAMESPACE +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) +#endif + +/* note : variant without seed produces same result as variant with seed == 0 */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, unsigned long long seed); + + + + +/*-********************************************************************** +* XXH_INLINE_ALL +************************************************************************/ #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) # include "xxhash.c" /* include xxhash function bodies as `static`, for inlining */ #endif + + #endif /* XXH_STATIC_LINKING_ONLY */ diff --git a/xxhsum.c b/xxhsum.c index 657cf783..7428b62d 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -265,7 +265,7 @@ static U32 localXXH32(const void* buffer, size_t bufferSize, U32 seed) { return static U32 localXXH64(const void* buffer, size_t bufferSize, U32 seed) { return (U32)XXH64(buffer, bufferSize, seed); } -static U32 localXXH3_64b(const void* buffer, size_t bufferSize, U32 seed) { (void)seed; return (U32)XXH3_64b(buffer, bufferSize); } +static U32 localXXH3_64b(const void* buffer, size_t bufferSize, U32 seed) { (void)seed; return (U32)XXH3_64bits(buffer, bufferSize); } static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, size_t bufferSize) { @@ -406,7 +406,6 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific } - static int 
BMK_benchInternal(size_t keySize, U32 specificTest) { void* const buffer = calloc(keySize+16+3, 1); @@ -434,32 +433,34 @@ static int BMK_benchInternal(size_t keySize, U32 specificTest) } -static void BMK_checkResult(U32 r1, U32 r2) +/* ************************************************ + * Self-test : + * ensure results consistency accross platforms + *********************************************** */ + +static void BMK_checkResult32(U32 r1, U32 r2) { static int nbTests = 1; - if (r1==r2) { - DISPLAYLEVEL(3, "\rTest%3i : %08X == %08X ok ", nbTests, r1, r2); - } else { - DISPLAY("\rERROR : Test%3i : %08X <> %08X !!!!! \n", nbTests, r1, r2); + if (r1!=r2) { + DISPLAY("\rERROR : Test%3i : 0x%08X <> 0x%08X !!!!! \n", nbTests, r1, r2); exit(1); } nbTests++; } - static void BMK_checkResult64(U64 r1, U64 r2) { static int nbTests = 1; if (r1!=r2) { DISPLAY("\rERROR : Test%3i : 64-bit values non equals !!!!! \n", nbTests); - DISPLAY("\r %08X%08X != %08X%08X \n", (U32)(r1>>32), (U32)r1, (U32)(r2>>32), (U32)r2); + DISPLAY("\r 0x%08X%08XULL != 0x%08X%08XULL \n", (U32)(r1>>32), (U32)r1, (U32)(r2>>32), (U32)r2); exit(1); } nbTests++; } -static void BMK_testSequence64(void* sentence, size_t len, U64 seed, U64 Nresult) +static void BMK_testSequence64(const void* sentence, size_t len, U64 seed, U64 Nresult) { XXH64_state_t state; U64 Dresult; @@ -475,11 +476,23 @@ static void BMK_testSequence64(void* sentence, size_t len, U64 seed, U64 Nresult (void)XXH64_reset(&state, seed); for (pos=0; pos Date: Mon, 11 Mar 2019 15:40:01 -0700 Subject: [PATCH 50/73] xxh3: fixed scalar variant scrambling stage wasn't updated to match new formula --- xxh3.h | 1 - 1 file changed, 1 deletion(-) diff --git a/xxh3.h b/xxh3.h index 63a279fb..88b5fdf3 100644 --- a/xxh3.h +++ b/xxh3.h @@ -488,7 +488,6 @@ static void XXH3_scrambleAcc(void* acc, const void* key) int const left = 2*i; int const right= 2*i + 1; xacc[i] ^= xacc[i] >> 47; - xacc[i] ^= PRIME64_5; { U64 p1 = XXH_mult32to64(xacc[i] & 0xFFFFFFFF, 
xkey[left]); U64 p2 = XXH_mult32to64(xacc[i] >> 32, xkey[right]); From feedac5ccab06168bb02e74051e5b502e18cffd2 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 12 Mar 2019 11:31:57 -0700 Subject: [PATCH 51/73] updated travis tests to ensure results consistency across scalar/sse2/avx2 on x64/x86 --- .travis.yml | 34 +++++++++++++++++++++++++--------- Makefile | 2 ++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3c37a826..29923cd2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,26 @@ language: c -compiler: gcc -script: make -B test-all -before_install: - - sudo apt-get update -qq - - sudo apt-get install -qq gcc-arm-linux-gnueabi - - sudo apt-get install -qq clang - - sudo apt-get install -qq g++-multilib - - sudo apt-get install -qq gcc-multilib - - sudo apt-get install -qq cppcheck + +matrix: + fast_finish: true + include: + + - name: General linux tests (Xenial) + dist: xenial + script: + - make -B test-all + install: + - sudo apt-get update -qq + - sudo apt-get install -qq + gcc-arm-linux-gnueabi + clang + g++-multilib + gcc-multilib + cppcheck + + - name: check results consistency + script: + - CPPFLAGS=-DXXH_VECTOR=0 make check # Scalar code + - make clean + - CPPFLAGS=-DXXH_VECTOR=1 make check # SSE2 code path + - make clean + - CPPFLAGS="-mavx2 -DXXH_VECTOR=2" make check # AVX2 code path diff --git a/Makefile b/Makefile index 1b5ee6a4..c0422d07 100644 --- a/Makefile +++ b/Makefile @@ -118,7 +118,9 @@ libxxhash : $(LIBXXH) lib: libxxhash.a libxxhash +# ================================================= # tests +# ================================================= .PHONY: check check: xxhsum From c76d96454b5ec417872db364883014b61ca6f105 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 12 Mar 2019 11:44:44 -0700 Subject: [PATCH 52/73] xxh3: fixed declaration after statement in AVX2 path also : - added header license - fixed alignment declaration --- xxh3.h | 92 
++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 25 deletions(-) diff --git a/xxh3.h b/xxh3.h index 88b5fdf3..420c463e 100644 --- a/xxh3.h +++ b/xxh3.h @@ -1,3 +1,42 @@ +/* + xxHash - Extremely Fast Hash algorithm + Development source file for `xxh3` + Copyright (C) 2019-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + +/* Note : + This file is separated for development purposes. + It will be integrated into `xxhash.c` when development phase is complete. 
+*/ + #ifndef XXH3_H #define XXH3_H @@ -300,26 +339,27 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k #if (XXH_VECTOR == XXH_AVX2) assert(((size_t)acc) & 31 == 0); - { __m256i* const xacc = (__m256i *) acc; - const __m256i* const xdata = (const __m256i *) data; - ALIGN(32) const __m256i* const xkey = (const __m256i *) key; + { ALIGN(32) __m256i* const xacc = (__m256i *) acc; + const __m256i* const xdata = (const __m256i *) data; + const __m256i* const xkey = (const __m256i *) key; - for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { __m256i const d = _mm256_loadu_si256 (xdata+i); __m256i const k = _mm256_loadu_si256 (xkey+i); __m256i const dk = _mm256_add_epi32 (d,k); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */ - __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31)); /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */ - xacc[i] = _mm256_add_epi64(res, xacc[i]); - xacc[i] = _mm256_add_epi64(d, xacc[i]); + __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31)); /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */ + __m256i const add = _mm256_add_epi64(d, xacc[i]); + xacc[i] = _mm256_add_epi64(res, add); } } #elif (XXH_VECTOR == XXH_SSE2) assert(((size_t)acc) & 15 == 0); - { __m128i* const xacc = (__m128i *) acc; - const __m128i* const xdata = (const __m128i *) data; - ALIGN(16) const __m128i* const xkey = (const __m128i *) key; + { ALIGN(16) __m128i* const xacc = (__m128i *) acc; + const __m128i* const xdata = (const __m128i *) data; + const __m128i* const xkey = (const __m128i *) key; size_t i; for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { @@ -327,21 +367,21 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k __m128i const k = _mm_loadu_si128 (xkey+i); __m128i const dk = _mm_add_epi32 (d,k); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ __m128i const res = _mm_mul_epu32 (dk, 
_mm_shuffle_epi32 (dk, 0x31)); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ - xacc[i] = _mm_add_epi64(res, xacc[i]); - xacc[i] = _mm_add_epi64(d, xacc[i]); + __m128i const add = _mm_add_epi64(d, xacc[i]); + xacc[i] = _mm_add_epi64(res, add); } } #elif (XXH_VECTOR == XXH_NEON) /* note : no longer correct, must be updated to match new formula */ assert(((size_t)acc) & 15 == 0); - { uint64x2_t* const xacc = (uint64x2_t *)acc; - const uint32_t* const xdata = (const uint32_t *)data; - ALIGN(16) const uint32_t* const xkey = (const uint32_t *)key; + { uint64x2_t* const xacc = (uint64x2_t *)acc; + const uint32_t* const xdata = (const uint32_t *)data; + const uint32_t* const xkey = (const uint32_t *)key; size_t i; for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { -#if !defined(__aarch64__) && !defined(__arm64__) && !defined(XXH_NO_ARM32_HACK) +# if !defined(__aarch64__) && !defined(__arm64__) && !defined(XXH_NO_ARM32_HACK) /* On 32-bit ARM, we can take advantage of the packed registers. * This is not portable to aarch64! * Basically, on 32-bit NEON, registers are stored like so: @@ -374,7 +414,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k * does not. 
*/ uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k); xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk)); -#else +# else /* Portable, but slightly slower version */ uint32x2x2_t const d = vld2_u32(xdata + i * 4); uint32x2x2_t const k = vld2_u32(xkey + i * 4); @@ -382,10 +422,11 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ /* xacc must be aligned on 16 bytes boundaries */ xacc[i] = vmlal_u32(xacc[i], dkL, dkH); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ -#endif +# endif } } -#else /* scalar variant */ + +#else /* scalar variant - universal */ U64* const xacc = (U64*) acc; const U32* const xdata = (const U32*) data; @@ -407,10 +448,11 @@ static void XXH3_scrambleAcc(void* acc, const void* key) #if (XXH_VECTOR == XXH_AVX2) assert(((size_t)acc) & 31 == 0); - { __m256i* const xacc = (__m256i*) acc; - const __m256i* const xkey = (const __m256i *) key; + { ALIGN(32) __m256i* const xacc = (__m256i*) acc; + const __m256i* const xkey = (const __m256i *) key; - for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { __m256i data = xacc[i]; __m256i const shifted = _mm256_srli_epi64(data, 47); data = _mm256_xor_si256(data, shifted); @@ -429,8 +471,8 @@ static void XXH3_scrambleAcc(void* acc, const void* key) #elif (XXH_VECTOR == XXH_SSE2) assert(((size_t)acc) & 15 == 0); - { __m128i* const xacc = (__m128i*) acc; - const __m128i* const xkey = (const __m128i *) key; + { ALIGN(16) __m128i* const xacc = (__m128i*) acc; + const __m128i* const xkey = (const __m128i *) key; size_t i; for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { @@ -478,7 +520,7 @@ static void XXH3_scrambleAcc(void* acc, const void* key) } } } -#else /* scalar variant */ +#else /* scalar variant - universal */ U64* const xacc = (U64*) acc; const U32* const xkey = (const U32*) key; 
From b74c215b363362cd94a031dc0eedd92e109eae50 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 12 Mar 2019 12:00:30 -0700 Subject: [PATCH 53/73] try to fix travis install script --- .travis.yml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 29923cd2..4a53c846 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,16 +6,15 @@ matrix: - name: General linux tests (Xenial) dist: xenial + before_install: + - sudo apt-get update -qq + - sudo apt-get install -qq gcc-arm-linux-gnueabi + - sudo apt-get install -qq clang + - sudo apt-get install -qq g++-multilib + - sudo apt-get install -qq gcc-multilib + - sudo apt-get install -qq cppcheck script: - make -B test-all - install: - - sudo apt-get update -qq - - sudo apt-get install -qq - gcc-arm-linux-gnueabi - clang - g++-multilib - gcc-multilib - cppcheck - name: check results consistency script: From 30c8fb59c55ccfdcd08bfcd447df53e8b2de485e Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 12 Mar 2019 12:44:42 -0700 Subject: [PATCH 54/73] added ARM tests on travis --- .travis.yml | 22 +++++++++++++++++++++- Makefile | 10 ++++++---- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4a53c846..f93b240d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,10 +16,30 @@ matrix: script: - make -B test-all - - name: check results consistency + - name: Check results consistency on x64 script: - CPPFLAGS=-DXXH_VECTOR=0 make check # Scalar code - make clean - CPPFLAGS=-DXXH_VECTOR=1 make check # SSE2 code path - make clean - CPPFLAGS="-mavx2 -DXXH_VECTOR=2" make check # AVX2 code path + + - name: ARM + aarch64 compilation + install: + - sudo apt-get install -qq + qemu-system-arm + qemu-user-static + gcc-arm-linux-gnueabi + libc6-dev-armel-cross + gcc-aarch64-linux-gnu + libc6-dev-arm64-cross + script: + - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static make check # Scalar code path + 
- make clean + - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check # NEON code path + - make clean + # aarch64 + - CC=aarch64-linux-gnu-gcc RUN_ENV=qemu-aarch64-static CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static make check # Scalar code path + - make clean + - CC=aarch64-linux-gnu-gcc RUN_ENV=qemu-aarch64-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check # NEON code path + - make clean diff --git a/Makefile b/Makefile index c0422d07..df4319ab 100644 --- a/Makefile +++ b/Makefile @@ -122,16 +122,18 @@ lib: libxxhash.a libxxhash # tests # ================================================= +# make check can be run with cross-compiled binaries on emulated environments (qemu user mode) +# by setting $(RUN_ENV) to the target emulation environment .PHONY: check check: xxhsum # stdin - ./xxhsum < xxhash.c + $(RUN_ENV) ./xxhsum < xxhash.c # multiple files - ./xxhsum xxhash.* xxhsum.* + $(RUN_ENV) ./xxhsum xxhash.* xxhsum.* # internal bench - ./xxhsum -bi1 + $(RUN_ENV) ./xxhsum -bi1 # file bench - ./xxhsum -bi1 xxhash.c + $(RUN_ENV) ./xxhsum -bi1 xxhash.c .PHONY: test-mem test-mem: xxhsum From 51ac7dc7e9289bd782201e48c8c3534fcf0f0555 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 12 Mar 2019 12:56:52 -0700 Subject: [PATCH 55/73] fixed minor conversion warning detected on ARM 32-bit --- .travis.yml | 5 +++-- xxh3.h | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index f93b240d..7598ae86 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,13 +18,13 @@ matrix: - name: Check results consistency on x64 script: - - CPPFLAGS=-DXXH_VECTOR=0 make check # Scalar code + - CPPFLAGS=-DXXH_VECTOR=0 make check # Scalar code path - make clean - CPPFLAGS=-DXXH_VECTOR=1 make check # SSE2 code path - make clean - CPPFLAGS="-mavx2 -DXXH_VECTOR=2" make check # AVX2 code path - - name: ARM + aarch64 compilation + - name: ARM + aarch64 compilation and consistency checks install: - sudo 
apt-get install -qq qemu-system-arm @@ -34,6 +34,7 @@ matrix: gcc-aarch64-linux-gnu libc6-dev-arm64-cross script: + # arm (32-bit) - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static make check # Scalar code path - make clean - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check # NEON code path diff --git a/xxh3.h b/xxh3.h index 420c463e..a019ce0e 100644 --- a/xxh3.h +++ b/xxh3.h @@ -230,8 +230,8 @@ XXH3_mul128(U64 ll1, U64 ll2) #else /* Portable scalar version */ /* emulate 64x64->128b multiplication, using four 32x32->64 */ - U32 const h1 = ll1 >> 32; - U32 const h2 = ll2 >> 32; + U32 const h1 = (U32)(ll1 >> 32); + U32 const h2 = (U32)(ll2 >> 32); U32 const l1 = (U32)ll1; U32 const l2 = (U32)ll2; @@ -375,7 +375,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k #elif (XXH_VECTOR == XXH_NEON) /* note : no longer correct, must be updated to match new formula */ assert(((size_t)acc) & 15 == 0); - { uint64x2_t* const xacc = (uint64x2_t *)acc; + { uint64x2_t* const xacc = (uint64x2_t *)acc; const uint32_t* const xdata = (const uint32_t *)data; const uint32_t* const xkey = (const uint32_t *)key; From a767eaa074dffedeba62cebc4bc2f98872c0e711 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 12 Mar 2019 13:58:26 -0700 Subject: [PATCH 56/73] added PowerPC tests on TravisCI will be useful to check endianess. 
--- .travis.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.travis.yml b/.travis.yml index 7598ae86..d0cd4e70 100644 --- a/.travis.yml +++ b/.travis.yml @@ -44,3 +44,12 @@ matrix: - make clean - CC=aarch64-linux-gnu-gcc RUN_ENV=qemu-aarch64-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check # NEON code path - make clean + + - name: PowerPC + PPC64 compilation and consistency checks + install: + - sudo apt-get install -qq qemu-system-ppc qemu-user-static gcc-powerpc-linux-gnu + script: + - CC=powerpc-linux-gnu-gcc RUN_ENV=qemu-ppc-static CPPFLAGS=-m32 LDFLAGS=-static make check # Only scalar code path available + - make clean + - CC=powerpc-linux-gnu-gcc RUN_ENV=qemu-ppc64-static CPPFLAGS=-m64 LDFLAGS=-static make check # Only scalar code path available + - make clean From 3fe53a4ab97df94fdb9d97f2b0b77649e321a604 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 12 Mar 2019 14:21:24 -0700 Subject: [PATCH 57/73] fixed endianess issue --- .travis.yml | 2 +- xxh3.h | 125 ++++++++++++++++++++++++++-------------------- xxhash.c | 139 +++++++++++++++++++++++----------------------------- 3 files changed, 135 insertions(+), 131 deletions(-) diff --git a/.travis.yml b/.travis.yml index d0cd4e70..075a947a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -51,5 +51,5 @@ matrix: script: - CC=powerpc-linux-gnu-gcc RUN_ENV=qemu-ppc-static CPPFLAGS=-m32 LDFLAGS=-static make check # Only scalar code path available - make clean - - CC=powerpc-linux-gnu-gcc RUN_ENV=qemu-ppc64-static CPPFLAGS=-m64 LDFLAGS=-static make check # Only scalar code path available + - CC=powerpc-linux-gnu-gcc RUN_ENV=qemu-ppc64-static CFLAGS="-O3 -m64" LDFLAGS="-static -m64" make check # Only scalar code path available - make clean diff --git a/xxh3.h b/xxh3.h index a019ce0e..3ffa6825 100644 --- a/xxh3.h +++ b/xxh3.h @@ -253,7 +253,7 @@ XXH3_mul128(U64 ll1, U64 ll2) } -static XXH64_hash_t XXH64_avalanche2(U64 h64) +static XXH64_hash_t XXH3_avalanche(U64 h64) { h64 ^= h64 >> 29; h64 *= 
PRIME64_3; @@ -265,6 +265,7 @@ static XXH64_hash_t XXH64_avalanche2(U64 h64) /* ========================================== * Short keys * ========================================== */ + XXH_FORCE_INLINE XXH64_hash_t XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed) { @@ -278,11 +279,10 @@ XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t U32 const l1 = (U32)(c1) + ((U32)(c2) << 8); U32 const l2 = (U32)(len) + ((U32)(c3) << 2); U64 const ll11 = XXH_mult32to64((l1 + seed + key32[0]), (l2 + key32[1])); - return XXH64_avalanche2(ll11); + return XXH3_avalanche(ll11); } } - XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed) { @@ -290,10 +290,22 @@ XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t assert(len >= 4 && len <= 8); { const U32* const key32 = (const U32*) keyPtr; U64 acc = PRIME64_1 * (len + seed); - U32 const l1 = XXH_read32(data) + key32[0]; - U32 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1]; + U32 const l1 = XXH_readLE32(data) + key32[0]; + U32 const l2 = XXH_readLE32((const BYTE*)data + len - 4) + key32[1]; acc += XXH_mult32to64(l1, l2); - return XXH64_avalanche2(acc); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE U64 +XXH3_readKey64(const void* ptr) +{ + assert(((size_t)ptr & 7) == 0); /* aligned on 8-bytes boundaries */ + if (XXH_CPU_LITTLE_ENDIAN) { + return *(const U64*)ptr; + } else { + const U32* const ptr32 = (const U32*)ptr; + return (U64)ptr32[0] + (((U64)ptr32[1]) << 32); } } @@ -305,10 +317,10 @@ XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_ assert(len >= 9 && len <= 16); { const U64* const key64 = (const U64*) keyPtr; U64 acc = PRIME64_1 * (len + seed); - U64 const ll1 = XXH_read64(data) + key64[0]; - U64 const ll2 = XXH_read64((const BYTE*)data + len - 8) + key64[1]; + U64 const ll1 = XXH_readLE64(data) + 
XXH3_readKey64(key64); + U64 const ll2 = XXH_readLE64((const BYTE*)data + len - 8) + XXH3_readKey64(key64+1); acc += XXH3_mul128(ll1, ll2); - return XXH64_avalanche2(acc); + return XXH3_avalanche(acc); } } @@ -325,9 +337,7 @@ XXH3_len_0to16_64b(const void* data, size_t len, XXH64_hash_t seed) } -/* ========================================== - * Long keys - * ========================================== */ +/* === Long Keys === */ #define STRIPE_LEN 64 #define STRIPE_ELTS (STRIPE_LEN / sizeof(U32)) @@ -428,7 +438,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k #else /* scalar variant - universal */ - U64* const xacc = (U64*) acc; + U64* const xacc = (U64*) acc; /* presumed aligned */ const U32* const xdata = (const U32*) data; const U32* const xkey = (const U32*) key; @@ -436,8 +446,10 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k for (i=0; i < (int)ACC_NB; i++) { int const left = 2*i; int const right= 2*i + 1; - xacc[i] += XXH_mult32to64(xdata[left] + xkey[left], xdata[right] + xkey[right]); - xacc[i] += xdata[left] + ((U64)xdata[right] << 32); + U32 const dataLeft = XXH_readLE32(xdata + left); + U32 const dataRight = XXH_readLE32(xdata + right); + xacc[i] += XXH_mult32to64(dataLeft + xkey[left], dataRight + xkey[right]); + xacc[i] += dataLeft + ((U64)dataRight << 32); } #endif @@ -531,8 +543,8 @@ static void XXH3_scrambleAcc(void* acc, const void* key) int const right= 2*i + 1; xacc[i] ^= xacc[i] >> 47; - { U64 p1 = XXH_mult32to64(xacc[i] & 0xFFFFFFFF, xkey[left]); - U64 p2 = XXH_mult32to64(xacc[i] >> 32, xkey[right]); + { U64 const p1 = XXH_mult32to64(xacc[i] & 0xFFFFFFFF, xkey[left]); + U64 const p2 = XXH_mult32to64(xacc[i] >> 32, xkey[right]); xacc[i] = p1 ^ p2; } } @@ -548,24 +560,6 @@ static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* rest } } -XXH_FORCE_INLINE U64 XXH3_mix16B(const void* data, const U64* key) -{ - return XXH3_mul128((XXH_read64(data) ^ key[0]), 
XXH_read64((const BYTE*)data+8) ^ key[1]); -} - -static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 start) -{ - const U64* const key = (const U64*)keyVoid; /* presumed aligned */ - - U64 acc = start; - acc += XXH3_mix16B(data+0, key+0); - acc += XXH3_mix16B(data+2, key+2); - acc += XXH3_mix16B(data+4, key+4); - acc += XXH3_mix16B(data+6, key+6); - - return XXH64_avalanche2(acc); -} - static void XXH3_hashLong(U64* acc, const void* data, size_t len) { @@ -593,6 +587,35 @@ XXH3_hashLong(U64* acc, const void* data, size_t len) } } } + +XXH_FORCE_INLINE U64 XXH3_mix16B(const void* data, const void* key) +{ + const U64* const key64 = (const U64*)key; + return XXH3_mul128( + XXH_readLE64(data) ^ XXH3_readKey64(key64), + XXH_readLE64((const BYTE*)data+8) ^ XXH3_readKey64(key64+1) ); +} + +XXH_FORCE_INLINE U64 XXH3_mix2Accs(const U64* acc, const void* key) +{ + const U64* const key64 = (const U64*)key; + return XXH3_mul128( + acc[0] ^ XXH3_readKey64(key64), + acc[1] ^ XXH3_readKey64(key64+1) ); +} + +static XXH64_hash_t XXH3_mergeAccs(const U64* acc, const U32* key, U64 start) +{ + U64 result64 = start; + + result64 += XXH3_mix2Accs(acc+0, key+0); + result64 += XXH3_mix2Accs(acc+2, key+4); + result64 += XXH3_mix2Accs(acc+4, key+8); + result64 += XXH3_mix2Accs(acc+6, key+12); + + return XXH3_avalanche(result64); +} + __attribute__((noinline)) static XXH64_hash_t /* It's important for performance that XXH3_hashLong is not inlined. 
Not sure why (uop cache maybe ?), but difference is large and easily measurable */ XXH3_hashLong_64b(const void* data, size_t len, XXH64_hash_t seed) { @@ -602,13 +625,11 @@ XXH3_hashLong_64b(const void* data, size_t len, XXH64_hash_t seed) /* converge into final hash */ assert(sizeof(acc) == 64); - return XXH3_merge64B(acc, kKey, (U64)len * PRIME64_1); + return XXH3_mergeAccs(acc, kKey, (U64)len * PRIME64_1); } -/* ========================================== - * Public entry point - * ========================================== */ +/* === Public entry point === */ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed) @@ -640,7 +661,7 @@ XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed) acc += XXH3_mix16B(p+0, key+0); acc += XXH3_mix16B(p+len-16, key+2); - return XXH64_avalanche2(acc); + return XXH3_avalanche(acc); } } @@ -674,7 +695,7 @@ XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_ U32 const l2 = (U32)(len) + ((U32)(c3) << 2); U64 const ll11 = XXH_mult32to64(l1 + seed + key32[0], l2 + key32[1]); U64 const ll12 = XXH_mult32to64(l1 + key32[2], l2 - seed + key32[3]); - return (XXH128_hash_t) { XXH64_avalanche2(ll11), XXH64_avalanche2(ll12) }; + return (XXH128_hash_t) { XXH3_avalanche(ll11), XXH3_avalanche(ll12) }; } } @@ -687,11 +708,11 @@ XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_ { const U32* const key32 = (const U32*) keyPtr; U64 acc1 = PRIME64_1 * ((U64)len + seed); U64 acc2 = PRIME64_2 * ((U64)len - seed); - U32 const l1 = XXH_read32(data); - U32 const l2 = XXH_read32((const BYTE*)data + len - 4); + U32 const l1 = XXH_readLE32(data); + U32 const l2 = XXH_readLE32((const BYTE*)data + len - 4); acc1 += XXH_mult32to64(l1 + key32[0], l2 + key32[1]); acc2 += XXH_mult32to64(l1 - key32[2], l2 + key32[3]); - return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) }; + return (XXH128_hash_t){ XXH3_avalanche(acc1), 
XXH3_avalanche(acc2) }; } } @@ -704,11 +725,11 @@ XXH3_len_9to16_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash { const U64* const key64 = (const U64*) keyPtr; U64 acc1 = PRIME64_1 * ((U64)len + seed); U64 acc2 = PRIME64_2 * ((U64)len - seed); - U64 const ll1 = XXH_read64(data); - U64 const ll2 = XXH_read64((const BYTE*)data + len - 8); - acc1 += XXH3_mul128(ll1 + key64[0], ll2 + key64[1]); - acc2 += XXH3_mul128(ll1 + key64[2], ll2 + key64[3]); - return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) }; + U64 const ll1 = XXH_readLE64(data); + U64 const ll2 = XXH_readLE64((const BYTE*)data + len - 8); + acc1 += XXH3_mul128(ll1 + XXH3_readKey64(key64+0), ll2 + XXH3_readKey64(key64+1)); + acc2 += XXH3_mul128(ll1 + XXH3_readKey64(key64+2), ll2 + XXH3_readKey64(key64+3)); + return (XXH128_hash_t){ XXH3_avalanche(acc1), XXH3_avalanche(acc2) }; } } @@ -734,8 +755,8 @@ XXH3_hashLong_128b(const void* data, size_t len, XXH64_hash_t seed) /* converge into final hash */ assert(sizeof(acc) == 64); - { U64 const part1 = XXH3_merge64B(acc, kKey, (U64)len * PRIME64_1); - U64 const part2 = XXH3_merge64B(acc, kKey+16, ((U64)len+1) * PRIME64_2); + { U64 const part1 = XXH3_mergeAccs(acc, kKey, (U64)len * PRIME64_1); + U64 const part2 = XXH3_mergeAccs(acc, kKey+16, ((U64)len+1) * PRIME64_2); return (XXH128_hash_t) { part1, part2 }; } } @@ -772,7 +793,7 @@ XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed) { U64 const part1 = acc1 + acc2; U64 const part2 = (acc1 * PRIME64_3) + (acc2 * PRIME64_4) + ((len - seed) * PRIME64_2); - return (XXH128_hash_t) { XXH64_avalanche2(part1), -XXH64_avalanche2(part2) }; + return (XXH128_hash_t) { XXH3_avalanche(part1), -XXH3_avalanche(part2) }; } } } diff --git a/xxhash.c b/xxhash.c index c5fec9b0..82ee887b 100644 --- a/xxhash.c +++ b/xxhash.c @@ -154,6 +154,9 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcp # endif #endif + +/* === Memory access === */ + #if 
(defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ @@ -181,6 +184,22 @@ static U32 XXH_read32(const void* memPtr) #endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ +/* === Endianess === */ +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; + +/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ +#ifndef XXH_CPU_LITTLE_ENDIAN +static int XXH_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +#endif + + + + /* **************************************** * Compiler-specific Functions and Macros ******************************************/ @@ -210,44 +229,29 @@ static U32 XXH_swap32 (U32 x) #endif -/* ************************************* -* Architecture Macros -***************************************/ -typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; - -/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ -#ifndef XXH_CPU_LITTLE_ENDIAN -static int XXH_isLittleEndian(void) -{ - const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ - return one.c[0]; -} -# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() -#endif - - /* *************************** * Memory reads *****************************/ typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; -XXH_FORCE_INLINE U32 -XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +XXH_FORCE_INLINE U32 XXH_readLE32(const void* ptr) { - if (align==XXH_unaligned) - return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); - else - return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); } -XXH_FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) +static U32 XXH_readBE32(const void* ptr) { - return XXH_readLE32_align(ptr, endian, XXH_unaligned); + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); } -static U32 XXH_readBE32(const void* ptr) +XXH_FORCE_INLINE U32 +XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) { - return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); + } } @@ -492,8 +496,8 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int s } -XXH_FORCE_INLINE XXH_errorcode -XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) { if (input==NULL) #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) @@ -517,10 +521,10 @@ XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_end if (state->memsize) { /* some data left from previous update */ XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); { const U32* p32 = state->mem32; - state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++; - state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++; - state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++; - state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); } p += 16-state->memsize; state->memsize = 0; @@ -534,10 +538,10 @@ 
XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_end U32 v4 = state->v4; do { - v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4; - v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4; - v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4; - v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4; + v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; } while (p<=limit); state->v1 = v1; @@ -556,17 +560,6 @@ XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_end } -XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_update_endian(state_in, input, len, XXH_littleEndian); - else - return XXH32_update_endian(state_in, input, len, XXH_bigEndian); -} - - XXH_FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) { @@ -686,23 +679,23 @@ static U64 XXH_swap64 (U64 x) } #endif -XXH_FORCE_INLINE U64 -XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +XXH_FORCE_INLINE U64 XXH_readLE64(const void* ptr) { - if (align==XXH_unaligned) - return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); - else - return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr); + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); } -XXH_FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) +static U64 XXH_readBE64(const void* ptr) { - return XXH_readLE64_align(ptr, endian, XXH_unaligned); + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); } -static U64 XXH_readBE64(const void* ptr) +XXH_FORCE_INLINE U64 +XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) { - return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr); } @@ -953,8 +946,8 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long return XXH_OK; } -XXH_FORCE_INLINE XXH_errorcode -XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) { if (input==NULL) #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) @@ -976,10 +969,10 @@ XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_en if (state->memsize) { /* tmp buffer is full */ XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize); - state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian)); - state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian)); - state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian)); - state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian)); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); p += 32-state->memsize; state->memsize = 0; } @@ -992,10 +985,10 @@ XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_en U64 v4 = state->v4; do { - v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8; - v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8; - v3 = XXH64_round(v3, XXH_readLE64(p, 
endian)); p+=8; - v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8; + v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; } while (p<=limit); state->v1 = v1; @@ -1013,16 +1006,6 @@ XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_en return XXH_OK; } -XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_update_endian(state_in, input, len, XXH_littleEndian); - else - return XXH64_update_endian(state_in, input, len, XXH_bigEndian); -} - XXH_FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) { U64 h64; @@ -1077,7 +1060,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src /* ******************************************************************* * XXH3 * New generation hash designed for speed on small keys and vectorization -*********************************************************************/ +********************************************************************** */ #include "xxh3.h" From e6433e8dfda583809bbc7f9ce733331827a62c63 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 12 Mar 2019 17:36:37 -0700 Subject: [PATCH 58/73] restored clang #pragma unroll statement that has been accidentally lost in an update. 
--- .travis.yml | 8 ++++---- xxh3.h | 5 ++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 075a947a..78d918d7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,14 +35,14 @@ matrix: libc6-dev-arm64-cross script: # arm (32-bit) - - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static make check # Scalar code path + - CC=arm-linux-gnueabi-gcc CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static RUN_ENV=qemu-arm-static make check # Scalar code path - make clean - - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check # NEON code path + - CC=arm-linux-gnueabi-gcc CPPFLAGS=-DXXH_VECTOR=3 CFLAGS="-O3 -march=armv7-a -mfloat-abi=hard -mfpu=neon" LDFLAGS=-static RUN_ENV=qemu-arm-static make check # NEON code path - make clean # aarch64 - - CC=aarch64-linux-gnu-gcc RUN_ENV=qemu-aarch64-static CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static make check # Scalar code path + - CC=aarch64-linux-gnu-gcc CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static RUN_ENV=qemu-aarch64-static make check # Scalar code path - make clean - - CC=aarch64-linux-gnu-gcc RUN_ENV=qemu-aarch64-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check # NEON code path + - CC=aarch64-linux-gnu-gcc CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static RUN_ENV=qemu-aarch64-static make check # NEON code path - make clean - name: PowerPC + PPC64 compilation and consistency checks diff --git a/xxh3.h b/xxh3.h index 3ffa6825..0a5c4ccf 100644 --- a/xxh3.h +++ b/xxh3.h @@ -203,7 +203,6 @@ XXH3_mul128(U64 ll1, U64 ll2) && !(defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM == 0 && __TARGET_ARCH_THUMB == 4) \ && (defined(__ARM_ARCH_6T2__) || __ARM_ARCH > 6) /* ARMv6T2 or later */ - U64 t; U32 w[4] = { 0 }; U32 u[2] = { (U32)(ll1 >> 32), (U32)ll1 }; U32 v[2] = { (U32)(ll2 >> 32), (U32)ll2 }; @@ -554,6 +553,10 @@ static void XXH3_scrambleAcc(void* acc, const void* key) static void XXH3_accumulate(U64* acc, const void* restrict data, 
const U32* restrict key, size_t nbStripes) { size_t n; + /* Clang doesn't unroll this loop without the pragma. Unrolling can be up to 1.4x faster. */ +#if defined(__clang__) && !defined(__OPTIMIZE_SIZE__) +# pragma clang loop unroll(enable) +#endif for (n = 0; n < nbStripes; n++ ) { XXH3_accumulate_512(acc, (const BYTE*)data + n*STRIPE_LEN, key); key += 2; From af852ac75212661ceb764ee50081b6fe5ddddf5e Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 12 Mar 2019 17:48:59 -0700 Subject: [PATCH 59/73] fixed last strict aliasing issues --- xxh3.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/xxh3.h b/xxh3.h index 0a5c4ccf..df1b04cf 100644 --- a/xxh3.h +++ b/xxh3.h @@ -638,7 +638,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed) { const BYTE* const p = (const BYTE*)data; - const U64* const key = (const U64*)(const void*)kKey; + const char* const key = (const char*)kKey; if (len <= 16) return XXH3_len_0to16_64b(data, len, seed); @@ -648,21 +648,21 @@ XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed) if (len > 96) { if (len > 128) return XXH3_hashLong_64b(data, len, seed); - acc += XXH3_mix16B(p+48, key+12); - acc += XXH3_mix16B(p+len-64, key+14); + acc += XXH3_mix16B(p+48, key+96); + acc += XXH3_mix16B(p+len-64, key+112); } - acc += XXH3_mix16B(p+32, key+8); - acc += XXH3_mix16B(p+len-48, key+10); + acc += XXH3_mix16B(p+32, key+64); + acc += XXH3_mix16B(p+len-48, key+80); } - acc += XXH3_mix16B(p+16, key+4); - acc += XXH3_mix16B(p+len-32, key+6); + acc += XXH3_mix16B(p+16, key+32); + acc += XXH3_mix16B(p+len-32, key+48); } acc += XXH3_mix16B(p+0, key+0); - acc += XXH3_mix16B(p+len-16, key+2); + acc += XXH3_mix16B(p+len-16, key+16); return XXH3_avalanche(acc); } @@ -772,27 +772,27 @@ XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed) { U64 acc1 = PRIME64_1 * (len + seed); U64 acc2 = 0; const BYTE* const p = (const 
BYTE*)data; - const U64* const key = (const U64*)(const void*)kKey; + const char* const key = (const char*)kKey; if (len > 32) { if (len > 64) { if (len > 96) { if (len > 128) return XXH3_hashLong_128b(data, len, seed); - acc1 += XXH3_mix16B(p+48, key+12); - acc2 += XXH3_mix16B(p+len-64, key+14); + acc1 += XXH3_mix16B(p+48, key+96); + acc2 += XXH3_mix16B(p+len-64, key+112); } - acc1 += XXH3_mix16B(p+32, key+8); - acc2 += XXH3_mix16B(p+len-48, key+10); + acc1 += XXH3_mix16B(p+32, key+64); + acc2 += XXH3_mix16B(p+len-48, key+80); } - acc1 += XXH3_mix16B(p+16, key+4); - acc2 += XXH3_mix16B(p+len-32, key+6); + acc1 += XXH3_mix16B(p+16, key+32); + acc2 += XXH3_mix16B(p+len-32, key+48); } acc1 += XXH3_mix16B(p+0, key+0); - acc2 += XXH3_mix16B(p+len-16, key+2); + acc2 += XXH3_mix16B(p+len-16, key+16); { U64 const part1 = acc1 + acc2; U64 const part2 = (acc1 * PRIME64_3) + (acc2 * PRIME64_4) + ((len - seed) * PRIME64_2); From 8423e82ef82b8fe1ecd290d8d2ffa66b82c11524 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 12 Mar 2019 18:13:46 -0700 Subject: [PATCH 60/73] fixed last integration issues --- xxh3.h | 4 ---- xxhash.h | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/xxh3.h b/xxh3.h index df1b04cf..39bc09d1 100644 --- a/xxh3.h +++ b/xxh3.h @@ -679,10 +679,6 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len) /* ========================================== * XXH3 128 bits (=> XXH128) * ========================================== */ -typedef struct { - XXH64_hash_t ll1; - XXH64_hash_t ll2; -} XXH128_hash_t; XXH_FORCE_INLINE XXH128_hash_t XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed) diff --git a/xxhash.h b/xxhash.h index 5b887223..d12ba0b5 100644 --- a/xxhash.h +++ b/xxhash.h @@ -328,17 +328,31 @@ struct XXH64_state_s { * XXH3 * New experimental hash ************************************************************************/ +#ifndef XXH_NO_LONG_LONG + +typedef struct { + 
XXH64_hash_t ll1; + XXH64_hash_t ll2; +} XXH128_hash_t; + #ifdef XXH_NAMESPACE +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) # define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) # define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) #endif /* note : variant without seed produces same result as variant with seed == 0 */ +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, unsigned long long seed); XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, unsigned long long seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, unsigned long long seed); +#endif /* XXH_NO_LONG_LONG */ /*-********************************************************************** From 79014872e9b8d7165b8c88b412bcc880f21370a5 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 12 Mar 2019 18:27:32 -0700 Subject: [PATCH 61/73] separating ARM tests --- .travis.yml | 1 - Makefile | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 78d918d7..ce23c918 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,6 @@ matrix: dist: xenial before_install: - sudo apt-get update -qq - - sudo apt-get install -qq gcc-arm-linux-gnueabi - sudo apt-get install -qq clang - sudo apt-get install -qq g++-multilib - sudo apt-get install -qq gcc-multilib diff --git a/Makefile b/Makefile index df4319ab..72e04f82 100644 --- a/Makefile +++ b/Makefile @@ -227,7 +227,7 @@ preview-man: clean-man man test: all namespaceTest check test-xxhsum-c c90test test-all: CFLAGS += -Werror -test-all: test test32 armtest clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze cppcheck +test-all: 
test test32 clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze cppcheck .PHONY: listL120 listL120: # extract lines >= 120 characters in *.{c,h}, by Takayuki Matsuoka (note : $$, for Makefile compatibility) From c1ae3287a17d1a123e9bd717806cba10a90420ee Mon Sep 17 00:00:00 2001 From: "easyaspi314 (Devin)" Date: Tue, 12 Mar 2019 22:20:45 -0400 Subject: [PATCH 62/73] Update ARM NEON code The NEON algorithms have now been updated to match the SSE2 algorithm. --- xxh3.h | 108 +++++++++++++++++++++++++++------------------------------ 1 file changed, 51 insertions(+), 57 deletions(-) diff --git a/xxh3.h b/xxh3.h index 39bc09d1..2dc61a35 100644 --- a/xxh3.h +++ b/xxh3.h @@ -208,17 +208,32 @@ XXH3_mul128(U64 ll1, U64 ll2) U32 v[2] = { (U32)(ll2 >> 32), (U32)ll2 }; U32 k; + /* U64 t = (U64)u[1] * (U64)v[1]; + * w[3] = t & 0xFFFFFFFF; + * k = t >> 32; */ __asm__("umull %0, %1, %2, %3" : "=r" (w[3]), "=r" (k) : "r" (u[1]), "r" (v[1])); + + /* t = (U64)u[0] * (U64)v[1] + w[2] + k; + * w[2] = t & 0xFFFFFFFF; + * k = t >> 32; */ __asm__("umaal %0, %1, %2, %3" : "+r" (w[2]), "+r" (k) : "r" (u[0]), "r" (v[1])); w[1] = k; k = 0; + + /* t = (U64)u[1] * (U64)v[0] + w[2] + k; + * w[2] = t & 0xFFFFFFFF; + * k = t >> 32; */ __asm__("umaal %0, %1, %2, %3" : "+r" (w[2]), "+r" (k) : "r" (u[1]), "r" (v[0])); + + /* t = (U64)u[0] * (U64)v[0] + w[1] + k; + * w[1] = t & 0xFFFFFFFF; + * k = t >> 32; */ __asm__("umaal %0, %1, %2, %3" : "+r" (w[1]), "+r" (k) : "r" (u[0]), "r" (v[0])); @@ -381,7 +396,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k } } -#elif (XXH_VECTOR == XXH_NEON) /* note : no longer correct, must be updated to match new formula */ +#elif (XXH_VECTOR == XXH_NEON) assert(((size_t)acc) & 15 == 0); { uint64x2_t* const xacc = (uint64x2_t *)acc; @@ -390,48 +405,31 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k size_t i; for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { -# if 
!defined(__aarch64__) && !defined(__arm64__) && !defined(XXH_NO_ARM32_HACK) - /* On 32-bit ARM, we can take advantage of the packed registers. - * This is not portable to aarch64! - * Basically, on 32-bit NEON, registers are stored like so: - * .----------------------------------. - * | q8 | // uint32x4_t - * |-----------------.----------------| - * | d16 (.val[0]) | d17 (.val[1]) | // uint32x2x2_t - * '-----------------'----------------' - * vld2.32 will store its values into two double registers, returning - * a uint32x2_t. In NEON, this will be stored in, for example, d16 and d17. - * Reinterpret cast it to a uint32x4_t and you get q8 for free - * - * On aarch64, this was changed completely. - * - * aarch64 gave us 16 more quad registers, but they also removed this behavior, - * instead matching smaller registers to the lower sections of the higher - * registers and zeroing the rest. - * .----------------------------------..---------------------------------. - * | v8.4s | v9.4s | - * |-----------------.----------------|-----------------.-----------------| - * | v8.2s (.val[0]) | | v9.2s (.val[1]) | | - * '-----------------'----------------'-----------------'-----------------' - * On aarch64, ld2 will put it into v8.2s and v9.2s. Reinterpreting - * is not going to help us here, as half of it will end up being zero. */ - - uint32x2x2_t d = vld2_u32(xdata + i * 4); /* load and swap */ - uint32x2x2_t k = vld2_u32(xkey + i * 4); - /* Not sorry about breaking the strict aliasing rule. - * Using a union causes GCC to spit out nonsense, but an alias cast - * does not. 
*/ - uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k); - xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk)); -# else - /* Portable, but slightly slower version */ - uint32x2x2_t const d = vld2_u32(xdata + i * 4); - uint32x2x2_t const k = vld2_u32(xkey + i * 4); - uint32x2_t const dkL = vadd_u32(d.val[0], k.val[0]); - uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]); /* uint32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ - /* xacc must be aligned on 16 bytes boundaries */ - xacc[i] = vmlal_u32(xacc[i], dkL, dkH); /* uint64 res[2] = {dk0*dk1,dk2*dk3} */ -# endif + uint32x4_t const d = vld1q_u32(xdata+i*4); /* U32 d[4] = xdata[i]; */ + uint32x4_t const k = vld1q_u32(xkey+i*4); /* U32 k[4] = xkey[i]; */ + uint32x4_t dk = vaddq_u32(d, k); /* U32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */ +#if !defined(__aarch64__) && !defined(__arm64__) /* ARM32-specific hack */ + /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this. + * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang + * assumes I don't want to destroy it and tries to make a copy. This slows down the code + * a lot. + * aarch64 not only uses an entirely different syntax, but it requires three + * instructions... + * ext v1.16B, v0.16B, #8 // select high bits because aarch64 can't address them directly + * zip1 v3.2s, v0.2s, v1.2s // first zip + * zip2 v2.2s, v0.2s, v1.2s // second zip + * ...to do what ARM does in one: + * vzip.32 d0, d1 // Interleave high and low bits and overwrite. */ + __asm__("vzip.32 %e0, %f0" : "+w" (dk)); /* dk = { dk0, dk2, dk1, dk3 }; */ + xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u32(d)); /* xacc[i] += (U64x2)d; */ + xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk)); /* xacc[i] += { (U64)dk0*dk1, (U64)dk2*dk3 }; */ +#else + /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. 
*/ + uint32x2_t dkL = vmovn_u64(vreinterpretq_u64_u32(dk)); /* U32 dkL[2] = dk & 0xFFFFFFFF; */ + uint32x2_t dkH = vshrn_n_u64(vreinterpretq_u64_u32(dk), 32); /* U32 dkH[2] = dk >> 32; */ + xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u32(d)); /* xacc[i] += (U64x2)d; */ + xacc[i] = vmlal_u32(xacc[i], dkL, dkH); /* xacc[i] += (U64x2)dkL*(U64x2)dkH; */ +#endif } } @@ -502,21 +500,17 @@ static void XXH3_scrambleAcc(void* acc, const void* key) } } } -#elif (XXH_VECTOR == XXH_NEON) /* note : no longer correct, must be updated to match new formula */ +#elif (XXH_VECTOR == XXH_NEON) assert(((size_t)acc) & 15 == 0); - { uint64x2_t* const xacc = (uint64x2_t*) acc; - const uint32_t* const xkey = (const uint32_t *) key; - uint64x2_t xor_p5 = vdupq_n_u64(PRIME64_5); + { uint64x2_t* const xacc = (uint64x2_t*) acc; + const uint32_t* const xkey = (const uint32_t*) key; size_t i; - /* Clang and GCC like to put NEON constant loads into the loop. */ - __asm__("" : "+w" (xor_p5)); + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { uint64x2_t data = xacc[i]; - uint64x2_t const shifted = vshrq_n_u64(data, 47); - data = veorq_u64(data, shifted); - data = veorq_u64(data, xor_p5); - + uint64x2_t const shifted = vshrq_n_u64(data, 47); /* uint64 shifted[2] = data >> 47; */ + data = veorq_u64(data, shifted); /* data ^= shifted; */ { /* shuffle: 0, 1, 2, 3 -> 0, 2, 1, 3 */ uint32x2x2_t const d = @@ -524,10 +518,10 @@ static void XXH3_scrambleAcc(void* acc, const void* key) vget_low_u32(vreinterpretq_u32_u64(data)), vget_high_u32(vreinterpretq_u32_u64(data)) ); - uint32x2x2_t const k = vld2_u32 (xkey+i*4); /* load and swap */ - uint64x2_t const dk = vmull_u32(d.val[0],k.val[0]); /* U64 dk[2] = {d0 * k0, d2 * k2} */ - uint64x2_t const dk2 = vmull_u32(d.val[1],k.val[1]); /* U64 dk2[2] = {d1 * k1, d3 * k3} */ - xacc[i] = veorq_u64(dk, dk2); /* xacc[i] = dk ^ dk2; */ + uint32x2x2_t const k = vld2_u32(xkey+i*4); /* load and swap */ + uint64x2_t const dk = vmull_u32(d.val[0],k.val[0]); /* U64 
dk[2] = {(U64)d0*k0, (U64)d2*k2} */ + uint64x2_t const dk2 = vmull_u32(d.val[1],k.val[1]); /* U64 dk2[2] = {(U64)d1*k1, (U64)d3*k3} */ + xacc[i] = veorq_u64(dk, dk2); /* xacc[i] = dk^dk2; */ } } } From ba14aed723deb8adc701cb09fa5e92d63e8bd6e7 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 13 Mar 2019 10:42:08 -0700 Subject: [PATCH 63/73] removed cppcheck from test-all this test is unreliable: dubious warning messages, and results vary depending on version. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 72e04f82..79c0ca64 100644 --- a/Makefile +++ b/Makefile @@ -227,7 +227,7 @@ preview-man: clean-man man test: all namespaceTest check test-xxhsum-c c90test test-all: CFLAGS += -Werror -test-all: test test32 clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze cppcheck +test-all: test test32 clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze .PHONY: listL120 listL120: # extract lines >= 120 characters in *.{c,h}, by Takayuki Matsuoka (note : $$, for Makefile compatibility) From d7419363d3003e78e332d237b4cb3cdee2d9d131 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 13 Mar 2019 10:47:41 -0700 Subject: [PATCH 64/73] travis: moved ARM tests to Xenial in an effort to replicate success on local Xenial VM --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index ce23c918..25f89735 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,6 +24,7 @@ matrix: - CPPFLAGS="-mavx2 -DXXH_VECTOR=2" make check # AVX2 code path - name: ARM + aarch64 compilation and consistency checks + dist: xenial install: - sudo apt-get install -qq qemu-system-arm From 2e86e206963e147b8d3bf0f60877963f252b035a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 13 Mar 2019 12:14:21 -0700 Subject: [PATCH 65/73] added list of opened questions for xxh3 --- xxhash.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) 
diff --git a/xxhash.h b/xxhash.h index d12ba0b5..6a42838a 100644 --- a/xxhash.h +++ b/xxhash.h @@ -330,12 +330,67 @@ struct XXH64_state_s { ************************************************************************/ #ifndef XXH_NO_LONG_LONG + +/* ============================================ + * XXH3 is a new hash algorithm, + * featuring vastly improved speed performance + * for both small and large inputs. + * A full speed analysis will be published, + * it requires a lot more space than this comment can handle. + * In general, expect XXH3 to run about ~2x faster on large inputs, + * and >3x faster on small ones, though exact difference depend on platform. + * + * The algorithm is portable, will generate the same hash on all platforms. + * It benefits greatly from vectorization units, but does not require it. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * The low 64-bits of the _128bits variant are the same as the _64bits variant. + * However, if only 64-bits are needed, prefer calling the _64bits variant. + * It reduces the amount of mixing, resulting in faster speed on small inputs. + * + * The XXH3 algorithm is still considered experimental. + * It's possible to use it for ephemeral data, but avoid storing long-term values for later re-use. + * While labelled experimental, the produced result can still change between versions. + * + * The API currently supports one-shot hashing only. + * The full version will include streaming capability, and canonical representation + * Long term optional feature may include custom secret keys, and secret key generation. + * + * There are still a number of opened questions that community can influence during the experimental period. + * I'm trying to list a few of them below, though don't consider this list as complete. + * + * - 128-bits output type : currently defined as a structure of 2 64-bits fields. + * That's because 128-bits values do not exist in C standard. 
+ * Note that it means that, at byte level, result is not identical depending on endianess. + * However, at field level, they are identical on all platforms. + * The canonical representation will solve the issue of identical byte-level representation across platforms, + * which is necessary for serialization. + * + * - Canonical representation : for the 64-bits variant, it's the same as XXH64() (aka big-endian). + * What should it be for the 128-bits variant ? + * Since it's no longer a scalar value, big-endian representation is no longer an obvious choice. + * One possibility : represent it as the concatenation of two 64-bits canonical representation (aka 2x big-endian) + * Another one : represent it in the same order as natural order for little-endian platforms. + * Less consistent with existing convention for XXH32/XXH64, but may be more natural for little-endian platforms. + * + * - Seed type for 128-bits variant : currently, it's a single 64-bit value, like the 64-bits variant. + * It could be argued that it's more logical to offer a 128-bit seed capability for a 128-bit hash. + * Although it's also more difficult to use, since it requires to declare and pass a structure instead of a value. + * It would either replace current choice, or add a new one. + * Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`). + * + * - Result for len==0 : Currently, the result of hashing a zero-length input is the seed. + * This mimics the behavior of a crc : in which case, a seed is effectively an accumulator, so it's not updated if input is empty. + * Consequently, by default, when no seed specified, it returns zero. That part seems okay (it used to be a request for XXH32/XXH64). + * But is it still fine to return the seed when the seed is non-zero ? + * Are there use case which would depend on this behavior, or would prefer a mixing of the seed ? 
+ */ + typedef struct { XXH64_hash_t ll1; XXH64_hash_t ll2; } XXH128_hash_t; - #ifdef XXH_NAMESPACE # define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) # define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) From aaea63b97921a3c17c2e6bbb981053c0fc13d7f3 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 13 Mar 2019 14:44:41 -0700 Subject: [PATCH 66/73] added XXH128 consistency tests --- xxhash.h | 26 ++++++++---- xxhsum.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 9 deletions(-) diff --git a/xxhash.h b/xxhash.h index 6a42838a..1dc86536 100644 --- a/xxhash.h +++ b/xxhash.h @@ -344,7 +344,7 @@ struct XXH64_state_s { * It benefits greatly from vectorization units, but does not require it. * * XXH3 offers 2 variants, _64bits and _128bits. - * The low 64-bits of the _128bits variant are the same as the _64bits variant. + * The first 64-bits field of the _128bits variant is the same as _64bits result. * However, if only 64-bits are needed, prefer calling the _64bits variant. * It reduces the amount of mixing, resulting in faster speed on small inputs. * @@ -360,21 +360,33 @@ struct XXH64_state_s { * I'm trying to list a few of them below, though don't consider this list as complete. * * - 128-bits output type : currently defined as a structure of 2 64-bits fields. - * That's because 128-bits values do not exist in C standard. + * That's because 128-bit values do not exist in C standard. * Note that it means that, at byte level, result is not identical depending on endianess. * However, at field level, they are identical on all platforms. * The canonical representation will solve the issue of identical byte-level representation across platforms, * which is necessary for serialization. + * Would there be a better representation for a 128-bit hash result ? + * Are the names of the inner 64-bit fields important ? Should they be changed ? 
* - * - Canonical representation : for the 64-bits variant, it's the same as XXH64() (aka big-endian). - * What should it be for the 128-bits variant ? + * - Canonical representation : for the 64-bit variant, canonical representation is the same as XXH64() (aka big-endian). + * What should it be for the 128-bit variant ? * Since it's no longer a scalar value, big-endian representation is no longer an obvious choice. * One possibility : represent it as the concatenation of two 64-bits canonical representation (aka 2x big-endian) - * Another one : represent it in the same order as natural order for little-endian platforms. + * Another one : represent it in the same order as natural order in the struct for little-endian platforms. * Less consistent with existing convention for XXH32/XXH64, but may be more natural for little-endian platforms. * - * - Seed type for 128-bits variant : currently, it's a single 64-bit value, like the 64-bits variant. - * It could be argued that it's more logical to offer a 128-bit seed capability for a 128-bit hash. + * - Associated functions for 128-bit hash : simple things, such as checking if 2 hashes are equal, become more difficult with struct. + * Granted, it's not terribly difficult to create a comparator, but it's still a workload. + * Would it be beneficial to declare and define a comparator function for XXH128_hash_t ? + * Are there other operations on XXH128_hash_t which would be desirable ? + * + * - Variant compatibility : The first 64-bit field of the _128bits variant is the same as the result of _64bits. + * This is not a compulsory behavior. It just felt that it "wouldn't hurt", and might even help in some (unidentified) cases. + * But it might influence the design of XXH128_hash_t, in ways which may block other possibilities. + * Good idea, bad idea ? + * + * - Seed type for 128-bits variant : currently, it's a single 64-bit value, like the 64-bit variant. 
+ * It could be argued that it's more logical to offer a 128-bit seed input parameter for a 128-bit hash. * Although it's also more difficult to use, since it requires to declare and pass a structure instead of a value. * It would either replace current choice, or add a new one. * Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`). diff --git a/xxhsum.c b/xxhsum.c index 7428b62d..1b661cd5 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -438,7 +438,7 @@ static int BMK_benchInternal(size_t keySize, U32 specificTest) * ensure results consistency accross platforms *********************************************** */ -static void BMK_checkResult32(U32 r1, U32 r2) +static void BMK_checkResult32(XXH32_hash_t r1, XXH32_hash_t r2) { static int nbTests = 1; if (r1!=r2) { @@ -448,7 +448,7 @@ static void BMK_checkResult32(U32 r1, U32 r2) nbTests++; } -static void BMK_checkResult64(U64 r1, U64 r2) +static void BMK_checkResult64(XXH64_hash_t r1, XXH64_hash_t r2) { static int nbTests = 1; if (r1!=r2) { @@ -459,6 +459,19 @@ static void BMK_checkResult64(U64 r1, U64 r2) nbTests++; } +static void BMK_checkResult128(XXH128_hash_t r1, XXH128_hash_t r2) +{ + static int nbTests = 1; + if ((r1.ll1 != r2.ll1) || (r1.ll2 != r2.ll2)) { + DISPLAY("\rERROR : Test%3i : 128-bit values non equals !!!!! 
\n", nbTests); + DISPLAY("\r { 0x%08X%08XULL, 0x%08X%08XULL } != { 0x%08X%08XULL, %08X%08XULL } \n", + (U32)(r1.ll1>>32), (U32)r1.ll1, (U32)(r1.ll2>>32), (U32)r1.ll2, + (U32)(r2.ll1>>32), (U32)r2.ll1, (U32)(r2.ll2>>32), (U32)r2.ll2 ); + exit(1); + } + nbTests++; +} + static void BMK_testSequence64(const void* sentence, size_t len, U64 seed, U64 Nresult) { @@ -494,6 +507,28 @@ static void BMK_testXXH3(const void* data, size_t len, U64 seed, U64 Nresult) } } +static void BMK_testXXH128(const void* data, size_t len, U64 seed, XXH128_hash_t Nresult) +{ + { XXH128_hash_t const Dresult = XXH3_128bits_withSeed(data, len, seed); + BMK_checkResult128(Dresult, Nresult); + + /* check that XXH128() is identical to XXH3_128bits_withSeed() */ + { XXH128_hash_t const Dresult2 = XXH128(data, len, seed); + BMK_checkResult128(Dresult2, Nresult); + } + + /* check that first field is equal to _64bits variant */ + { U64 const result64 = XXH3_64bits_withSeed(data, len, seed); + BMK_checkResult64(result64, Nresult.ll1); + } } + + /* check that the no-seed variant produces same result as seed==0 */ + if (seed == 0) { + XXH128_hash_t const Dresult = XXH3_128bits(data, len); + BMK_checkResult128(Dresult, Nresult); + } +} + static void BMK_testSequence(const void* sequence, size_t len, U32 seed, U32 Nresult) { XXH32_state_t state; @@ -574,6 +609,86 @@ static void BMK_sanityCheck(void) BMK_testXXH3(sanityBuffer,2243, 0, 0xE7C1890BDBD2B245ULL); /* 3 blocks, last stripe is overlapping */ BMK_testXXH3(sanityBuffer,2243, prime, 0x3A68386AED0C50A7ULL); /* 3 blocks, last stripe is overlapping */ + { XXH128_hash_t const expected = { 0, 0 }; + BMK_testXXH128(NULL, 0, 0, expected); /* zero-length hash is { seed, -seed } by default */ + } + { XXH128_hash_t const expected = { prime, -(U64)prime }; + BMK_testXXH128(NULL, 0, prime, expected); + } + { XXH128_hash_t const expected = { 0xE2C6D3B40D6F9203ULL, 0x82895983D246CA74ULL }; + BMK_testXXH128(sanityBuffer, 1, 0, expected); /* 1-3 */ + } + { 
XXH128_hash_t const expected = { 0xCEE5DF124E6135DCULL, 0xFA2DA0269396F32DULL }; + BMK_testXXH128(sanityBuffer, 1, prime, expected); /* 1-3 */ + } + { XXH128_hash_t const expected = { 0x585D6F8D1AAD96A2ULL, 0x2791F3B193F0AB86ULL }; + BMK_testXXH128(sanityBuffer, 6, 0, expected); /* 4-8 */ + } + { XXH128_hash_t const expected = { 0x133EC8CA1739250FULL, 0xDF3F422D70BDE07FULL }; + BMK_testXXH128(sanityBuffer, 6, prime, expected); /* 4-8 */ + } + { XXH128_hash_t const expected = { 0x0E85E122FE5356ACULL, 0xD933CC7EDF4D95DAULL }; + BMK_testXXH128(sanityBuffer, 12, 0, expected); /* 9-16 */ + } + { XXH128_hash_t const expected = { 0xE0DB5E70DA67EB16ULL, 0x114C8C76E74C669FULL }; + BMK_testXXH128(sanityBuffer, 12, prime, expected); /* 9-16 */ + } + { XXH128_hash_t const expected = { 0x6C213B15B89230C9ULL, 0x3F3AACF5F277AC02ULL }; + BMK_testXXH128(sanityBuffer, 24, 0, expected); /* 17-32 */ + } + { XXH128_hash_t const expected = { 0x71892DB847A8F53CULL, 0xD11561AC7D0F5ECDULL }; + BMK_testXXH128(sanityBuffer, 24, prime, expected); /* 17-32 */ + } + { XXH128_hash_t const expected = { 0xECED834E8E99DA1EULL, 0x0F85E76A60898313ULL }; + BMK_testXXH128(sanityBuffer, 48, 0, expected); /* 33-64 */ + } + { XXH128_hash_t const expected = { 0xA901250B336F9133ULL, 0xA35D3FB395E1DDE0ULL }; + BMK_testXXH128(sanityBuffer, 48, prime, expected); /* 33-64 */ + } + { XXH128_hash_t const expected = { 0x338B2F6E103D5B4EULL, 0x5DD1777C8FA671ABULL }; + BMK_testXXH128(sanityBuffer, 81, 0, expected); /* 65-96 */ + } + { XXH128_hash_t const expected = { 0x0718382B6D4264C3ULL, 0x1D542DAFEFA1790EULL }; + BMK_testXXH128(sanityBuffer, 81, prime, expected); /* 65-96 */ + } + { XXH128_hash_t const expected = { 0x7DE871A4FE41C90EULL, 0x786CB41C46C6B7B6ULL }; + BMK_testXXH128(sanityBuffer, 103, 0, expected); /* 97-128 */ + } + { XXH128_hash_t const expected = { 0xAD8B0B428C940A2CULL, 0xF8BA6D8B8CB05EB7ULL }; + BMK_testXXH128(sanityBuffer, 103, prime, expected); /* 97-128 */ + } + { XXH128_hash_t const expected 
= { 0x6D96AC3F415CFCFEULL, 0x947EDFA54DD68990ULL }; + BMK_testXXH128(sanityBuffer, 192, 0, expected); /* one block, ends at stripe boundary */ + } + { XXH128_hash_t const expected = { 0xE4BD30AA1673B966ULL, 0x8132EF45FF3D51F2ULL }; + BMK_testXXH128(sanityBuffer, 192, prime, expected); /* one block, ends at stripe boundary */ + } + { XXH128_hash_t const expected = { 0xB62929C362EF3BF5ULL, 0x1946A7A9E6DD3032ULL }; + BMK_testXXH128(sanityBuffer, 222, 0, expected); /* one block, last stripe is overlapping */ + } + { XXH128_hash_t const expected = { 0x2782C3C49E3FD25EULL, 0x98CE16C40C2D59F6ULL }; + BMK_testXXH128(sanityBuffer, 222, prime, expected); /* one block, last stripe is overlapping */ + } + { XXH128_hash_t const expected = { 0x802EB54C97564FD7ULL, 0x384AADF242348D00ULL }; + BMK_testXXH128(sanityBuffer,2048, 0, expected); /* two blocks, finishing at block boundary */ + } + { XXH128_hash_t const expected = { 0xC9F188CFAFDA22CDULL, 0x7936B69445BE9EEDULL }; + BMK_testXXH128(sanityBuffer,2048, prime, expected); /* two blocks, finishing at block boundary */ + } + { XXH128_hash_t const expected = { 0x16B0035F6ABC1F46ULL, 0x1F6602850A1AA7EEULL }; + BMK_testXXH128(sanityBuffer,2240, 0, expected); /* two blocks, ends at stripe boundary */ + } + { XXH128_hash_t const expected = { 0x389E68C2348B9161ULL, 0xA7D1E8C96586A052ULL }; + BMK_testXXH128(sanityBuffer,2240, prime, expected); /* two blocks, ends at stripe boundary */ + } + { XXH128_hash_t const expected = { 0x8B1DE79158C397D3ULL, 0x9B6B2EEFAC2DE0ADULL }; + BMK_testXXH128(sanityBuffer,2237, 0, expected); /* two blocks, ends at stripe boundary */ + } + { XXH128_hash_t const expected = { 0x9DDF09ABA2B93DD6ULL, 0xB9CEDBE2582CA371ULL }; + BMK_testXXH128(sanityBuffer,2237, prime, expected); /* two blocks, ends at stripe boundary */ + } + + DISPLAYLEVEL(3, "\r%70s\r", ""); /* Clean display line */ DISPLAYLEVEL(3, "Sanity check -- all tests ok\n"); } From 70f9d859594f8ead8105e2264182cb47e691dc03 Mon Sep 17 00:00:00 2001 From: 
Yann Collet Date: Wed, 13 Mar 2019 15:08:04 -0700 Subject: [PATCH 67/73] minor doc edits --- xxhash.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/xxhash.h b/xxhash.h index 1dc86536..cab61fdf 100644 --- a/xxhash.h +++ b/xxhash.h @@ -390,6 +390,7 @@ struct XXH64_state_s { * Although it's also more difficult to use, since it requires to declare and pass a structure instead of a value. * It would either replace current choice, or add a new one. * Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`). + * If both 64-bit and 128-bit seeds are possible, which variant should be called XXH128 ? * * - Result for len==0 : Currently, the result of hashing a zero-length input is the seed. * This mimics the behavior of a crc : in which case, a seed is effectively an accumulator, so it's not updated if input is empty. @@ -411,12 +412,13 @@ typedef struct { # define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) #endif -/* note : variant without seed produces same result as variant with seed == 0 */ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, unsigned long long seed); -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, unsigned long long seed); + +/* note : variants without seed produce same result as variant with seed == 0 */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, unsigned long long seed); XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, unsigned long long seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, unsigned long long seed); /* == XXH128() */ #endif /* XXH_NO_LONG_LONG */ From 
2b8b68cee527f53e5bb668238bd8dc7cd484b59b Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 13 Mar 2019 15:15:37 -0700 Subject: [PATCH 68/73] disable ARM 32-bit + NEON tests does not work (yet) on Travis CI --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 25f89735..fab28664 100644 --- a/.travis.yml +++ b/.travis.yml @@ -37,7 +37,10 @@ matrix: # arm (32-bit) - CC=arm-linux-gnueabi-gcc CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static RUN_ENV=qemu-arm-static make check # Scalar code path - make clean - - CC=arm-linux-gnueabi-gcc CPPFLAGS=-DXXH_VECTOR=3 CFLAGS="-O3 -march=armv7-a -mfloat-abi=hard -mfpu=neon" LDFLAGS=-static RUN_ENV=qemu-arm-static make check # NEON code path + # Note : the following test (ARM 32-bit + NEON) is disabled for the time being. + # I haven't yet found a way to make it link on Travis CI using gcc cross-compilation. + # NEON code path is fortunately validated through `aarch64` below. + # - CC=arm-linux-gnueabi-gcc CPPFLAGS=-DXXH_VECTOR=3 CFLAGS="-O3 -march=armv7-a -mfloat-abi=hard -mfpu=neon" LDFLAGS=-static RUN_ENV=qemu-arm-static make check # NEON code path - make clean # aarch64 - CC=aarch64-linux-gnu-gcc CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static RUN_ENV=qemu-aarch64-static make check # Scalar code path From f622c806ef89352c3c91170253daa3164550e4da Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 13 Mar 2019 15:55:24 -0700 Subject: [PATCH 69/73] xxhsum: fixed benchmark on low resolution timers triggered an assert when time measured == 0 --- xxhsum.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/xxhsum.c b/xxhsum.c index 1b661cd5..12b89fb2 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -284,20 +284,29 @@ static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, while (clock() == cStart); /* starts clock() at its exact beginning */ cStart = clock(); - { U32 i; - for (i=0; i %8.0f it/s (%7.1f MB/s) \r", iterationNb, 
hName, (U32)bufferSize, (double)1 / fastestH, ((double)bufferSize / (1<<20)) / fastestH ); } - assert(fastestH > 1./2000000000); /* avoid U32 overflow */ - nbh_perIteration = (U32)(1 / fastestH) + 1; /* adjust nbh_perIteration to last roughtly one second */ + { double nbh_perSecond = (1 / fastestH) + 1; + if (nbh_perSecond > (double)(4000U<<20)) nbh_perSecond = (double)(4000U<<20); + nbh_perIteration = (U32)nbh_perSecond; + } } DISPLAYLEVEL(1, "%-19.19s : %10u -> %8.0f it/s (%7.1f MB/s) \n", hName, (U32)bufferSize, (double)1 / fastestH, From 40dbf78fa950069ff02089fb9f9961ad01a1a46c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 14 Mar 2019 13:08:38 -0700 Subject: [PATCH 70/73] renamed XXH128_hash_t members to low64 and high64 --- xxh3.h | 6 +++--- xxhash.h | 20 +++++++++++--------- xxhsum.c | 8 ++++---- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/xxh3.h b/xxh3.h index 2dc61a35..9197b680 100644 --- a/xxh3.h +++ b/xxh3.h @@ -748,9 +748,9 @@ XXH3_hashLong_128b(const void* data, size_t len, XXH64_hash_t seed) /* converge into final hash */ assert(sizeof(acc) == 64); - { U64 const part1 = XXH3_mergeAccs(acc, kKey, (U64)len * PRIME64_1); - U64 const part2 = XXH3_mergeAccs(acc, kKey+16, ((U64)len+1) * PRIME64_2); - return (XXH128_hash_t) { part1, part2 }; + { U64 const low64 = XXH3_mergeAccs(acc, kKey, (U64)len * PRIME64_1); + U64 const high64 = XXH3_mergeAccs(acc, kKey+16, ((U64)len+1) * PRIME64_2); + return (XXH128_hash_t) { low64, high64 }; } } diff --git a/xxhash.h b/xxhash.h index cab61fdf..7f3d6603 100644 --- a/xxhash.h +++ b/xxhash.h @@ -344,9 +344,10 @@ struct XXH64_state_s { * It benefits greatly from vectorization units, but does not require it. * * XXH3 offers 2 variants, _64bits and _128bits. - * The first 64-bits field of the _128bits variant is the same as _64bits result. - * However, if only 64-bits are needed, prefer calling the _64bits variant. - * It reduces the amount of mixing, resulting in faster speed on small inputs. 
+ * When only 64 bits are needed, prefer calling the _64bits variant : + * it reduces the amount of mixing, resulting in faster speed on small inputs. + * It's also generally simpler to manipulate a scalar type than a struct. + * Note : the low 64-bit field of the _128bits variant is the same as _64bits result. * * The XXH3 algorithm is still considered experimental. * It's possible to use it for ephemeral data, but avoid storing long-term values for later re-use. @@ -380,7 +381,7 @@ struct XXH64_state_s { * Would it be beneficial to declare and define a comparator function for XXH128_hash_t ? * Are there other operations on XXH128_hash_t which would be desirable ? * - * - Variant compatibility : The first 64-bit field of the _128bits variant is the same as the result of _64bits. + * - Variant compatibility : The low 64-bit field of the _128bits variant is the same as the result of _64bits. * This is not a compulsory behavior. It just felt that it "wouldn't hurt", and might even help in some (unidentified) cases. * But it might influence the design of XXH128_hash_t, in ways which may block other possibilities. * Good idea, bad idea ? @@ -399,11 +400,6 @@ struct XXH64_state_s { * Are there use case which would depend on this behavior, or would prefer a mixing of the seed ? 
*/ -typedef struct { - XXH64_hash_t ll1; - XXH64_hash_t ll2; -} XXH128_hash_t; - #ifdef XXH_NAMESPACE # define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) # define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) @@ -412,6 +408,12 @@ typedef struct { # define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) #endif + +typedef struct { + XXH64_hash_t low64; + XXH64_hash_t high64; +} XXH128_hash_t; + XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, unsigned long long seed); /* note : variants without seed produce same result as variant with seed == 0 */ diff --git a/xxhsum.c b/xxhsum.c index 12b89fb2..07b5ff29 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -471,11 +471,11 @@ static void BMK_checkResult64(XXH64_hash_t r1, XXH64_hash_t r2) static void BMK_checkResult128(XXH128_hash_t r1, XXH128_hash_t r2) { static int nbTests = 1; - if ((r1.ll1 != r2.ll1) || (r1.ll2 != r2.ll2)) { + if ((r1.low64 != r2.low64) || (r1.high64 != r2.high64)) { DISPLAY("\rERROR : Test%3i : 128-bit values non equals !!!!! 
\n", nbTests); DISPLAY("\r { 0x%08X%08XULL, 0x%08X%08XULL } != { 0x%08X%08XULL, %08X%08XULL } \n", - (U32)(r1.ll1>>32), (U32)r1.ll1, (U32)(r1.ll2>>32), (U32)r1.ll2, - (U32)(r2.ll1>>32), (U32)r2.ll1, (U32)(r2.ll2>>32), (U32)r2.ll2 ); + (U32)(r1.low64>>32), (U32)r1.low64, (U32)(r1.high64>>32), (U32)r1.high64, + (U32)(r2.low64>>32), (U32)r2.low64, (U32)(r2.high64>>32), (U32)r2.high64 ); exit(1); } nbTests++; @@ -528,7 +528,7 @@ static void BMK_testXXH128(const void* data, size_t len, U64 seed, XXH128_hash_t /* check that first field is equal to _64bits variant */ { U64 const result64 = XXH3_64bits_withSeed(data, len, seed); - BMK_checkResult64(result64, Nresult.ll1); + BMK_checkResult64(result64, Nresult.low64); } } /* check that the no-seed variant produces same result as seed==0 */ From cf5694603db5df13450918c9904108bfb7826aed Mon Sep 17 00:00:00 2001 From: "easyaspi314 (Devin)" Date: Fri, 15 Mar 2019 11:56:58 -0400 Subject: [PATCH 71/73] Improve xxhsum output message quality - xxhsum now prints more professional-looking error messages: Before: Pb opening foo After: Error: Could not open 'foo': No such file or directory. - xxhsum will now attempt to display the architecture and the compiler version in the benchmark WELCOME_MESSAGE. It detects the following compilers: - Clang - GCC - Intel Compiler - MSVC - tcc and it should detect the following architectures: - x86 (+SSE2/AVX/AVX2) - x86_64 (+SSE2/AVX/AVX2) - ARM (+NEON) - aarch64 - PowerPC 64 - PowerPC - AVR - MIPS 64 - MIPS Before: ./xxhsum 0.7.0 (64-bits little endian), by Yann Collet After: ./xxhsum 0.7.0 (64-bits x86_64 + SSE2 little endian), GCC 8.3.0, by Yann Collet - Sanity checks are consistent now and give better warning messages: Before: ERROR : Test 1 : 0x12345678 <> 0x02CC5D05 !!!!! ERROR : Test 1 : 64-bit values non equals !!!!! 0x1234567890ABCDEFULL != 0xEF46DB3751D8E999ULL After: Error: 32-bit hash test 1: Internal sanity check failed! Got 0x12345678, expected 0x02CC5D05. 
Note: If you modified the hash functions, make sure to either update the values or temporarily comment out the tests in BMK_sanityCheck. ...and the 64-bit and 128-bit messages now match. I eventually want to name the tests instead of just using the test number, but this is still better than before. - xxhsum now displays "stdin" instead of "-" when reading from stdin. --- xxhsum.c | 151 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 118 insertions(+), 33 deletions(-) diff --git a/xxhsum.c b/xxhsum.c index 07b5ff29..0ec11c01 100644 --- a/xxhsum.c +++ b/xxhsum.c @@ -44,7 +44,6 @@ # define _LARGEFILE64_SOURCE #endif - /* ************************************ * Includes **************************************/ @@ -55,6 +54,7 @@ #include /* stat, stat64, _stat64 */ #include /* clock_t, clock, CLOCKS_PER_SEC */ #include /* assert */ +#include /* errno */ #define XXH_STATIC_LINKING_ONLY /* *_state_t */ #include "xxhash.h" @@ -164,13 +164,86 @@ static unsigned BMK_isLittleEndian(void) #define QUOTE(str) #str #define EXPAND_AND_QUOTE(str) QUOTE(str) #define PROGRAM_VERSION EXPAND_AND_QUOTE(LIB_VERSION) + +/* Show compiler versions in WELCOME_MESSAGE. VERSION_FMT will return the printf specifiers, + * and VERSION will contain the comma separated list of arguments to the VERSION_FMT string. */ +#if defined(__clang_version__) +/* Clang does its own thing. */ +# ifdef __apple_build_version__ +# define VERSION_FMT ", Apple Clang %s" +# else +# define VERSION_FMT ", Clang %s" +# endif +# define VERSION __clang_version__ +#elif defined(__VERSION__) +/* GCC and ICC */ +# define VERSION_FMT ", %s" +# ifdef __INTEL_COMPILER /* icc adds its prefix */ +# define VERSION_STRING __VERSION__ +# else /* assume GCC */ +# define VERSION "GCC " __VERSION__ +# endif +#elif defined(_MSC_FULL_VER) && defined(_MSC_BUILD) +/* "For example, if the version number of the Visual C++ compiler is 15.00.20706.01, the _MSC_FULL_VER macro + * evaluates to 150020706." 
https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=vs-2017 */ +# define VERSION _MSC_FULL_VER / 10000000 % 100, _MSC_FULL_VER / 100000 % 100, _MSC_FULL_VER % 100000, _MSC_BUILD +# define VERSION_FMT ", MSVC %02i.%02i.%05i.%02i" +#elif defined(__TINYC__) +/* tcc stores its version in the __TINYC__ macro. */ +# define VERSION_FMT ", tcc %i.%i.%i" +# define VERSION __TINYC__ / 10000 % 100, __TINYC__ / 100 % 100, __TINYC__ % 100 +#else +# define VERSION_FMT "%s" +# define VERSION "" +#endif + +/* makes the next part easier */ +#if defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) +# define ARCH_X86 "x86_64" +#elif defined(__i386__) || defined(_M_X86) || defined(_M_X86_FP) +# define ARCH_X86 "i386" +#endif + +/* Try to detect the architecture. */ +#if defined(ARCH_X86) +# if defined(__AVX2__) +# define ARCH ARCH_X86 " + AVX2" +# elif defined(__AVX__) +# define ARCH ARCH_X86 " + AVX" +# elif defined(__SSE2__) +# define ARCH ARCH_X86 " + SSE2" +# else +# define ARCH ARCH_X86 +# endif +#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) +# define ARCH "aarch64" +#elif defined(__arm__) || defined(__thumb__) || defined(__thumb2__) || defined(_M_ARM) +# if defined(__ARM_NEON) || defined(__ARM_NEON__) +# define ARCH "arm + NEON" +# else +# define ARCH "arm" +# endif +#elif defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) +# define ARCH "ppc64" +#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) +# define ARCH "ppc" +#elif defined(__AVR) +# define ARCH "AVR" +#elif defined(__mips64) +# define ARCH "mips64" +#elif defined(__mips) +# define ARCH "mips" +#else +# define ARCH "unknown" +#endif + static const int g_nbBits = (int)(sizeof(void*)*8); static const char g_lename[] = "little endian"; static const char g_bename[] = "big endian"; #define ENDIAN_NAME (BMK_isLittleEndian() ? 
g_lename : g_bename) static const char author[] = "Yann Collet"; -#define WELCOME_MESSAGE(exename) "%s %s (%i-bits %s), by %s \n", \ - exename, PROGRAM_VERSION, g_nbBits, ENDIAN_NAME, author +#define WELCOME_MESSAGE(exename) "%s %s (%i-bits %s %s)" VERSION_FMT ", by %s \n", \ + exename, PROGRAM_VERSION, g_nbBits, ARCH, ENDIAN_NAME, VERSION, author #define KB *( 1<<10) #define MB *( 1<<20) @@ -350,7 +423,7 @@ static int BMK_benchMem(const void* buffer, size_t bufferSize, U32 specificTest) BMK_benchHash(localXXH3_64b, "XXH3_64b unaligned", ((const char*)buffer)+3, bufferSize); if (specificTest > 6) { - DISPLAY("benchmark mode invalid \n"); + DISPLAY("Benchmark mode invalid.\n"); return 1; } return 0; @@ -384,12 +457,12 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific /* Checks */ if (inFile==NULL){ - DISPLAY("Pb opening %s\n", inFileName); + DISPLAY("Error: Could not open '%s': %s.\n", inFileName, strerror(errno)); free(buffer); return 11; } if(!buffer) { - DISPLAY("\nError: not enough memory!\n"); + DISPLAY("\nError: Out of memory.\n"); fclose(inFile); return 12; } @@ -399,7 +472,7 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific { size_t const readSize = fread(alignedBuffer, 1, benchedSize, inFile); fclose(inFile); if(readSize != benchedSize) { - DISPLAY("\nError: problem reading file '%s' !! \n", inFileName); + DISPLAY("\nError: Could not read '%s': %s.\n", inFileName, strerror(errno)); free(buffer); return 13; } } @@ -419,7 +492,7 @@ static int BMK_benchInternal(size_t keySize, U32 specificTest) { void* const buffer = calloc(keySize+16+3, 1); if (!buffer) { - DISPLAY("\nError: not enough memory!\n"); + DISPLAY("\nError: Out of memory.\n"); return 12; } @@ -451,7 +524,10 @@ static void BMK_checkResult32(XXH32_hash_t r1, XXH32_hash_t r2) { static int nbTests = 1; if (r1!=r2) { - DISPLAY("\rERROR : Test%3i : 0x%08X <> 0x%08X !!!!! 
\n", nbTests, r1, r2); + DISPLAY("\rError: 32-bit hash test %i: Internal sanity check failed!\n", nbTests); + DISPLAY("\rGot 0x%08X, expected 0x%08X.\n", r1, r2); + DISPLAY("\rNote: If you modified the hash functions, make sure to either update the values\n" + "or temporarily comment out the tests in BMK_sanityCheck.\n"); exit(1); } nbTests++; @@ -461,8 +537,10 @@ static void BMK_checkResult64(XXH64_hash_t r1, XXH64_hash_t r2) { static int nbTests = 1; if (r1!=r2) { - DISPLAY("\rERROR : Test%3i : 64-bit values non equals !!!!! \n", nbTests); - DISPLAY("\r 0x%08X%08XULL != 0x%08X%08XULL \n", (U32)(r1>>32), (U32)r1, (U32)(r2>>32), (U32)r2); + DISPLAY("\rError: 64-bit hash test %i: Internal sanity check failed!\n", nbTests); + DISPLAY("\rGot 0x%08X%08XULL, expected 0x%08X%08XULL.\n", (U32)(r1>>32), (U32)r1, (U32)(r2>>32), (U32)r2); + DISPLAY("\rNote: If you modified the hash functions, make sure to either update the values\n" + "or temporarily comment out the tests in BMK_sanityCheck.\n"); exit(1); } nbTests++; @@ -472,10 +550,12 @@ static void BMK_checkResult128(XXH128_hash_t r1, XXH128_hash_t r2) { static int nbTests = 1; if ((r1.low64 != r2.low64) || (r1.high64 != r2.high64)) { - DISPLAY("\rERROR : Test%3i : 128-bit values non equals !!!!! 
\n", nbTests); - DISPLAY("\r { 0x%08X%08XULL, 0x%08X%08XULL } != { 0x%08X%08XULL, %08X%08XULL } \n", + DISPLAY("\rError: 128-bit hash test %i: Internal sanity check failed.\n", nbTests); + DISPLAY("\rGot { 0x%08X%08XULL, 0x%08X%08XULL }, expected { 0x%08X%08XULL, %08X%08XULL } \n", (U32)(r1.low64>>32), (U32)r1.low64, (U32)(r1.high64>>32), (U32)r1.high64, (U32)(r2.low64>>32), (U32)r2.low64, (U32)(r2.high64>>32), (U32)r2.high64 ); + DISPLAY("\rNote: If you modified the hash functions, make sure to either update the values\n" + "or temporarily comment out the tests in BMK_sanityCheck.\n"); exit(1); } nbTests++; @@ -783,19 +863,20 @@ static int BMK_hash(const char* fileName, /* Check file existence */ if (fileName == stdinName) { inFile = stdin; + fileName = "stdin"; SET_BINARY_MODE(stdin); } else inFile = fopen( fileName, "rb" ); if (inFile==NULL) { - DISPLAY( "Pb opening %s\n", fileName); + DISPLAY("Error: Could not open '%s': %s.\n", fileName, strerror(errno)); return 1; } /* Memory allocation & restrictions */ buffer = malloc(blockSize); if(!buffer) { - DISPLAY("\nError: not enough memory!\n"); + DISPLAY("\nError: Out of memory.\n"); fclose(inFile); return 1; } @@ -1104,7 +1185,7 @@ static void parseFile1(ParseFileArg* parseFileArg) if (lineNumber == 0) { /* This is unlikely happen, but md5sum.c has this * error check. 
*/ - DISPLAY("%s : too many checksum lines\n", inFileName); + DISPLAY("%s: Error: Too many checksum lines\n", inFileName); report->quit = 1; break; } @@ -1123,15 +1204,15 @@ static void parseFile1(ParseFileArg* parseFileArg) break; default: - DISPLAY("%s : %lu: unknown error\n", inFileName, lineNumber); + DISPLAY("%s:%lu: Error: Unknown error.\n", inFileName, lineNumber); break; case GetLine_exceedMaxLineLength: - DISPLAY("%s : %lu: too long line\n", inFileName, lineNumber); + DISPLAY("%s:%lu: Error: Line too long.\n", inFileName, lineNumber); break; case GetLine_outOfMemory: - DISPLAY("%s : %lu: out of memory\n", inFileName, lineNumber); + DISPLAY("%s:%lu: Error: Out of memory.\n", inFileName, lineNumber); break; } report->quit = 1; @@ -1141,7 +1222,7 @@ static void parseFile1(ParseFileArg* parseFileArg) if (parseLine(&parsedLine, parseFileArg->lineBuf) != ParseLine_ok) { report->nImproperlyFormattedLines++; if (parseFileArg->warn) { - DISPLAY("%s : %lu: improperly formatted XXHASH checksum line\n" + DISPLAY("%s:%lu: Error: Improperly formatted checksum line.\n" , inFileName, lineNumber); } continue; @@ -1152,7 +1233,7 @@ static void parseFile1(ParseFileArg* parseFileArg) report->nImproperlyFormattedLines++; report->nMixedFormatLines++; if (parseFileArg->warn) { - DISPLAY("%s : %lu: improperly formatted XXHASH checksum line (XXH32/64)\n" + DISPLAY("%s : %lu: Error: Multiple hash types in one file.\n" , inFileName, lineNumber); } continue; @@ -1195,15 +1276,15 @@ static void parseFile1(ParseFileArg* parseFileArg) switch (lineStatus) { default: - DISPLAY("%s : unknown error\n", inFileName); + DISPLAY("%s: Error: Unknown error.\n", inFileName); report->quit = 1; break; case LineStatus_failedToOpen: report->nOpenOrReadFailures++; if (!parseFileArg->statusOnly) { - DISPLAYRESULT("%s : %lu: FAILED open or read %s\n" - , inFileName, lineNumber, parsedLine.filename); + DISPLAYRESULT("%s:%lu: Could not open or read '%s': %s.\n", + inFileName, lineNumber, 
parsedLine.filename, strerror(errno)); } break; @@ -1266,13 +1347,14 @@ static int checkFile(const char* inFileName, if (inFileName == stdinName) { /* note : Since we expect text input for xxhash -c mode, * Don't set binary mode for stdin */ + inFileName = "stdin"; inFile = stdin; } else { inFile = fopen( inFileName, "rt" ); } if (inFile == NULL) { - DISPLAY( "Pb opening %s\n", inFileName); + DISPLAY("Error: Could not open '%s': %s\n", inFileName, strerror(errno)); return 0; } @@ -1297,19 +1379,22 @@ static int checkFile(const char* inFileName, /* Show error/warning messages. All messages are copied from md5sum.c */ if (report->nProperlyFormattedLines == 0) { - DISPLAY("%s: no properly formatted XXHASH checksum lines found\n", inFileName); + DISPLAY("%s: no properly formatted xxHash checksum lines found\n", inFileName); } else if (!statusOnly) { if (report->nImproperlyFormattedLines) { - DISPLAYRESULT("%lu lines are improperly formatted\n" - , report->nImproperlyFormattedLines); + DISPLAYRESULT("%lu %s are improperly formatted\n" + , report->nImproperlyFormattedLines + , report->nImproperlyFormattedLines == 1 ? "line" : "lines"); } if (report->nOpenOrReadFailures) { - DISPLAYRESULT("%lu listed files could not be read\n" - , report->nOpenOrReadFailures); + DISPLAYRESULT("%lu listed %s could not be read\n" + , report->nOpenOrReadFailures + , report->nOpenOrReadFailures == 1 ? "file" : "files"); } if (report->nMismatchedChecksums) { - DISPLAYRESULT("%lu computed checksums did NOT match\n" - , report->nMismatchedChecksums); + DISPLAYRESULT("%lu computed %s did NOT match\n" + , report->nMismatchedChecksums + , report->nMismatchedChecksums == 1 ? 
"checksum" : "checksums"); } } /* Result (exit) code logic is copied from @@ -1432,7 +1517,7 @@ static int readU32FromCharChecked(const char** stringPtr, unsigned* value) static unsigned readU32FromChar(const char** stringPtr) { unsigned result; if (readU32FromCharChecked(stringPtr, &result)) { - static const char errorMsg[] = "error: numeric value too large"; + static const char errorMsg[] = "Error: numeric value too large"; errorOut(errorMsg); } return result; From b31ca8c40feca1d1c3172c6af99c92edac87ded3 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 15 Mar 2019 09:10:54 -0700 Subject: [PATCH 72/73] removed XXH_FORCE_NATIVE_FORMAT --- README.md | 2 -- xxhash.c | 96 ++++++++++++++----------------------------------------- 2 files changed, 24 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index 6a0ad191..5213b198 100644 --- a/README.md +++ b/README.md @@ -81,8 +81,6 @@ they modify xxhash behavior. They are all disabled by default. - `XXH_CPU_LITTLE_ENDIAN` : by default, endianess is determined at compile time. It's possible to skip auto-detection and force format to little-endian, by setting this macro to 1. Setting it to 0 forces big-endian. -- `XXH_FORCE_NATIVE_FORMAT` : on big-endian systems : use native number representation. - Breaks consistency with little-endian results. - `XXH_PRIVATE_API` : same impact as `XXH_INLINE_ALL`. Name underlines that symbols will not be published on library public interface. - `XXH_NAMESPACE` : prefix all symbols with the value of `XXH_NAMESPACE`. diff --git a/xxhash.c b/xxhash.c index 82ee887b..2edd6f8a 100644 --- a/xxhash.c +++ b/xxhash.c @@ -71,18 +71,6 @@ # define XXH_ACCEPT_NULL_INPUT_POINTER 0 #endif -/*!XXH_FORCE_NATIVE_FORMAT : - * By default, xxHash library provides endian-independent Hash values, based on little-endian convention. - * Results are therefore identical for little-endian and big-endian CPU. 
- * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. - * Should endian-independence be of no importance for your application, you may set the #define below to 1, - * to improve speed for Big-endian CPU. - * This option has no impact on Little_Endian CPU. - */ -#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */ -# define XXH_FORCE_NATIVE_FORMAT 0 -#endif - /*!XXH_FORCE_ALIGN_CHECK : * This is a minor performance trick, only useful with lots of very small keys. * It means : check for aligned/unaligned input. @@ -245,12 +233,12 @@ static U32 XXH_readBE32(const void* ptr) } XXH_FORCE_INLINE U32 -XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +XXH_readLE32_align(const void* ptr, XXH_alignment align) { if (align==XXH_unaligned) { return XXH_readLE32(ptr); } else { - return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); + return XXH_CPU_LITTLE_ENDIAN ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); } } @@ -334,11 +322,10 @@ static U32 XXH32_avalanche(U32 h32) return(h32); } -#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) +#define XXH_get32bits(p) XXH_readLE32_align(p, align) static U32 -XXH32_finalize(U32 h32, const void* ptr, size_t len, - XXH_endianess endian, XXH_alignment align) +XXH32_finalize(U32 h32, const void* ptr, size_t len, XXH_alignment align) { const BYTE* p = (const BYTE*)ptr; @@ -397,8 +384,7 @@ XXH32_finalize(U32 h32, const void* ptr, size_t len, } XXH_FORCE_INLINE U32 -XXH32_endian_align(const void* input, size_t len, U32 seed, - XXH_endianess endian, XXH_alignment align) +XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_alignment align) { const BYTE* p = (const BYTE*)input; const BYTE* bEnd = p + len; @@ -433,7 +419,7 @@ XXH32_endian_align(const void* input, size_t len, U32 seed, h32 += (U32)len; - return XXH32_finalize(h32, p, len&15, endian, align); + return XXH32_finalize(h32, 
p, len&15, align); } @@ -445,21 +431,15 @@ XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int s XXH32_reset(&state, seed); XXH32_update(&state, input, len); return XXH32_digest(&state); + #else - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; if (XXH_FORCE_ALIGN_CHECK) { if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); - else - return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + return XXH32_endian_align(input, len, seed, XXH_aligned); } } - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); - else - return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); + return XXH32_endian_align(input, len, seed, XXH_unaligned); #endif } @@ -560,8 +540,7 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len) } -XXH_FORCE_INLINE U32 -XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) +XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state) { U32 h32; @@ -576,18 +555,7 @@ XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) h32 += state->total_len_32; - return XXH32_finalize(h32, state->mem32, state->memsize, endian, XXH_aligned); -} - - -XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_digest_endian(state_in, XXH_littleEndian); - else - return XXH32_digest_endian(state_in, XXH_bigEndian); + return XXH32_finalize(h32, state->mem32, state->memsize, XXH_aligned); } @@ -690,12 +658,12 @@ static U64 XXH_readBE64(const void* ptr) } XXH_FORCE_INLINE U64 
-XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +XXH_readLE64_align(const void* ptr, XXH_alignment align) { if (align==XXH_unaligned) return XXH_readLE64(ptr); else - return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr); + return XXH_CPU_LITTLE_ENDIAN ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr); } @@ -734,11 +702,10 @@ static U64 XXH64_avalanche(U64 h64) } -#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) +#define XXH_get64bits(p) XXH_readLE64_align(p, align) static U64 -XXH64_finalize(U64 h64, const void* ptr, size_t len, - XXH_endianess endian, XXH_alignment align) +XXH64_finalize(U64 h64, const void* ptr, size_t len, XXH_alignment align) { const BYTE* p = (const BYTE*)ptr; @@ -846,8 +813,7 @@ XXH64_finalize(U64 h64, const void* ptr, size_t len, } XXH_FORCE_INLINE U64 -XXH64_endian_align(const void* input, size_t len, U64 seed, - XXH_endianess endian, XXH_alignment align) +XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_alignment align) { const BYTE* p = (const BYTE*)input; const BYTE* bEnd = p + len; @@ -886,7 +852,7 @@ XXH64_endian_align(const void* input, size_t len, U64 seed, h64 += (U64) len; - return XXH64_finalize(h64, p, len, endian, align); + return XXH64_finalize(h64, p, len, align); } @@ -898,21 +864,16 @@ XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned XXH64_reset(&state, seed); XXH64_update(&state, input, len); return XXH64_digest(&state); + #else - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; if (XXH_FORCE_ALIGN_CHECK) { if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); - else - return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + return XXH64_endian_align(input, len, seed, XXH_aligned); } } 
- if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); - else - return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); + return XXH64_endian_align(input, len, seed, XXH_unaligned); + #endif } @@ -1006,7 +967,8 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len) return XXH_OK; } -XXH_FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) + +XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state) { U64 h64; @@ -1027,17 +989,7 @@ XXH_FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endian h64 += (U64) state->total_len; - return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, endian, XXH_aligned); -} - -XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_digest_endian(state_in, XXH_littleEndian); - else - return XXH64_digest_endian(state_in, XXH_bigEndian); + return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, XXH_aligned); } From 5674c6dcdd449c70ed8ed918f09ae4ef4b538460 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Fri, 15 Mar 2019 09:30:42 -0700 Subject: [PATCH 73/73] update README to present XXH3 --- README.md | 42 +++++++++++++++++++++++++++++++++--------- xxhash.c | 4 ++-- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 5213b198..323bc6f0 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ they modify xxhash behavior. They are all disabled by default. 
Calling xxhash 64-bit variant from a C program : -```c +```C #include "xxhash.h" unsigned long long calcul_hash(const void* buffer, size_t length) @@ -110,42 +110,66 @@ unsigned long long calcul_hash(const void* buffer, size_t length) ``` Using streaming variant is more involved, but makes it possible to provide data in multiple rounds : -```c +```C #include "stdlib.h" /* abort() */ #include "xxhash.h" unsigned long long calcul_hash_streaming(someCustomType handler) { + /* create a hash state */ XXH64_state_t* const state = XXH64_createState(); if (state==NULL) abort(); - size_t const bufferSize = SOME_VALUE; + size_t const bufferSize = SOME_SIZE; void* const buffer = malloc(bufferSize); if (buffer==NULL) abort(); + /* Initialize state with selected seed */ unsigned long long const seed = 0; /* or any other value */ XXH_errorcode const resetResult = XXH64_reset(state, seed); if (resetResult == XXH_ERROR) abort(); + /* Feed the state with input data, any size, any number of times */ (...) while ( /* any condition */ ) { - size_t const length = get_more_data(buffer, bufferSize, handler); /* undescribed */ - XXH_errorcode const addResult = XXH64_update(state, buffer, length); - if (addResult == XXH_ERROR) abort(); + size_t const length = get_more_data(buffer, bufferSize, handler); + XXH_errorcode const updateResult = XXH64_update(state, buffer, length); + if (updateResult == XXH_ERROR) abort(); (...) } - (...) - unsigned long long const hash = XXH64_digest(state); + /* Get the hash */ + XXH64_hash_t const hash = XXH64_digest(state); + + /* State can then be re-used; in this example, it is simply freed */ free(buffer); XXH64_freeState(state); - return hash; + return (unsigned long long)hash; } ``` +### New experimental hash algorithm + +Starting with `v0.7.0`, the library includes a new algorithm, named `XXH3`, +able to generate 64 and 128-bits hashes. 
+ +The new algorithm is much faster than its predecessors, +for both long and small inputs, +as can be observed in the following graphs : + +![XXH3, bargraph](https://github.com/Cyan4973/xxHash/releases/download/graphs/H_bandwidth_bargraph.png) + +![XXH3, latency, random size](https://github.com/Cyan4973/xxHash/releases/download/graphs/H_latency_randomS.png) + +The algorithm is currently labelled experimental, as it may change in a future version. +To access it, one needs to unlock its declaration using the macro `XXH_STATIC_LINKING_ONLY`. +It can be used for ephemeral data, and for tests, but avoid storing long-term hash values yet. +`XXH3` will be stabilized in a future version. +This period will be used to collect users' feedback. + ### Other programming languages diff --git a/xxhash.c b/xxhash.c index 2edd6f8a..0fd12ce3 100644 --- a/xxhash.c +++ b/xxhash.c @@ -1009,10 +1009,10 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src -/* ******************************************************************* +/* ********************************************************************* * XXH3 * New generation hash designed for speed on small keys and vectorization -********************************************************************** */ +************************************************************************ */ #include "xxh3.h"