From aa76d203ca89686faf819f591ed51eb238bdd0f9 Mon Sep 17 00:00:00 2001
From: ArnaudBienner <arnaud.bienner@gmail.com>
Date: Mon, 20 Aug 2018 17:19:16 +0200
Subject: [PATCH 01/73] Add syntax coloring to README.md examples

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 30318a9f..6a0ad191 100644
--- a/README.md
+++ b/README.md
@@ -100,7 +100,7 @@ they modify xxhash behavior. They are all disabled by default.
 
 Calling xxhash 64-bit variant from a C program :
 
-```
+```c
 #include "xxhash.h"
 
 unsigned long long calcul_hash(const void* buffer, size_t length)
@@ -112,7 +112,7 @@ unsigned long long calcul_hash(const void* buffer, size_t length)
 ```
 
 Using streaming variant is more involved, but makes it possible to provide data in multiple rounds :
-```
+```c
 #include "stdlib.h"   /* abort() */
 #include "xxhash.h"
 

From 52a97a1a081a0dc2a7b4d5fe56e4c0063e2fb053 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 17 Sep 2018 11:17:41 -0700
Subject: [PATCH 02/73] minor Makefile improvements

- more warnings enabled (inspired by zstd list)
- -fPIC is a CFLAGS rather than an LDFLAGS flag
(though it doesn't change the outcome, since everything is compiled in a single command line)
---
 Makefile | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index 6dd738f2..5fb3e24d 100644
--- a/Makefile
+++ b/Makefile
@@ -41,13 +41,15 @@ else
 NOSSE4 :=
 endif
 
-CFLAGS ?= -O2 $(NOSSE4)   # disables potential auto-vectorization
-CFLAGS += -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
-          -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
-          -Wstrict-prototypes -Wundef
-
-FLAGS   = $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MOREFLAGS)
-XXHSUM_VERSION=$(LIBVER)
+CFLAGS ?= -O2 $(NOSSE4) # disables potential auto-vectorization
+DEBUGFLAGS+=-Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
+            -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
+            -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
+            -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
+            -Wredundant-decls -Wstrict-overflow=5
+CFLAGS += $(DEBUGFLAGS)
+FLAGS   = $(CFLAGS) $(CPPFLAGS) $(MOREFLAGS)
+XXHSUM_VERSION = $(LIBVER)
 MD2ROFF = ronn
 MD2ROFF_FLAGS = --roff --warnings --manual="User Commands" --organization="xxhsum $(XXHSUM_VERSION)"
 
@@ -76,6 +78,7 @@ LIBXXH = libxxhash.$(SHARED_EXT_VER)
 
 
 .PHONY: default
+default: DEBUGFLAGS=
 default: lib xxhsum_and_links
 
 .PHONY: all
@@ -83,12 +86,13 @@ all: lib xxhsum xxhsum_inlinedXXH
 
 xxhsum32: CFLAGS += -m32
 xxhsum xxhsum32: xxhash.c xxhsum.c
-	$(CC) $(FLAGS) $^ -o $@$(EXT)
+	$(CC) $(FLAGS) $^ $(LDFLAGS) -o $@$(EXT)
 
 .PHONY: xxhsum_and_links
-xxhsum_and_links: xxhsum
-	ln -sf xxhsum xxh32sum
-	ln -sf xxhsum xxh64sum
+xxhsum_and_links: xxhsum xxh32sum xxh64sum
+
+xxh32sum xxh64sum: xxhsum
+	ln -sf $^ $@
 
 xxhsum_inlinedXXH: xxhsum.c
 	$(CC) $(FLAGS) -DXXH_PRIVATE_API $^ -o $@$(EXT)
@@ -103,11 +107,11 @@ libxxhash.a: xxhash.o
 
 $(LIBXXH): LDFLAGS += -shared
 ifeq (,$(filter Windows%,$(OS)))
-$(LIBXXH): LDFLAGS += -fPIC
+$(LIBXXH): CFLAGS += -fPIC
 endif
 $(LIBXXH): xxhash.c
 	@echo compiling dynamic library $(LIBVER)
-	@$(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@
+	$(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@
 	@echo creating versioned links
 	@ln -sf $@ libxxhash.$(SHARED_EXT_MAJOR)
 	@ln -sf $@ libxxhash.$(SHARED_EXT)

From fbd68c5f09a9da4662c4b58f1520cfda70e455a9 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 17 Sep 2018 11:40:06 -0700
Subject: [PATCH 03/73] updated C++ test

---
 Makefile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 5fb3e24d..857906cf 100644
--- a/Makefile
+++ b/Makefile
@@ -170,15 +170,15 @@ test-xxhsum-c: xxhsum
 
 armtest: clean
 	@echo ---- test ARM compilation ----
-	$(MAKE) xxhsum CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror -static"
+	CC=arm-linux-gnueabi-gcc MOREFLAGS="-Werror -static" $(MAKE) xxhsum
 
 clangtest: clean
 	@echo ---- test clang compilation ----
-	$(MAKE) all CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion"
+	CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion" $(MAKE) all
 
-gpptest: clean
+cxxtest: clean
 	@echo ---- test g++ compilation ----
-	$(MAKE) all CC=g++ CFLAGS="-O3 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Werror"
+	CC="$(CXX) -Wno-deprecated" $(MAKE) all CFLAGS="-O3 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Werror -fPIC"
 
 c90test: clean
 	@echo ---- test strict C90 compilation [xxh32 only] ----
@@ -213,7 +213,7 @@ preview-man: clean-man man
 
 test: all namespaceTest check test-xxhsum-c c90test
 
-test-all: test test32 armtest clangtest gpptest usan listL120 trailingWhitespace staticAnalyze
+test-all: test test32 armtest clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze
 
 .PHONY: listL120
 listL120:  # extract lines >= 120 characters in *.{c,h}, by Takayuki Matsuoka (note : $$, for Makefile compatibility)

From bce5f457b0bd05b30c4ae5ffdb975c27feb8c718 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 17 Sep 2018 12:10:03 -0700
Subject: [PATCH 04/73] fixed pointer arithmetic on NULL

---
 xxhash.c | 170 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 86 insertions(+), 84 deletions(-)

diff --git a/xxhash.c b/xxhash.c
index da06ea72..05be9b29 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -448,12 +448,9 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int s
 }
 
 
-FORCE_INLINE
-XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
+FORCE_INLINE XXH_errorcode
+XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
 {
-    const BYTE* p = (const BYTE*)input;
-    const BYTE* const bEnd = p + len;
-
     if (input==NULL)
 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
         return XXH_OK;
@@ -461,50 +458,54 @@ XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size
         return XXH_ERROR;
 #endif
 
-    state->total_len_32 += (unsigned)len;
-    state->large_len |= (len>=16) | (state->total_len_32>=16);
+    {   const BYTE* p = (const BYTE*)input;
+        const BYTE* const bEnd = p + len;
 
-    if (state->memsize + len < 16)  {   /* fill in tmp buffer */
-        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
-        state->memsize += (unsigned)len;
-        return XXH_OK;
-    }
+        state->total_len_32 += (unsigned)len;
+        state->large_len |= (len>=16) | (state->total_len_32>=16);
 
-    if (state->memsize) {   /* some data left from previous update */
-        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
-        {   const U32* p32 = state->mem32;
-            state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++;
-            state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++;
-            state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++;
-            state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian));
+        if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+            XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+            state->memsize += (unsigned)len;
+            return XXH_OK;
         }
-        p += 16-state->memsize;
-        state->memsize = 0;
-    }
-
-    if (p <= bEnd-16) {
-        const BYTE* const limit = bEnd - 16;
-        U32 v1 = state->v1;
-        U32 v2 = state->v2;
-        U32 v3 = state->v3;
-        U32 v4 = state->v4;
 
-        do {
-            v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4;
-            v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4;
-            v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4;
-            v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4;
-        } while (p<=limit);
+        if (state->memsize) {   /* some data left from previous update */
+            XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+            {   const U32* p32 = state->mem32;
+                state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++;
+                state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++;
+                state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++;
+                state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian));
+            }
+            p += 16-state->memsize;
+            state->memsize = 0;
+        }
 
-        state->v1 = v1;
-        state->v2 = v2;
-        state->v3 = v3;
-        state->v4 = v4;
-    }
+        if (p <= bEnd-16) {
+            const BYTE* const limit = bEnd - 16;
+            U32 v1 = state->v1;
+            U32 v2 = state->v2;
+            U32 v3 = state->v3;
+            U32 v4 = state->v4;
+
+            do {
+                v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4;
+                v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4;
+                v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4;
+                v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4;
+            } while (p<=limit);
+
+            state->v1 = v1;
+            state->v2 = v2;
+            state->v3 = v3;
+            state->v4 = v4;
+        }
 
-    if (p < bEnd) {
-        XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
-        state->memsize = (unsigned)(bEnd-p);
+        if (p < bEnd) {
+            XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
     }
 
     return XXH_OK;
@@ -908,12 +909,9 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long
     return XXH_OK;
 }
 
-FORCE_INLINE
-XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
+FORCE_INLINE XXH_errorcode
+XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
 {
-    const BYTE* p = (const BYTE*)input;
-    const BYTE* const bEnd = p + len;
-
     if (input==NULL)
 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
         return XXH_OK;
@@ -921,47 +919,51 @@ XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size
         return XXH_ERROR;
 #endif
 
-    state->total_len += len;
+    {   const BYTE* p = (const BYTE*)input;
+        const BYTE* const bEnd = p + len;
 
-    if (state->memsize + len < 32) {  /* fill in tmp buffer */
-        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
-        state->memsize += (U32)len;
-        return XXH_OK;
-    }
+        state->total_len += len;
 
-    if (state->memsize) {   /* tmp buffer is full */
-        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
-        state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian));
-        state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian));
-        state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian));
-        state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian));
-        p += 32-state->memsize;
-        state->memsize = 0;
-    }
-
-    if (p+32 <= bEnd) {
-        const BYTE* const limit = bEnd - 32;
-        U64 v1 = state->v1;
-        U64 v2 = state->v2;
-        U64 v3 = state->v3;
-        U64 v4 = state->v4;
+        if (state->memsize + len < 32) {  /* fill in tmp buffer */
+            XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+            state->memsize += (U32)len;
+            return XXH_OK;
+        }
 
-        do {
-            v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8;
-            v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8;
-            v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8;
-            v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8;
-        } while (p<=limit);
+        if (state->memsize) {   /* tmp buffer is full */
+            XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+            state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian));
+            state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian));
+            state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian));
+            state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian));
+            p += 32-state->memsize;
+            state->memsize = 0;
+        }
 
-        state->v1 = v1;
-        state->v2 = v2;
-        state->v3 = v3;
-        state->v4 = v4;
-    }
+        if (p+32 <= bEnd) {
+            const BYTE* const limit = bEnd - 32;
+            U64 v1 = state->v1;
+            U64 v2 = state->v2;
+            U64 v3 = state->v3;
+            U64 v4 = state->v4;
+
+            do {
+                v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8;
+                v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8;
+                v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8;
+                v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8;
+            } while (p<=limit);
+
+            state->v1 = v1;
+            state->v2 = v2;
+            state->v3 = v3;
+            state->v4 = v4;
+        }
 
-    if (p < bEnd) {
-        XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
-        state->memsize = (unsigned)(bEnd-p);
+        if (p < bEnd) {
+            XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
     }
 
     return XXH_OK;

From df35d637c4b64127b78c9f821fb3c0abaecedb82 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 17 Sep 2018 12:12:44 -0700
Subject: [PATCH 05/73] fixed minor printf formatting

---
 xxhsum.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xxhsum.c b/xxhsum.c
index 69931f72..5704f7db 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -227,7 +227,7 @@ static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer,
         U32 r=0;
         clock_t cStart;
 
-        DISPLAYLEVEL(2, "%1i-%-17.17s : %10u ->\r", iterationNb, hName, (U32)bufferSize);
+        DISPLAYLEVEL(2, "%1u-%-17.17s : %10u ->\r", iterationNb, hName, (U32)bufferSize);
         cStart = clock();
         while (clock() == cStart);   /* starts clock() at its exact beginning */
         cStart = clock();
@@ -239,7 +239,7 @@ static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer,
         if (r==0) DISPLAYLEVEL(3,".\r");  /* do something with r to avoid compiler "optimizing" away hash function */
         {   double const timeS = ((double)BMK_clockSpan(cStart) / CLOCKS_PER_SEC) / nbh_perIteration;
             if (timeS < fastestH) fastestH = timeS;
-            DISPLAYLEVEL(2, "%1i-%-17.17s : %10u -> %8.0f it/s (%7.1f MB/s) \r",
+            DISPLAYLEVEL(2, "%1u-%-17.17s : %10u -> %8.0f it/s (%7.1f MB/s) \r",
                     iterationNb, hName, (U32)bufferSize,
                     (double)1 / fastestH,
                     ((double)bufferSize / (1<<20)) / fastestH );
@@ -1140,7 +1140,7 @@ static int usage_advanced(const char* exename)
     DISPLAY( " -V, --version   : display version\n");
     DISPLAY( " -h, --help      : display long help and exit\n");
     DISPLAY( " -b  : benchmark mode \n");
-    DISPLAY( " -i# : number of iterations (benchmark mode; default %i)\n", g_nbIterations);
+    DISPLAY( " -i# : number of iterations (benchmark mode; default %u)\n", g_nbIterations);
     DISPLAY( "\n");
     DISPLAY( "The following four options are useful only when verifying checksums (-c):\n");
     DISPLAY( "--strict : don't print OK for each successfully verified file\n");

From 68652df700793b86dcbfa945f9ecdeb4c45159b5 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 17 Sep 2018 12:28:59 -0700
Subject: [PATCH 06/73] fixed ptr arithmetic on NULL

---
 xxhsum.c | 139 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 73 insertions(+), 66 deletions(-)

diff --git a/xxhsum.c b/xxhsum.c
index 5704f7db..7790458f 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -306,37 +306,40 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific
 
     for (fileIdx=0; fileIdx<nbFiles; fileIdx++) {
         const char* const inFileName = fileNamesTable[fileIdx];
-        FILE* const inFile = fopen( inFileName, "rb" );
-        size_t const benchedSize = BMK_selectBenchedSize(inFileName);
-        char* const buffer = (char*)calloc(benchedSize+16+3, 1);
-        void* const alignedBuffer = (buffer+15) - (((size_t)(buffer+15)) & 0xF);  /* align on next 16 bytes */
-
-        /* Checks */
-        if ((inFile==NULL) || (inFileName==NULL)) {
-            DISPLAY("Pb opening %s\n", inFileName);
-            free(buffer);
-            return 11;
-        }
-        if(!buffer) {
-            DISPLAY("\nError: not enough memory!\n");
-            fclose(inFile);
-            return 12;
-        }
-
-        /* Fill input buffer */
-        DISPLAYLEVEL(1, "\rLoading %s...        \n", inFileName);
-        {   size_t const readSize = fread(alignedBuffer, 1, benchedSize, inFile);
-            fclose(inFile);
-            if(readSize != benchedSize) {
-                DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
+        assert(inFileName != NULL);
+        {
+            FILE* const inFile = fopen( inFileName, "rb" );
+            size_t const benchedSize = BMK_selectBenchedSize(inFileName);
+            char* const buffer = (char*)calloc(benchedSize+16+3, 1);
+            void* const alignedBuffer = (buffer+15) - (((size_t)(buffer+15)) & 0xF);  /* align on next 16 bytes */
+
+            /* Checks */
+            if (inFile==NULL){
+                DISPLAY("Pb opening %s\n", inFileName);
                 free(buffer);
-                return 13;
-        }   }
+                return 11;
+            }
+            if(!buffer) {
+                DISPLAY("\nError: not enough memory!\n");
+                fclose(inFile);
+                return 12;
+            }
 
-        /* bench */
-        result |= BMK_benchMem(alignedBuffer, benchedSize, specificTest);
+            /* Fill input buffer */
+            DISPLAYLEVEL(1, "\rLoading %s...        \n", inFileName);
+            {   size_t const readSize = fread(alignedBuffer, 1, benchedSize, inFile);
+                fclose(inFile);
+                if(readSize != benchedSize) {
+                    DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
+                    free(buffer);
+                    return 13;
+            }   }
 
-        free(buffer);
+            /* bench */
+            result |= BMK_benchMem(alignedBuffer, benchedSize, specificTest);
+
+            free(buffer);
+        }
     }
 
     return result;
@@ -347,24 +350,26 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific
 static int BMK_benchInternal(size_t keySize, int specificTest)
 {
     void* const buffer = calloc(keySize+16+3, 1);
-    void* const alignedBuffer = ((char*)buffer+15) - (((size_t)((char*)buffer+15)) & 0xF);  /* align on next 16 bytes */
     if(!buffer) {
         DISPLAY("\nError: not enough memory!\n");
         return 12;
     }
 
-    /* bench */
-    DISPLAYLEVEL(1, "Sample of ");
-    if (keySize > 10 KB) {
-        DISPLAYLEVEL(1, "%u KB", (U32)(keySize >> 10));
-    } else {
-        DISPLAYLEVEL(1, "%u bytes", (U32)keySize);
-    }
-    DISPLAYLEVEL(1, "...        \n");
+    {   void* const alignedBuffer = ((char*)buffer+15) - (((size_t)((char*)buffer+15)) & 0xF);  /* align on next 16 bytes */
 
-    {   int const result = BMK_benchMem(alignedBuffer, keySize, specificTest);
-        free(buffer);
-        return result;
+        /* bench */
+        DISPLAYLEVEL(1, "Sample of ");
+        if (keySize > 10 KB) {
+            DISPLAYLEVEL(1, "%u KB", (U32)(keySize >> 10));
+        } else {
+            DISPLAYLEVEL(1, "%u bytes", (U32)keySize);
+        }
+        DISPLAYLEVEL(1, "...        \n");
+
+        {   int const result = BMK_benchMem(alignedBuffer, keySize, specificTest);
+            free(buffer);
+            return result;
+        }
     }
 }
 
@@ -813,41 +818,43 @@ static CanonicalFromStringResult canonicalFromString(unsigned char* dst,
 static ParseLineResult parseLine(ParsedLine* parsedLine, const char* line)
 {
     const char* const firstSpace = strchr(line, ' ');
-    const char* const secondSpace = firstSpace + 1;
+    if (firstSpace == NULL) return ParseLine_invalidFormat;
 
-    parsedLine->filename = NULL;
-    parsedLine->xxhBits = 0;
+    {   const char* const secondSpace = firstSpace + 1;
+        if (*secondSpace != ' ') return ParseLine_invalidFormat;
 
-    if (firstSpace == NULL || *secondSpace != ' ') return ParseLine_invalidFormat;
+        parsedLine->filename = NULL;
+        parsedLine->xxhBits = 0;
 
-    switch (firstSpace - line)
-    {
-    case 8:
-        {   XXH32_canonical_t* xxh32c = &parsedLine->canonical.xxh32;
-            if (canonicalFromString(xxh32c->digest, sizeof(xxh32c->digest), line)
-                != CanonicalFromString_ok) {
-                return ParseLine_invalidFormat;
+        switch (firstSpace - line)
+        {
+        case 8:
+            {   XXH32_canonical_t* xxh32c = &parsedLine->canonical.xxh32;
+                if (canonicalFromString(xxh32c->digest, sizeof(xxh32c->digest), line)
+                    != CanonicalFromString_ok) {
+                    return ParseLine_invalidFormat;
+                }
+                parsedLine->xxhBits = 32;
+                break;
             }
-            parsedLine->xxhBits = 32;
-            break;
-        }
 
-    case 16:
-        {   XXH64_canonical_t* xxh64c = &parsedLine->canonical.xxh64;
-            if (canonicalFromString(xxh64c->digest, sizeof(xxh64c->digest), line)
-                != CanonicalFromString_ok) {
-                return ParseLine_invalidFormat;
+        case 16:
+            {   XXH64_canonical_t* xxh64c = &parsedLine->canonical.xxh64;
+                if (canonicalFromString(xxh64c->digest, sizeof(xxh64c->digest), line)
+                    != CanonicalFromString_ok) {
+                    return ParseLine_invalidFormat;
+                }
+                parsedLine->xxhBits = 64;
+                break;
             }
-            parsedLine->xxhBits = 64;
-            break;
+
+        default:
+                return ParseLine_invalidFormat;
+                break;
         }
 
-    default:
-            return ParseLine_invalidFormat;
-            break;
+        parsedLine->filename = secondSpace + 1;
     }
-
-    parsedLine->filename = secondSpace + 1;
     return ParseLine_ok;
 }
 

From 3eb9d18ddb6250b67c77db58e82a4e283e126129 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 17 Sep 2018 12:39:18 -0700
Subject: [PATCH 07/73] explicitly states when not checking a return value

although cppcheck seems to overdo this warning,
as it also warns for functions with `void` return type (??)
---
 xxhsum.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/xxhsum.c b/xxhsum.c
index 7790458f..38aaceb4 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -408,14 +408,14 @@ static void BMK_testSequence64(void* sentence, size_t len, U64 seed, U64 Nresult
     Dresult = XXH64(sentence, len, seed);
     BMK_checkResult64(Dresult, Nresult);
 
-    XXH64_reset(&state, seed);
-    XXH64_update(&state, sentence, len);
+    (void)XXH64_reset(&state, seed);
+    (void)XXH64_update(&state, sentence, len);
     Dresult = XXH64_digest(&state);
     BMK_checkResult64(Dresult, Nresult);
 
-    XXH64_reset(&state, seed);
+    (void)XXH64_reset(&state, seed);
     for (pos=0; pos<len; pos++)
-        XXH64_update(&state, ((char*)sentence)+pos, 1);
+        (void)XXH64_update(&state, ((char*)sentence)+pos, 1);
     Dresult = XXH64_digest(&state);
     BMK_checkResult64(Dresult, Nresult);
 }
@@ -430,14 +430,14 @@ static void BMK_testSequence(const void* sequence, size_t len, U32 seed, U32 Nre
     Dresult = XXH32(sequence, len, seed);
     BMK_checkResult(Dresult, Nresult);
 
-    XXH32_reset(&state, seed);
-    XXH32_update(&state, sequence, len);
+    (void)XXH32_reset(&state, seed);
+    (void)XXH32_update(&state, sequence, len);
     Dresult = XXH32_digest(&state);
     BMK_checkResult(Dresult, Nresult);
 
-    XXH32_reset(&state, seed);
+    (void)XXH32_reset(&state, seed);
     for (pos=0; pos<len; pos++)
-        XXH32_update(&state, ((const char*)sequence)+pos, 1);
+        (void)XXH32_update(&state, ((const char*)sequence)+pos, 1);
     Dresult = XXH32_digest(&state);
     BMK_checkResult(Dresult, Nresult);
 }
@@ -506,8 +506,8 @@ static void BMK_hashStream(void* xxhHashValue, const algoType hashType, FILE* in
     size_t readSize;
 
     /* Init */
-    XXH32_reset(&state32, XXHSUM32_DEFAULT_SEED);
-    XXH64_reset(&state64, XXHSUM64_DEFAULT_SEED);
+    (void)XXH32_reset(&state32, XXHSUM32_DEFAULT_SEED);
+    (void)XXH64_reset(&state64, XXHSUM64_DEFAULT_SEED);
 
     /* Load file & update hash */
     readSize = 1;
@@ -516,10 +516,10 @@ static void BMK_hashStream(void* xxhHashValue, const algoType hashType, FILE* in
         switch(hashType)
         {
         case algo_xxh32:
-            XXH32_update(&state32, buffer, readSize);
+            (void)XXH32_update(&state32, buffer, readSize);
             break;
         case algo_xxh64:
-            XXH64_update(&state64, buffer, readSize);
+            (void)XXH64_update(&state64, buffer, readSize);
             break;
         default:
             break;
@@ -610,7 +610,7 @@ static int BMK_hash(const char* fileName,
     {
     case algo_xxh32:
         {   XXH32_canonical_t hcbe32;
-            XXH32_canonicalFromHash(&hcbe32, h32);
+            (void)XXH32_canonicalFromHash(&hcbe32, h32);
             displayEndianess==big_endian ?
                 BMK_display_BigEndian(&hcbe32, sizeof(hcbe32)) : BMK_display_LittleEndian(&hcbe32, sizeof(hcbe32));
             DISPLAYRESULT("  %s\n", fileName);
@@ -618,7 +618,7 @@ static int BMK_hash(const char* fileName,
         }
     case algo_xxh64:
         {   XXH64_canonical_t hcbe64;
-            XXH64_canonicalFromHash(&hcbe64, h64);
+            (void)XXH64_canonicalFromHash(&hcbe64, h64);
             displayEndianess==big_endian ?
                 BMK_display_BigEndian(&hcbe64, sizeof(hcbe64)) : BMK_display_LittleEndian(&hcbe64, sizeof(hcbe64));
             DISPLAYRESULT("  %s\n", fileName);

From b39d9be3482a7eec50c1f142eb22f7d25112a93c Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 17 Sep 2018 12:51:12 -0700
Subject: [PATCH 08/73] minor unused last pointer change style warning

---
 xxhash.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/xxhash.c b/xxhash.c
index 05be9b29..ff28749e 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -293,9 +293,9 @@ XXH32_finalize(U32 h32, const void* ptr, size_t len,
 
 {
     const BYTE* p = (const BYTE*)ptr;
-#define PROCESS1             \
-    h32 += (*p) * PRIME32_5; \
-    p++;                     \
+
+#define PROCESS1               \
+    h32 += (*p++) * PRIME32_5; \
     h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
 
 #define PROCESS4                         \
@@ -704,9 +704,8 @@ XXH64_finalize(U64 h64, const void* ptr, size_t len,
 {
     const BYTE* p = (const BYTE*)ptr;
 
-#define PROCESS1_64          \
-    h64 ^= (*p) * PRIME64_5; \
-    p++;                     \
+#define PROCESS1_64            \
+    h64 ^= (*p++) * PRIME64_5; \
     h64 = XXH_rotl64(h64, 11) * PRIME64_1;
 
 #define PROCESS4_64          \

From 79b52d94ba00f286b8ef2ae8ade1e840dc33d31c Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 17 Sep 2018 13:47:54 -0700
Subject: [PATCH 09/73] added cppcheck test

to Makefile and transitively to .travis.yml
---
 .travis.yml | 1 +
 Makefile    | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 895da855..3c37a826 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,3 +7,4 @@ before_install:
   - sudo apt-get install -qq clang
   - sudo apt-get install -qq g++-multilib
   - sudo apt-get install -qq gcc-multilib
+  - sudo apt-get install -qq cppcheck
diff --git a/Makefile b/Makefile
index 857906cf..32657417 100644
--- a/Makefile
+++ b/Makefile
@@ -190,10 +190,16 @@ usan: clean
 	@echo ---- check undefined behavior - sanitize ----
 	$(MAKE) clean test CC=$(CC) MOREFLAGS="-g -fsanitize=undefined -fno-sanitize-recover=all"
 
+.PHONY: staticAnalyze
 staticAnalyze: clean
 	@echo ---- static analyzer - scan-build ----
 	CFLAGS="-g -Werror" scan-build --status-bugs -v $(MAKE) all
 
+.PHONY: cppcheck
+cppcheck:
+	@echo ---- static analyzer - cppcheck ----
+	cppcheck . --force --enable=warning,portability,performance,style --error-exitcode=1 > /dev/null
+
 namespaceTest:
 	$(CC) -c xxhash.c
 	$(CC) -DXXH_NAMESPACE=TEST_ -c xxhash.c -o xxhash2.o
@@ -213,7 +219,7 @@ preview-man: clean-man man
 
 test: all namespaceTest check test-xxhsum-c c90test
 
-test-all: test test32 armtest clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze
+test-all: test test32 armtest clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze cppcheck
 
 .PHONY: listL120
 listL120:  # extract lines >= 120 characters in *.{c,h}, by Takayuki Matsuoka (note : $$, for Makefile compatibility)

From 542430e0ec256b681640719580ef0e02b3dc93ba Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Sat, 29 Sep 2018 23:09:23 -0700
Subject: [PATCH 10/73] fixed compilation issues under msys2/mingw64

---
 .gitignore |  1 +
 Makefile   |  3 ++-
 xxhsum.c   | 62 ++++++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 36639c6e..d1c970d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ libxxhash.*
 xxh32sum
 xxh64sum
 xxhsum
+xxhsum.exe
 xxhsum32
 xxhsum_privateXXH
 xxhsum_inlinedXXH
diff --git a/Makefile b/Makefile
index 32657417..83db5c3c 100644
--- a/Makefile
+++ b/Makefile
@@ -94,8 +94,9 @@ xxhsum_and_links: xxhsum xxh32sum xxh64sum
 xxh32sum xxh64sum: xxhsum
 	ln -sf $^ $@
 
+xxhsum_inlinedXXH: CPPFLAGS += -DXXH_INLINE_ALL
 xxhsum_inlinedXXH: xxhsum.c
-	$(CC) $(FLAGS) -DXXH_PRIVATE_API $^ -o $@$(EXT)
+	$(CC) $(FLAGS) $^ -o $@$(EXT)
 
 
 # library
diff --git a/xxhsum.c b/xxhsum.c
index 38aaceb4..7336e1f6 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -63,15 +63,65 @@
 /* ************************************
  *  OS-Specific Includes
  **************************************/
-#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
-#  include <fcntl.h>    /* _O_BINARY */
-#  include <io.h>       /* _setmode, _isatty */
-#  define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY)
+#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__)) /* UNIX-like OS */ \
+   || defined(__midipix__) || defined(__VMS))
+#  if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1-2001 (SUSv3) conformant */ \
+     || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)  /* BSD distros */
+#    define PLATFORM_POSIX_VERSION 200112L
+#  else
+#    if defined(__linux__) || defined(__linux)
+#      ifndef _POSIX_C_SOURCE
+#        define _POSIX_C_SOURCE 200112L  /* use feature test macro */
+#      endif
+#    endif
+#    include <unistd.h>  /* declares _POSIX_VERSION */
+#    if defined(_POSIX_VERSION)  /* POSIX compliant */
+#      define PLATFORM_POSIX_VERSION _POSIX_VERSION
+#    else
+#      define PLATFORM_POSIX_VERSION 0
+#    endif
+#  endif
+#endif
+#if !defined(PLATFORM_POSIX_VERSION)
+#  define PLATFORM_POSIX_VERSION -1
+#endif
+
+#if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 1)) \
+ || (PLATFORM_POSIX_VERSION >= 200112L) \
+ || defined(__DJGPP__) \
+ || defined(__MSYS__)
+#  include <unistd.h>   /* isatty */
+#  define IS_CONSOLE(stdStream) isatty(fileno(stdStream))
+#elif defined(MSDOS) || defined(OS2) || defined(__CYGWIN__)
+#  include <io.h>       /* _isatty */
 #  define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
+#elif defined(WIN32) || defined(_WIN32)
+#  include <io.h>      /* _isatty */
+#  include <windows.h> /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */
+#  include <stdio.h>   /* FILE */
+static __inline int IS_CONSOLE(FILE* stdStream) {
+    DWORD dummy;
+    return _isatty(_fileno(stdStream)) && GetConsoleMode((HANDLE)_get_osfhandle(_fileno(stdStream)), &dummy);
+}
+#else
+#  define IS_CONSOLE(stdStream) 0
+#endif
+
+#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32)
+#  include <fcntl.h>   /* _O_BINARY */
+#  include <io.h>      /* _setmode, _fileno, _get_osfhandle */
+#  if !defined(__DJGPP__)
+#    include <windows.h> /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */
+#    include <winioctl.h> /* FSCTL_SET_SPARSE */
+#    define SET_BINARY_MODE(file) { int const unused=_setmode(_fileno(file), _O_BINARY); (void)unused; }
+#    define SET_SPARSE_FILE_MODE(file) { DWORD dw; DeviceIoControl((HANDLE) _get_osfhandle(_fileno(file)), FSCTL_SET_SPARSE, 0, 0, 0, 0, &dw, 0); }
+#  else
+#    define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
+#    define SET_SPARSE_FILE_MODE(file)
+#  endif
 #else
-#  include <unistd.h>   /* isatty, STDIN_FILENO */
 #  define SET_BINARY_MODE(file)
-#  define IS_CONSOLE(stdStream) isatty(STDIN_FILENO)
+#  define SET_SPARSE_FILE_MODE(file)
 #endif
 
 #if !defined(S_ISREG)

From 7a407f64f987731c5ec3472caa9754d55ca5be3c Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Sat, 29 Sep 2018 23:13:01 -0700
Subject: [PATCH 11/73] show script lines during lib compilation

---
 .gitignore | 1 +
 Makefile   | 9 +++------
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/.gitignore b/.gitignore
index d1c970d7..9a49cfd1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ xxhsum.exe
 xxhsum32
 xxhsum_privateXXH
 xxhsum_inlinedXXH
+xxhsum_inlinedXXH.exe
 
 # Mac OS-X artefacts
 *.dSYM
diff --git a/Makefile b/Makefile
index 83db5c3c..55eb82f3 100644
--- a/Makefile
+++ b/Makefile
@@ -103,19 +103,16 @@ xxhsum_inlinedXXH: xxhsum.c
 
 libxxhash.a: ARFLAGS = rcs
 libxxhash.a: xxhash.o
-	@echo compiling static library
-	@$(AR) $(ARFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 
 $(LIBXXH): LDFLAGS += -shared
 ifeq (,$(filter Windows%,$(OS)))
 $(LIBXXH): CFLAGS += -fPIC
 endif
 $(LIBXXH): xxhash.c
-	@echo compiling dynamic library $(LIBVER)
 	$(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@
-	@echo creating versioned links
-	@ln -sf $@ libxxhash.$(SHARED_EXT_MAJOR)
-	@ln -sf $@ libxxhash.$(SHARED_EXT)
+	ln -sf $@ libxxhash.$(SHARED_EXT_MAJOR)
+	ln -sf $@ libxxhash.$(SHARED_EXT)
 
 libxxhash : $(LIBXXH)
 

From 2ec7fddf1c1c7d19212726f3d3dfaaaa6f12d4a8 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Sat, 29 Sep 2018 23:54:24 -0700
Subject: [PATCH 12/73] minor optimization : shared xxhash.o compilation

---
 Makefile | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 55eb82f3..e88013b6 100644
--- a/Makefile
+++ b/Makefile
@@ -84,8 +84,10 @@ default: lib xxhsum_and_links
 .PHONY: all
 all: lib xxhsum xxhsum_inlinedXXH
 
+xxhsum : xxhash.o xxhsum.o
+
 xxhsum32: CFLAGS += -m32
-xxhsum xxhsum32: xxhash.c xxhsum.c
+xxhsum32: xxhash.c xxhsum.c
 	$(CC) $(FLAGS) $^ $(LDFLAGS) -o $@$(EXT)
 
 .PHONY: xxhsum_and_links
@@ -116,6 +118,7 @@ $(LIBXXH): xxhash.c
 
 libxxhash : $(LIBXXH)
 
+.PHONY: lib
 lib: libxxhash.a libxxhash
 
 
@@ -175,12 +178,16 @@ clangtest: clean
 	CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion" $(MAKE) all
 
 cxxtest: clean
-	@echo ---- test g++ compilation ----
+	@echo ---- test C++ compilation ----
 	CC="$(CXX) -Wno-deprecated" $(MAKE) all CFLAGS="-O3 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Werror -fPIC"
 
-c90test: clean
+.PHONY: c90test
+c90test: CPPFLAGS += -DXXH_NO_LONG_LONG
+c90test: CFLAGS += -std=c90 -Werror -pedantic
+c90test: xxhash.c
 	@echo ---- test strict C90 compilation [xxh32 only] ----
-	$(CC) -std=c90 -Werror -pedantic -DXXH_NO_LONG_LONG -c xxhash.c
+	$(RM) xxhash.o
+	$(CC) $(FLAGS) $^ $(LDFLAGS) -c
 	$(RM) xxhash.o
 
 usan: CC=clang
@@ -198,6 +205,7 @@ cppcheck:
 	@echo ---- static analyzer - cppcheck ----
 	cppcheck . --force --enable=warning,portability,performance,style --error-exitcode=1 > /dev/null
 
+.PHONY: namespaceTest
 namespaceTest:
 	$(CC) -c xxhash.c
 	$(CC) -DXXH_NAMESPACE=TEST_ -c xxhash.c -o xxhash2.o
@@ -207,6 +215,7 @@ namespaceTest:
 xxhsum.1: xxhsum.1.md
 	cat $^ | $(MD2ROFF) $(MD2ROFF_FLAGS) | sed -n '/^\.\\\".*/!p' > $@
 
+.PHONY: man
 man: xxhsum.1
 
 clean-man:

From eec5700f4d62113b47ee548edbc4746f61ffb098 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Thu, 11 Oct 2018 17:07:38 -0700
Subject: [PATCH 13/73] added some notes of constant selection

as suggested in #151.
---
 doc/xxhash_spec.md | 29 +++++++++++++++++------------
 xxhash.c           | 20 ++++++++++----------
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/doc/xxhash_spec.md b/doc/xxhash_spec.md
index e673334b..7e634e03 100644
--- a/doc/xxhash_spec.md
+++ b/doc/xxhash_spec.md
@@ -16,7 +16,7 @@ Distribution of this document is unlimited.
 
 ### Version
 
-0.1.0 (15/01/18)
+0.1.1 (10/10/18)
 
 
 Table of Contents
@@ -63,13 +63,15 @@ The algorithm collect and transform input in _stripes_ of 16 bytes. The transfor
 
 The algorithm uses 32-bits addition, multiplication, rotate, shift and xor operations. Many operations require some 32-bits prime number constants, all defined below :
 
-    static const u32 PRIME32_1 = 2654435761U;
-    static const u32 PRIME32_2 = 2246822519U;
-    static const u32 PRIME32_3 = 3266489917U;
-    static const u32 PRIME32_4 =  668265263U;
-    static const u32 PRIME32_5 =  374761393U;
+    static const u32 PRIME32_1 = 2654435761U;  // 0b10011110001101110111100110110001
+    static const u32 PRIME32_2 = 2246822519U;  // 0b10000101111010111100101001110111
+    static const u32 PRIME32_3 = 3266489917U;  // 0b11000010101100101010111000111101
+    static const u32 PRIME32_4 =  668265263U;  // 0b00100111110101001110101100101111
+    static const u32 PRIME32_5 =  374761393U;  // 0b00010110010101100110011110110001
 
-### Step 1. Initialise internal accumulators
+These constants are prime numbers, and feature a good mix of bits 1 and 0, neither too regular, nor too dissymmetric. These properties help dispersion capabilities.
+
+### Step 1. Initialize internal accumulators
 
 Each accumulator gets an initial value based on optional `seed` input. Since the `seed` is optional, it can be `0`.
 
@@ -170,11 +172,13 @@ The algorithm collects and transforms input in _stripes_ of 32 bytes. The transf
 
 The algorithm uses 64-bit addition, multiplication, rotate, shift and xor operations. Many operations require some 64-bit prime number constants, all defined below :
 
-    static const u64 PRIME64_1 = 11400714785074694791ULL;
-    static const u64 PRIME64_2 = 14029467366897019727ULL;
-    static const u64 PRIME64_3 =  1609587929392839161ULL;
-    static const u64 PRIME64_4 =  9650029242287828579ULL;
-    static const u64 PRIME64_5 =  2870177450012600261ULL;
+    static const u64 PRIME64_1 = 11400714785074694791ULL;  // 0b1001111000110111011110011011000110000101111010111100101010000111
+    static const u64 PRIME64_2 = 14029467366897019727ULL;  // 0b1100001010110010101011100011110100100111110101001110101101001111
+    static const u64 PRIME64_3 =  1609587929392839161ULL;  // 0b0001011001010110011001111011000110011110001101110111100111111001
+    static const u64 PRIME64_4 =  9650029242287828579ULL;  // 0b1000010111101011110010100111011111000010101100101010111001100011
+    static const u64 PRIME64_5 =  2870177450012600261ULL;  // 0b0010011111010100111010110010111100010110010101100110011111000101
+
+These constants are prime numbers, and feature a good mix of bits 1 and 0, neither too regular, nor too dissymmetric. These properties help dispersion capabilities.
 
 ### Step 1. Initialise internal accumulators
 
@@ -308,4 +312,5 @@ It links to the [github project page](https://github.com/Cyan4973/xxHash) where
 
 Version changes
 --------------------
+v0.1.1 : added a note on rationale for selection of constants
 v0.1.0 : initial release
diff --git a/xxhash.c b/xxhash.c
index ff28749e..13669b2a 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -260,11 +260,11 @@ XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
 /* *******************************************************************
 *  32-bit hash functions
 *********************************************************************/
-static const U32 PRIME32_1 = 2654435761U;
-static const U32 PRIME32_2 = 2246822519U;
-static const U32 PRIME32_3 = 3266489917U;
-static const U32 PRIME32_4 =  668265263U;
-static const U32 PRIME32_5 =  374761393U;
+static const U32 PRIME32_1 = 2654435761U;   /* 0b10011110001101110111100110110001 */
+static const U32 PRIME32_2 = 2246822519U;   /* 0b10000101111010111100101001110111 */
+static const U32 PRIME32_3 = 3266489917U;   /* 0b11000010101100101010111000111101 */
+static const U32 PRIME32_4 =  668265263U;   /* 0b00100111110101001110101100101111 */
+static const U32 PRIME32_5 =  374761393U;   /* 0b00010110010101100110011110110001 */
 
 static U32 XXH32_round(U32 seed, U32 input)
 {
@@ -663,11 +663,11 @@ static U64 XXH_readBE64(const void* ptr)
 
 /*======   xxh64   ======*/
 
-static const U64 PRIME64_1 = 11400714785074694791ULL;
-static const U64 PRIME64_2 = 14029467366897019727ULL;
-static const U64 PRIME64_3 =  1609587929392839161ULL;
-static const U64 PRIME64_4 =  9650029242287828579ULL;
-static const U64 PRIME64_5 =  2870177450012600261ULL;
+static const U64 PRIME64_1 = 11400714785074694791ULL;   /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
+static const U64 PRIME64_2 = 14029467366897019727ULL;   /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
+static const U64 PRIME64_3 =  1609587929392839161ULL;   /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
+static const U64 PRIME64_4 =  9650029242287828579ULL;   /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
+static const U64 PRIME64_5 =  2870177450012600261ULL;   /* 0b0010011111010100111010110010111100010110010101100110011111000101 */
 
 static U64 XXH64_round(U64 acc, U64 input)
 {

From 0f2dd4a1cb103e3fc8c55c855b821eb24c6d82c3 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 16 Oct 2018 21:47:53 -0700
Subject: [PATCH 14/73] fixed minor cast warning

fix #139
---
 xxhsum.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xxhsum.c b/xxhsum.c
index 7336e1f6..d9f5be2d 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -267,7 +267,7 @@ static U32 localXXH64(const void* buffer, size_t bufferSize, U32 seed) { return
 
 static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, size_t bufferSize)
 {
-    U32 nbh_perIteration = ((300 MB) / (bufferSize+1)) + 1;  /* first loop conservatively aims for 300 MB/s */
+    U32 nbh_perIteration = (U32)((300 MB) / (bufferSize+1)) + 1;  /* first loop conservatively aims for 300 MB/s */
     U32 iterationNb;
     double fastestH = 100000000.;
 

From c99e0c1c700f6cee34a8cd31fc4222c1420cd0df Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 4 Feb 2019 13:57:55 -0800
Subject: [PATCH 15/73] ensure rotl macro arguments are in parenthesis

to support non-singleton arguments
---
 xxhash.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xxhash.c b/xxhash.c
index 13669b2a..2cb986f1 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -191,8 +191,8 @@ static U32 XXH_read32(const void* memPtr)
 #  define XXH_rotl32(x,r) _rotl(x,r)
 #  define XXH_rotl64(x,r) _rotl64(x,r)
 #else
-#  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
-#  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
 #endif
 
 #if defined(_MSC_VER)     /* Visual Studio */

From d6f83c47f9160c849ea7a8bd958cd9e6d6ed0261 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 4 Feb 2019 14:05:34 -0800
Subject: [PATCH 16/73] renamed FORCE_INLINE into XXH_FORCE_INLINE

to reduce risks of symbol duplication when XXH_INLINE_ALL is used
---
 xxhash.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/xxhash.c b/xxhash.c
index 2cb986f1..ce38f71b 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -122,16 +122,16 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcp
 ***************************************/
 #ifdef _MSC_VER    /* Visual Studio */
 #  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
-#  define FORCE_INLINE static __forceinline
+#  define XXH_FORCE_INLINE static __forceinline
 #else
 #  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
 #    ifdef __GNUC__
-#      define FORCE_INLINE static inline __attribute__((always_inline))
+#      define XXH_FORCE_INLINE static inline __attribute__((always_inline))
 #    else
-#      define FORCE_INLINE static inline
+#      define XXH_FORCE_INLINE static inline
 #    endif
 #  else
-#    define FORCE_INLINE static
+#    define XXH_FORCE_INLINE static
 #  endif /* __STDC_VERSION__ */
 #endif
 
@@ -231,7 +231,8 @@ static int XXH_isLittleEndian(void)
 *****************************/
 typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
 
-FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+XXH_FORCE_INLINE U32
+XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
 {
     if (align==XXH_unaligned)
         return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
@@ -239,7 +240,7 @@ FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_a
         return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
 }
 
-FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+XXH_FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
 {
     return XXH_readLE32_align(ptr, endian, XXH_unaligned);
 }
@@ -348,7 +349,7 @@ XXH32_finalize(U32 h32, const void* ptr, size_t len,
 }
 
 
-FORCE_INLINE U32
+XXH_FORCE_INLINE U32
 XXH32_endian_align(const void* input, size_t len, U32 seed,
                     XXH_endianess endian, XXH_alignment align)
 {
@@ -448,7 +449,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int s
 }
 
 
-FORCE_INLINE XXH_errorcode
+XXH_FORCE_INLINE XXH_errorcode
 XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
 {
     if (input==NULL)
@@ -523,7 +524,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void*
 }
 
 
-FORCE_INLINE U32
+XXH_FORCE_INLINE U32
 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
 {
     U32 h32;
@@ -642,7 +643,8 @@ static U64 XXH_swap64 (U64 x)
 }
 #endif
 
-FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+XXH_FORCE_INLINE U64
+XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
 {
     if (align==XXH_unaligned)
         return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
@@ -650,7 +652,7 @@ FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_a
         return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
 }
 
-FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+XXH_FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
 {
     return XXH_readLE64_align(ptr, endian, XXH_unaligned);
 }
@@ -807,7 +809,7 @@ XXH64_finalize(U64 h64, const void* ptr, size_t len,
     return 0;  /* unreachable, but some compilers complain without it */
 }
 
-FORCE_INLINE U64
+XXH_FORCE_INLINE U64
 XXH64_endian_align(const void* input, size_t len, U64 seed,
                 XXH_endianess endian, XXH_alignment align)
 {
@@ -908,7 +910,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long
     return XXH_OK;
 }
 
-FORCE_INLINE XXH_errorcode
+XXH_FORCE_INLINE XXH_errorcode
 XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
 {
     if (input==NULL)
@@ -978,7 +980,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void*
         return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
 }
 
-FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
+XXH_FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
 {
     U64 h64;
 

From 1b0f7b371d63c9b1997fdfb95b926d614ba3d920 Mon Sep 17 00:00:00 2001
From: LambdAurora <aurora42lambda@gmail.com>
Date: Tue, 12 Feb 2019 22:15:58 +0100
Subject: [PATCH 17/73] Added export of public symbols on Windows.

---
 cmake_unofficial/CMakeLists.txt |  1 +
 xxhash.h                        | 10 +++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/cmake_unofficial/CMakeLists.txt b/cmake_unofficial/CMakeLists.txt
index 1ca7a06d..bfdd5481 100644
--- a/cmake_unofficial/CMakeLists.txt
+++ b/cmake_unofficial/CMakeLists.txt
@@ -57,6 +57,7 @@ include_directories("${XXHASH_DIR}")
 
 # libxxhash
 add_library(xxhash "${XXHASH_DIR}/xxhash.c")
+target_compile_definitions(xxhash PUBLIC XXH_EXPORT)
 set_target_properties(xxhash PROPERTIES
   SOVERSION "${XXHASH_VERSION_STRING}"
   VERSION "${XXHASH_VERSION_STRING}")
diff --git a/xxhash.h b/xxhash.h
index d6bad943..486a6d81 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -107,7 +107,15 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 #    define XXH_PUBLIC_API static
 #  endif
 #else
-#  define XXH_PUBLIC_API   /* do nothing */
+#  ifdef WIN32
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    else
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
 #endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
 
 /*! XXH_NAMESPACE, aka Namespace Emulation :

From 3c2844854655207191932d1095f40d53c2d039ac Mon Sep 17 00:00:00 2001
From: LambdAurora <aurora42lambda@gmail.com>
Date: Tue, 12 Feb 2019 22:29:57 +0100
Subject: [PATCH 18/73] Added missing condition to export symbols on Windows
 with CMake.

---
 cmake_unofficial/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmake_unofficial/CMakeLists.txt b/cmake_unofficial/CMakeLists.txt
index bfdd5481..f66e72ef 100644
--- a/cmake_unofficial/CMakeLists.txt
+++ b/cmake_unofficial/CMakeLists.txt
@@ -57,7 +57,9 @@ include_directories("${XXHASH_DIR}")
 
 # libxxhash
 add_library(xxhash "${XXHASH_DIR}/xxhash.c")
-target_compile_definitions(xxhash PUBLIC XXH_EXPORT)
+if (BUILD_SHARED_LIBS)
+  target_compile_definitions(xxhash PUBLIC XXH_EXPORT)
+endif ()
 set_target_properties(xxhash PROPERTIES
   SOVERSION "${XXHASH_VERSION_STRING}"
   VERSION "${XXHASH_VERSION_STRING}")

From c56b856e5850fe984fedf1d98645f294748a0192 Mon Sep 17 00:00:00 2001
From: LambdAurora <aurora42lambda@gmail.com>
Date: Tue, 12 Feb 2019 22:59:45 +0100
Subject: [PATCH 19/73] Fixed undefined reference when building with MinGW.

---
 xxhash.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xxhash.h b/xxhash.h
index 486a6d81..84942e8e 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -107,7 +107,7 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 #    define XXH_PUBLIC_API static
 #  endif
 #else
-#  ifdef WIN32
+#  if defined(WIN32) && !defined(__GNUC__)
 #    ifdef XXH_EXPORT
 #      define XXH_PUBLIC_API __declspec(dllexport)
 #    else

From 45f39e6d34c776956e37f33e66592ea1a8bb2524 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 26 Feb 2019 12:36:23 -0800
Subject: [PATCH 20/73] first implementation of XXH3_64b

currently can only be used for benchmarking (`-b`)
---
 xxh3.h   | 400 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 xxhash.c |  10 ++
 xxhsum.c |  84 ++++++++----
 3 files changed, 471 insertions(+), 23 deletions(-)
 create mode 100644 xxh3.h

diff --git a/xxh3.h b/xxh3.h
new file mode 100644
index 00000000..f425545f
--- /dev/null
+++ b/xxh3.h
@@ -0,0 +1,400 @@
+#ifndef XXH3_H
+#define XXH3_H
+
+
+#define XXH_INLINE_ALL
+#include "xxhash.h"
+
+#define NDEBUG
+#include <assert.h>
+
+//#include <stdio.h>
+#define TRACE(...)  //printf(__VA_ARGS__)
+
+
+// ==========================================
+// Vectorization detection
+// ==========================================
+
+// macro enums
+#define XXH_SCALAR 0
+#define XXH_SSE2   1
+#define XXH_AVX2   2
+
+#ifndef XXH_VECTOR    /* can be defined on command line */
+#  if defined(__AVX2__)
+#    define XXH_VECTOR XXH_AVX2
+#  elif defined(__SSE2__)
+#    define XXH_VECTOR XXH_SSE2
+#  else
+#    define XXH_VECTOR XXH_SCALAR
+#  endif
+#endif
+
+
+// ==========================================
+// Short keys
+// ==========================================
+
+static U64 XXH3_mixHigh(U64 val) {
+  return val ^ (val >> 47);
+}
+
+static U64 XXH3_finalMerge_2u64(U64 ll1, U64 ll2, U64 mul)
+{
+    U64 const llcomb1 = XXH3_mixHigh((ll1 ^ ll2) * mul);
+    U64 const llcomb2 = XXH3_mixHigh((ll2 ^ llcomb1) * mul);
+    return llcomb2 * mul;
+}
+
+static U64 XXH3_finalMerge_4u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, U64 mul)
+{
+    U64 const llcomb1 = XXH_rotl64(ll1 + ll2, 43) + XXH_rotl64(ll3, 30) + ll4;
+    U64 const llcomb2 = ll1 + XXH_rotl64(ll2 + PRIME64_3, 18) + ll3;
+
+    return XXH3_finalMerge_2u64(llcomb1, llcomb2, mul);
+}
+
+static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4,
+                                U64 ll5, U64 ll6, U64 ll7, U64 ll8,
+                                U64 mul)
+{
+    U64 const ll11 = XXH_rotl64(ll1 + ll7, 21) + (XXH_rotl64(ll2, 34) + ll3) * 9;
+    U64 const ll12 = ((ll1 + ll2) ^ ll4) + ll6 + 1;
+    U64 const ll13 = XXH_rotl64(ll5 + ll6, 22) + ll3;
+    U64 const ll14 = ll5 + XXH_rotl64(ll8, 11) + ll3;
+
+    U64 const ll21 = XXH_swap64((ll11 + ll12) * mul) + ll8;
+    U64 const ll31 = (XXH_swap64((ll12 + ll21) * mul) + ll7) * mul;
+    U64 const ll41 = XXH_swap64((ll13 + ll14) * mul + ll31) + ll2;
+    U64 const ll51 = XXH3_mixHigh((ll14 + ll41) * mul + ll4 + ll8) * mul;
+
+    return ll51 + ll13;
+}
+
+
+static inline U64 XXH3_len_1to3_64b(const void* data, size_t len)
+{
+    assert(data != NULL);
+    assert(len > 0 && len <= 3);
+    BYTE const c1 = ((const BYTE*)data)[0];
+    BYTE const c2 = ((const BYTE*)data)[len >> 1];
+    BYTE const c3 = ((const BYTE*)data)[len - 1];
+    U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
+    U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
+    U64  const ll3 = (l1 * PRIME64_2) ^ (l2 * PRIME64_1);
+    return XXH3_mixHigh(ll3) * PRIME64_3;
+}
+
+static inline U64 XXH3_len_4to8_64b(const void* data, size_t len)
+{
+    assert(data != NULL);
+    assert(len >= 4 && len <= 8);
+    U64 const mul = PRIME64_2 + (len * 2);  /* keep it odd */
+    U64 const ll1 = XXH_read32(data);
+    U64 const ll2 = XXH_read32((const BYTE*)data + len - 4) + PRIME64_1;
+    return XXH3_finalMerge_2u64((len-1) + (ll1 << 3), ll2, mul);
+}
+
+static inline U64 XXH3_len_9to16_64b(const void* data, size_t len)
+{
+    assert(data != NULL);
+    assert(len >= 9 && len <= 16);
+    U64 const ll1 = XXH_read64(data) + PRIME64_1;
+    U64 const ll2 = XXH_read64((const BYTE*)data + len - 8);
+    U64 const mul = PRIME64_2 + len * 2;  /* keep it odd */
+    U64 const llcomb3 = ll1 * mul + XXH_rotl64(ll2, 23);
+    U64 const llcomb4 = ll2 * mul + XXH_rotl64(ll1, 37);
+    return XXH3_finalMerge_2u64(llcomb3, llcomb4, mul);
+}
+
+static inline U64 XXH3_len_1to16_64b(const void* data, size_t len)
+{
+    assert(data != NULL);
+    assert(len > 0 && len <= 16);
+    if (len > 8) return XXH3_len_9to16_64b(data, len);
+    if (len >= 4) return XXH3_len_4to8_64b(data, len);
+    return XXH3_len_1to3_64b(data, len);
+}
+
+
+static U64 XXH3_len_17to32_64b(const void* data, size_t len)
+{
+    assert(data != NULL);
+    assert(len > 16 && len <= 32);
+    const BYTE* const p = (const BYTE*)data;
+
+    U64 const mul = PRIME64_3 + len * 2;  /* keep it odd */
+    U64 const ll1 = XXH_read64(p) * PRIME64_1;
+    U64 const ll2 = XXH_read64(p + 8);
+    U64 const ll3 = XXH_read64(p + len - 8) * mul;
+    U64 const ll4 = XXH_read64(p + len - 16) * PRIME64_2;
+
+    return XXH3_finalMerge_4u64(ll1, ll2, ll3, ll4, mul);
+}
+
+
+static U64 XXH3_len_33to64_64b(const void* data, size_t len)
+{
+    assert(data != NULL);
+    assert(len > 32 && len <= 64);   /* 33 is a valid length : lower bound is an exclusive 32 */
+    const BYTE* const p = (const BYTE*)data;
+
+    U64 const mul = PRIME64_2 + len * 2;   /* keep it odd */
+
+    U64 const ll1 = XXH_read64(p);             /* first 32 bytes, read from the start */
+    U64 const ll2 = XXH_read64(p + 8);
+    U64 const ll3 = XXH_read64(p + 16);
+    U64 const ll4 = XXH_read64(p + 24);
+    U64 const ll5 = XXH_read64(p + len - 32);  /* last 32 bytes, read from the end : overlaps the head when len < 64 */
+    U64 const ll6 = XXH_read64(p + len - 24);
+    U64 const ll7 = XXH_read64(p + len - 16);
+    U64 const ll8 = XXH_read64(p + len - 8);
+
+    return XXH3_finalMerge_8u64(ll1, ll2, ll3, ll4, ll5, ll6, ll7, ll8, mul);
+}
+
+
+static U64 XXH3_len_65to96_64b(const void* data, size_t len)
+{
+    assert(data != NULL);
+    assert(len > 64 && len <= 96);
+    const BYTE* const p = (const BYTE*)data;
+
+    U64 const ll1 = XXH3_len_33to64_64b(data, 64);
+    U64 const ll2 = XXH3_len_17to32_64b(p + len - 32, 32);
+    return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len);
+}
+
+static U64 XXH3_len_97to128_64b(const void* data, size_t len)
+{
+    assert(data != NULL);
+    assert(len > 96 && len <= 128);
+    const BYTE* const p = (const BYTE*)data;
+
+    U64 const ll1 = XXH3_len_33to64_64b(data, 64);
+    U64 const ll2 = XXH3_len_33to64_64b(p + 64, len - 64);
+    return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len);
+}
+
+
+// ==========================================
+// Long keys
+// ==========================================
+
+#if __GNUC__
+#include <x86intrin.h>
+#define ALIGN(n)      __attribute__ ((aligned(n)))
+#elif _MSC_VER
+#include <intrin.h>
+#define ALIGN(n)      __declspec(align(n))
+#else
+#define ALIGN(n)
+#endif
+
+#define STRIPE_LEN 64
+#define STRIPE_ELTS (STRIPE_LEN / sizeof(U32))
+#define KEYSET_DEFAULT_SIZE 48  // minimum 32
+
+
+ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = {
+    0xb8fe6c39,0x23a44bbe,0x7c01812c,0xf721ad1c,
+    0xded46de9,0x839097db,0x7240a4a4,0xb7b3671f,
+    0xcb79e64e,0xccc0e578,0x825ad07d,0xccff7221,
+    0xb8084674,0xf743248e,0xe03590e6,0x813a264c,
+    0x3c2852bb,0x91c300cb,0x88d0658b,0x1b532ea3,
+    0x71644897,0xa20df94e,0x3819ef46,0xa9deacd8,
+    0xa8fa763f,0xe39c343f,0xf9dcbbc7,0xc70b4f1d,
+    0x8a51e04b,0xcdb45931,0xc89f7ec9,0xd9787364,
+
+    0xeac5ac83,0x34d3ebc3,0xc581a0ff,0xfa1363eb,
+    0x170ddd51,0xb7f0da49,0xd3165526,0x29d4689e,
+    0x2b16be58,0x7d47a1fc,0x8ff8b8d1,0x7ad031ce,
+    0x45cb3a8f,0x95160428,0xafd7fbca,0xbb4b407e,
+};
+
+#define ACC_NB (STRIPE_LEN / sizeof(U64))
+
+inline static void
+XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict key)
+{
+    /* Folds one 64-byte stripe into acc : acc64[i] += (d+k)[2i] * (d+k)[2i+1], with d+k wrapping at 32 bits. */
+#if (XXH_VECTOR == XXH_AVX2)
+
+    assert((((size_t)acc) & 31) == 0);   /* parenthesized : == binds tighter than & */
+
+                    __m256i* const xacc  =       (__m256i *) acc;
+              const __m256i* const xdata = (const __m256i *) data;
+    ALIGN(32) const __m256i* const xkey  = (const __m256i *) key;
+
+    for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+        __m256i const d   = _mm256_loadu_si256 (xdata+i);
+        __m256i const k   = _mm256_loadu_si256 (xkey+i);
+        __m256i const dk  = _mm256_add_epi32 (d,k);                                  /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
+        __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk,0x31));   /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */
+        xacc[i]           = _mm256_add_epi64(res, xacc[i]);                          /* xacc must be aligned on 32 bytes boundaries */
+    }
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+    assert((((size_t)acc) & 15) == 0);   /* parenthesized : == binds tighter than & */
+
+                    __m128i* const xacc  =       (__m128i *) acc;
+              const __m128i* const xdata = (const __m128i *) data;
+    ALIGN(16) const __m128i* const xkey  = (const __m128i *) key;
+
+    for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+        __m128i const d   = _mm_loadu_si128 (xdata+i);
+        __m128i const k   = _mm_loadu_si128 (xkey+i);
+        __m128i const dk  = _mm_add_epi32 (d,k);                               /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+        __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk,0x31));   /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
+        xacc[i]           = _mm_add_epi64(res, xacc[i]);                       /* xacc must be aligned on 16 bytes boundaries */
+    }
+
+#else // scalar variant
+
+          U64* const xacc  =       (U64*) acc;
+    const U32* const xdata = (const U32*) data;
+    const U32* const xkey  = (const U32*) key;
+
+    int i;
+    for (i=0; i < (int)ACC_NB; i++) {
+        int const left = 2*i;
+        int const right= 2*i + 1;
+        xacc[i] += (xdata[left] + xkey[left]) * (U64)(xdata[right] + xkey[right]);
+    }
+
+#endif
+}
+
+static void XXH3_scrambleAcc(void* acc, const void* key)
+{   /* per 64-bit lane : x ^= x >> 47; x ^= PRIME64_5; x = lo32(x)*key[even] ^ hi32(x)*key[odd] */
+#if (XXH_VECTOR == XXH_AVX2)
+
+    __m256i const xor_p5 = _mm256_set1_epi64x(PRIME64_5);
+
+    assert((((size_t)acc) & 31) == 0);   /* parenthesized : == binds tighter than & */
+    __m256i* const xacc = (__m256i*) acc;
+    const __m256i* const xkey  = (const __m256i *) key;
+
+    for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+        __m256i data = xacc[i];
+        __m256i const shifted = _mm256_srli_epi64(data, 47);
+        data = _mm256_xor_si256(data, shifted);
+        data = _mm256_xor_si256(data, xor_p5);
+
+        __m256i const k   = _mm256_loadu_si256 (xkey+i);
+        __m256i const dk  = _mm256_mul_epu32 (data,k);                     /* uint64 dk[4]  = {d0*k0, d2*k2, d4*k4, d6*k6} : low halves */
+
+        __m256i const d2  = _mm256_shuffle_epi32 (data,0x31);
+        __m256i const k2  = _mm256_shuffle_epi32 (k,0x31);
+        __m256i const dk2 = _mm256_mul_epu32 (d2,k2);                      /* uint64 dk2[4] = {d1*k1, d3*k3, d5*k5, d7*k7} : high halves */
+
+        xacc[i] = _mm256_xor_si256(dk, dk2);
+    }
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+    __m128i const xor_p5 = _mm_set1_epi64((__m64)PRIME64_5);
+
+    assert((((size_t)acc) & 15) == 0);   /* parenthesized : == binds tighter than & */
+    __m128i* const xacc = (__m128i*) acc;
+    const __m128i* const xkey  = (const __m128i *) key;
+
+    for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+        __m128i data = xacc[i];
+        __m128i const shifted = _mm_srli_epi64(data, 47);
+        data = _mm_xor_si128(data, shifted);
+        data = _mm_xor_si128(data, xor_p5);
+
+        __m128i const k   = _mm_loadu_si128 (xkey+i);
+        __m128i const dk  = _mm_mul_epu32 (data,k);                     /* uint64 dk[2]  = {d0*k0, d2*k2} : low halves */
+
+        __m128i const d2  = _mm_shuffle_epi32 (data,0x31);
+        __m128i const k2  = _mm_shuffle_epi32 (k,0x31);
+        __m128i const dk2 = _mm_mul_epu32 (d2,k2);                      /* uint64 dk2[2] = {d1*k1, d3*k3} : high halves */
+
+        xacc[i] = _mm_xor_si128(dk, dk2);
+    }
+
+#else   /* scalar variant */
+
+          U64* const xacc =       (U64*) acc;
+    const U32* const xkey = (const U32*) key;
+
+    int i;
+    for (i=0; i < (int)ACC_NB; i++) {
+        int const left = 2*i;
+        int const right= 2*i + 1;
+        xacc[i] ^= xacc[i] >> 47;
+        xacc[i] ^= PRIME64_5;
+
+        U64 p1 = (xacc[i] & 0xFFFFFFFF) * xkey[left];   /* low half  x even key : same pairing as the SIMD variants */
+        U64 p2 = (xacc[i] >> 32) * xkey[right];         /* high half x odd key */
+        xacc[i] = p1 ^ p2;
+    }
+
+#endif
+}
+
+static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* restrict key, size_t nbStripes)
+{
+    for (size_t n = 0; n < nbStripes; n++ ) {
+        XXH3_accumulate_512(acc, (const BYTE*)data + n*STRIPE_LEN, key);
+        key += 2;
+    }
+}
+
+
+__attribute__((noinline)) static U64    // it seems better for XXH3_64b that hashLong is not inlined : may mess up the switch case ?
+XXH3_hashLong(const void* data, size_t len)
+{
+    ALIGN(64) U64 acc[ACC_NB] = { len, PRIME64_1, PRIME64_2, PRIME64_3, -len };
+
+    #define NB_KEYS ((KEYSET_DEFAULT_SIZE - STRIPE_ELTS) / 2)
+
+    size_t const block_len = STRIPE_LEN * NB_KEYS;
+    size_t const nb_blocks = len / block_len;
+
+    for (size_t n = 0; n < nb_blocks; n++) {
+        XXH3_accumulate(acc, (const BYTE*)data + n*block_len, kKey, NB_KEYS);
+        XXH3_scrambleAcc(acc, kKey + (KEYSET_DEFAULT_SIZE - STRIPE_ELTS));
+    }
+
+    /* last partial block */
+    assert(len > STRIPE_LEN);
+    size_t const nbStripes = (len % block_len) / STRIPE_LEN;
+    assert(nbStripes < NB_KEYS);
+    XXH3_accumulate(acc, (const BYTE*)data + nb_blocks*block_len, kKey, nbStripes);
+
+    /* last stripe */
+    if (len & (STRIPE_LEN - 1)) {
+        const BYTE* const p = (const BYTE*) data + len - STRIPE_LEN;
+        XXH3_accumulate_512(acc, p, kKey + nbStripes*2);
+    }
+
+    /* converge into final hash */
+    return XXH3_finalMerge_8u64(acc[0], acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7], PRIME64_2);
+}
+
+
+// ==========================================
+// Public prototype
+// ==========================================
+
+XXH_PUBLIC_API U64 XXH3_64b(const void* data, size_t len)
+{
+    switch ((len-1) / 16) {  /* intentional underflow */
+        case 0: return XXH3_len_1to16_64b(data, len);
+        case 1: return XXH3_len_17to32_64b(data, len);
+        case 2:
+        case 3: return XXH3_len_33to64_64b(data, len);  /* 33-64 */
+        default:;
+    }
+    if (len==0) return 0;
+    if (len <= 96) return XXH3_len_65to96_64b(data, len);
+    if (len <= 128) return XXH3_len_97to128_64b(data, len);
+    return XXH3_hashLong(data, len);
+}
+
+#endif  /* XXH3_H */
diff --git a/xxhash.c b/xxhash.c
index ce38f71b..5b678313 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -1029,4 +1029,14 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
     return XXH_readBE64(src);
 }
 
+
+
+/* *******************************************************************
+*  XXH3
+*  New generation hash designed for speed on small keys and vectorization
+*********************************************************************/
+
+#include "xxh3.h"
+
+
 #endif  /* XXH_NO_LONG_LONG */
diff --git a/xxhsum.c b/xxhsum.c
index d9f5be2d..da5a566d 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -265,6 +265,9 @@ static U32 localXXH32(const void* buffer, size_t bufferSize, U32 seed) { return
 
 static U32 localXXH64(const void* buffer, size_t bufferSize, U32 seed) { return (U32)XXH64(buffer, bufferSize, seed); }
 
+U64 XXH3_64b(const void* data, size_t len);
+static U32 localXXH3_64b(const void* buffer, size_t bufferSize, U32 seed) { (void)seed; return (U32)XXH3_64b(buffer, bufferSize); }
+
 static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, size_t bufferSize)
 {
     U32 nbh_perIteration = (U32)((300 MB) / (bufferSize+1)) + 1;  /* first loop conservatively aims for 300 MB/s */
@@ -330,7 +333,15 @@ static int BMK_benchMem(const void* buffer, size_t bufferSize, U32 specificTest)
     if ((specificTest==0) | (specificTest==4))
         BMK_benchHash(localXXH64, "XXH64 unaligned", ((const char*)buffer)+3, bufferSize);
 
-    if (specificTest > 4) {
+    /* Bench XXH3 */
+    if ((specificTest==0) | (specificTest==5))
+        BMK_benchHash(localXXH3_64b, "XXH3_64bits", buffer, bufferSize);
+
+    /* Bench XXH3 on Unaligned input */
+    if ((specificTest==0) | (specificTest==6))
+        BMK_benchHash(localXXH3_64b, "XXH3_64b unaligned", ((const char*)buffer)+3, bufferSize);
+
+    if (specificTest > 6) {
         DISPLAY("benchmark mode invalid \n");
         return 1;
     }
@@ -397,15 +408,15 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific
 
 
 
-static int BMK_benchInternal(size_t keySize, int specificTest)
+static int BMK_benchInternal(size_t keySize, U32 specificTest)
 {
     void* const buffer = calloc(keySize+16+3, 1);
-    if(!buffer) {
+    if (!buffer) {
         DISPLAY("\nError: not enough memory!\n");
         return 12;
     }
 
-    {   void* const alignedBuffer = ((char*)buffer+15) - (((size_t)((char*)buffer+15)) & 0xF);  /* align on next 16 bytes */
+    {   const void* const alignedBuffer = ((char*)buffer+15) - (((size_t)((char*)buffer+15)) & 0xF);  /* align on next 16 bytes */
 
         /* bench */
         DISPLAYLEVEL(1, "Sample of ");
@@ -749,10 +760,10 @@ typedef struct {
     char*           lineBuf;
     size_t          blockSize;
     char*           blockBuf;
-    int             strictMode;
-    int             statusOnly;
-    int             warn;
-    int             quiet;
+    U32             strictMode;
+    U32             statusOnly;
+    U32             warn;
+    U32             quiet;
     ParseFileReport report;
 } ParseFileArg;
 
@@ -766,7 +777,7 @@ typedef struct {
 static GetLineResult getLine(char** lineBuf, int* lineMax, FILE* inFile)
 {
     GetLineResult result = GetLine_ok;
-    int len = 0;
+    size_t len = 0;
 
     if ((*lineBuf == NULL) || (*lineMax<1)) {
         free(*lineBuf);  /* in case it's != NULL */
@@ -787,9 +798,9 @@ static GetLineResult getLine(char** lineBuf, int* lineMax, FILE* inFile)
         }
 
         /* Make enough space for len+1 (for final NUL) bytes. */
-        if (len+1 >= *lineMax) {
+        if (len+1 >= (size_t)*lineMax) {
             char* newLineBuf = NULL;
-            int newBufSize = *lineMax;
+            size_t newBufSize = (size_t)*lineMax;
 
             newBufSize += (newBufSize/2) + 1; /* x 1.5 */
             if (newBufSize > MAX_LINE_LENGTH) newBufSize = MAX_LINE_LENGTH;
@@ -799,7 +810,7 @@ static GetLineResult getLine(char** lineBuf, int* lineMax, FILE* inFile)
             if (newLineBuf == NULL) return GetLine_outOfMemory;
 
             *lineBuf = newLineBuf;
-            *lineMax = newBufSize;
+            *lineMax = (int)newBufSize;
         }
 
         if (c == '\n') break;
@@ -1214,24 +1225,51 @@ static int badusage(const char* exename)
     return 1;
 }
 
-/*! readU32FromChar() :
-   @return : unsigned integer value read from input in `char` format,
-             0 is no figure at *stringPtr position.
-    Interprets K, KB, KiB, M, MB and MiB suffix.
-    Modifies `*stringPtr`, advancing it to position where reading stopped.
-    Note : function result can overflow if digit string > MAX_UINT */
-static unsigned readU32FromChar(const char** stringPtr)
+static void errorOut(const char* msg)
+{
+    DISPLAY("%s \n", msg); exit(1);
+}
+
+/*! readU32FromCharChecked() :
+ * @return 0 if success, and store the result in *value.
+ *  allows and interprets K, KB, KiB, M, MB and MiB suffix.
+ *  Will also modify `*stringPtr`, advancing it to position where it stopped reading.
+ * @return 1 if an overflow error occurs */
+static int readU32FromCharChecked(const char** stringPtr, unsigned* value)
 {
+    static unsigned const max = (((unsigned)(-1)) / 10) - 1;
     unsigned result = 0;
-    while ((**stringPtr >='0') && (**stringPtr <='9'))
-        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    while ((**stringPtr >='0') && (**stringPtr <='9')) {
+        if (result > max) return 1; // overflow error
+        result *= 10;
+        result += (unsigned)(**stringPtr - '0');
+        (*stringPtr)++ ;
+    }
     if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        unsigned const maxK = ((unsigned)(-1)) >> 10;
+        if (result > maxK) return 1; // overflow error
         result <<= 10;
-        if (**stringPtr=='M') result <<= 10;
-        (*stringPtr)++ ;
+        if (**stringPtr=='M') {
+            if (result > maxK) return 1; // overflow error
+            result <<= 10;
+        }
+        (*stringPtr)++;  /* skip `K` or `M` */
         if (**stringPtr=='i') (*stringPtr)++;
         if (**stringPtr=='B') (*stringPtr)++;
     }
+    *value = result;
+    return 0;
+}
+
+/*! readU32FromChar() :
+ * @return : unsigned integer value read from input in `char` format.
+ *  allows and interprets K, KB, KiB, M, MB and MiB suffix.
+ *  Will also modify `*stringPtr`, advancing it to position where it stopped reading.
+ *  Note : function will exit() program if digit sequence overflows */
+static unsigned readU32FromChar(const char** stringPtr) {
+    static const char errorMsg[] = "error: numeric value too large";
+    unsigned result;
+    if (readU32FromCharChecked(stringPtr, &result)) { errorOut(errorMsg); }
     return result;
 }
 

From 43c10239c97b7519589516cbfb2dae1087dcc8e4 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 26 Feb 2019 13:45:56 -0800
Subject: [PATCH 21/73] minor C90 adaptation fixes

added -Wconversion flag
---
 Makefile |   2 +-
 xxh3.h   | 281 +++++++++++++++++++++++++++++--------------------------
 xxhash.c |   6 +-
 xxhash.h |  48 ++++++----
 xxhsum.c |   1 -
 5 files changed, 179 insertions(+), 159 deletions(-)

diff --git a/Makefile b/Makefile
index e88013b6..4c426ab0 100644
--- a/Makefile
+++ b/Makefile
@@ -42,7 +42,7 @@ NOSSE4 :=
 endif
 
 CFLAGS ?= -O2 $(NOSSE4) # disables potential auto-vectorization
-DEBUGFLAGS+=-Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
+DEBUGFLAGS+=-Wall -Wextra -Wconversion -Wcast-qual -Wcast-align -Wshadow \
             -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
             -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
             -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
diff --git a/xxh3.h b/xxh3.h
index f425545f..c8d70880 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -2,21 +2,17 @@
 #define XXH3_H
 
 
+#undef XXH_INLINE_ALL   /* in case it's already defined */
 #define XXH_INLINE_ALL
 #include "xxhash.h"
 
 #define NDEBUG
 #include <assert.h>
 
-//#include <stdio.h>
-#define TRACE(...)  //printf(__VA_ARGS__)
 
-
-// ==========================================
-// Vectorization detection
-// ==========================================
-
-// macro enums
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
 #define XXH_SCALAR 0
 #define XXH_SSE2   1
 #define XXH_AVX2   2
@@ -32,9 +28,9 @@
 #endif
 
 
-// ==========================================
-// Short keys
-// ==========================================
+/* ==========================================
+ * Short keys
+ * ========================================== */
 
 static U64 XXH3_mixHigh(U64 val) {
   return val ^ (val >> 47);
@@ -42,17 +38,17 @@ static U64 XXH3_mixHigh(U64 val) {
 
 static U64 XXH3_finalMerge_2u64(U64 ll1, U64 ll2, U64 mul)
 {
-    U64 const llcomb1 = XXH3_mixHigh((ll1 ^ ll2) * mul);
-    U64 const llcomb2 = XXH3_mixHigh((ll2 ^ llcomb1) * mul);
-    return llcomb2 * mul;
+    U64 const ll11 = XXH3_mixHigh((ll1 ^ ll2) * mul);
+    U64 const ll21 = XXH3_mixHigh((ll2 ^ ll11) * mul);
+    return ll21 * mul;
 }
 
 static U64 XXH3_finalMerge_4u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, U64 mul)
 {
-    U64 const llcomb1 = XXH_rotl64(ll1 + ll2, 43) + XXH_rotl64(ll3, 30) + ll4;
-    U64 const llcomb2 = ll1 + XXH_rotl64(ll2 + PRIME64_3, 18) + ll3;
+    U64 const ll11 = XXH_rotl64(ll1 + ll2, 43) + XXH_rotl64(ll3, 30) + ll4;
+    U64 const ll12 = ll1 + XXH_rotl64(ll2 + PRIME64_3, 18) + ll3;
 
-    return XXH3_finalMerge_2u64(llcomb1, llcomb2, mul);
+    return XXH3_finalMerge_2u64(ll11, ll12, mul);
 }
 
 static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4,
@@ -77,44 +73,49 @@ static inline U64 XXH3_len_1to3_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len > 0 && len <= 3);
-    BYTE const c1 = ((const BYTE*)data)[0];
-    BYTE const c2 = ((const BYTE*)data)[len >> 1];
-    BYTE const c3 = ((const BYTE*)data)[len - 1];
-    U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
-    U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
-    U64  const ll3 = (l1 * PRIME64_2) ^ (l2 * PRIME64_1);
-    return XXH3_mixHigh(ll3) * PRIME64_3;
+    {   BYTE const c1 = ((const BYTE*)data)[0];
+        BYTE const c2 = ((const BYTE*)data)[len >> 1];
+        BYTE const c3 = ((const BYTE*)data)[len - 1];
+        U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
+        U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
+        U64  const ll3 = (l1 * PRIME64_2) ^ (l2 * PRIME64_1);
+        return XXH3_mixHigh(ll3) * PRIME64_3;
+    }
 }
 
+
 static inline U64 XXH3_len_4to8_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len >= 4 && len <= 8);
-    U64 const mul = PRIME64_2 + (len * 2);  /* keep it odd */
-    U64 const ll1 = XXH_read32(data);
-    U64 const ll2 = XXH_read32((const BYTE*)data + len - 4) + PRIME64_1;
-    return XXH3_finalMerge_2u64((len-1) + (ll1 << 3), ll2, mul);
+    {   U64 const mul = PRIME64_2 + (len * 2);  /* keep it odd */
+        U64 const ll1 = XXH_read32(data);
+        U64 const ll2 = XXH_read32((const BYTE*)data + len - 4) + PRIME64_1;
+        return XXH3_finalMerge_2u64((len-1) + (ll1 << 3), ll2, mul);
+    }
 }
 
 static inline U64 XXH3_len_9to16_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len >= 9 && len <= 16);
-    U64 const ll1 = XXH_read64(data) + PRIME64_1;
-    U64 const ll2 = XXH_read64((const BYTE*)data + len - 8);
-    U64 const mul = PRIME64_2 + len * 2;  /* keep it odd */
-    U64 const llcomb3 = ll1 * mul + XXH_rotl64(ll2, 23);
-    U64 const llcomb4 = ll2 * mul + XXH_rotl64(ll1, 37);
-    return XXH3_finalMerge_2u64(llcomb3, llcomb4, mul);
+    {   U64 const ll1 = XXH_read64(data) + PRIME64_1;
+        U64 const ll2 = XXH_read64((const BYTE*)data + len - 8);
+        U64 const mul = PRIME64_2 + len * 2;  /* keep it odd */
+        U64 const llcomb3 = ll1 * mul + XXH_rotl64(ll2, 23);
+        U64 const llcomb4 = ll2 * mul + XXH_rotl64(ll1, 37);
+        return XXH3_finalMerge_2u64(llcomb3, llcomb4, mul);
+    }
 }
 
 static inline U64 XXH3_len_1to16_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len > 0 && len <= 16);
-    if (len > 8) return XXH3_len_9to16_64b(data, len);
-    if (len >= 4) return XXH3_len_4to8_64b(data, len);
-    return XXH3_len_1to3_64b(data, len);
+    {   if (len > 8) return XXH3_len_9to16_64b(data, len);
+        if (len >= 4) return XXH3_len_4to8_64b(data, len);
+        return XXH3_len_1to3_64b(data, len);
+    }
 }
 
 
@@ -122,15 +123,17 @@ static U64 XXH3_len_17to32_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len > 16 && len <= 32);
-    const BYTE* const p = (const BYTE*)data;
 
-    U64 const mul = PRIME64_3 + len * 2;  /* keep it odd */
-    U64 const ll1 = XXH_read64(p) * PRIME64_1;
-    U64 const ll2 = XXH_read64(p + 8);
-    U64 const ll3 = XXH_read64(p + len - 8) * mul;
-    U64 const ll4 = XXH_read64(p + len - 16) * PRIME64_2;
+    {   const BYTE* const p = (const BYTE*)data;
 
-    return XXH3_finalMerge_4u64(ll1, ll2, ll3, ll4, mul);
+        U64 const mul = PRIME64_3 + len * 2;  /* keep it odd */
+        U64 const ll1 = XXH_read64(p) * PRIME64_1;
+        U64 const ll2 = XXH_read64(p + 8);
+        U64 const ll3 = XXH_read64(p + len - 8) * mul;
+        U64 const ll4 = XXH_read64(p + len - 16) * PRIME64_2;
+
+        return XXH3_finalMerge_4u64(ll1, ll2, ll3, ll4, mul);
+    }
 }
 
 
@@ -138,20 +141,22 @@ static U64 XXH3_len_33to64_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len > 33 && len <= 64);
-    const BYTE* const p = (const BYTE*)data;
 
-    U64 const mul = PRIME64_2 + len * 2;   /* keep it odd */
+    {   const BYTE* const p = (const BYTE*)data;
 
-    U64 const ll1 = XXH_read64(p);
-    U64 const ll2 = XXH_read64(p + 8);
-    U64 const ll3 = XXH_read64(p + 16);
-    U64 const ll4 = XXH_read64(p + 24);
-    U64 const ll5 = XXH_read64(p + len - 32);
-    U64 const ll6 = XXH_read64(p + len - 24);
-    U64 const ll7 = XXH_read64(p + len - 16);
-    U64 const ll8 = XXH_read64(p + len - 8);
+        U64 const mul = PRIME64_2 + len * 2;   /* keep it odd */
 
-    return XXH3_finalMerge_8u64(ll1, ll2, ll3, ll4, ll5, ll6, ll7, ll8, mul);
+        U64 const ll1 = XXH_read64(p);
+        U64 const ll2 = XXH_read64(p + 8);
+        U64 const ll3 = XXH_read64(p + 16);
+        U64 const ll4 = XXH_read64(p + 24);
+        U64 const ll5 = XXH_read64(p + len - 32);
+        U64 const ll6 = XXH_read64(p + len - 24);
+        U64 const ll7 = XXH_read64(p + len - 16);
+        U64 const ll8 = XXH_read64(p + len - 8);
+
+        return XXH3_finalMerge_8u64(ll1, ll2, ll3, ll4, ll5, ll6, ll7, ll8, mul);
+    }
 }
 
 
@@ -159,28 +164,32 @@ static U64 XXH3_len_65to96_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len > 64 && len <= 96);
-    const BYTE* const p = (const BYTE*)data;
 
-    U64 const ll1 = XXH3_len_33to64_64b(data, 64);
-    U64 const ll2 = XXH3_len_17to32_64b(p + len - 32, 32);
-    return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len);
+    {   const BYTE* const p = (const BYTE*)data;
+
+        U64 const ll1 = XXH3_len_33to64_64b(data, 64);
+        U64 const ll2 = XXH3_len_17to32_64b(p + len - 32, 32);
+        return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len);
+    }
 }
 
 static U64 XXH3_len_97to128_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len > 96 && len <= 128);
-    const BYTE* const p = (const BYTE*)data;
 
-    U64 const ll1 = XXH3_len_33to64_64b(data, 64);
-    U64 const ll2 = XXH3_len_33to64_64b(p + 64, len - 64);
-    return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len);
+    {   const BYTE* const p = (const BYTE*)data;
+
+        U64 const ll1 = XXH3_len_33to64_64b(data, 64);
+        U64 const ll2 = XXH3_len_33to64_64b(p + 64, len - 64);
+        return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len);
+    }
 }
 
 
-// ==========================================
-// Long keys
-// ==========================================
+/* ==========================================
+ * Long keys
+ * ========================================== */
 
 #if __GNUC__
 #include <x86intrin.h>
@@ -194,7 +203,7 @@ static U64 XXH3_len_97to128_64b(const void* data, size_t len)
 
 #define STRIPE_LEN 64
 #define STRIPE_ELTS (STRIPE_LEN / sizeof(U32))
-#define KEYSET_DEFAULT_SIZE 48  // minimum 32
+#define KEYSET_DEFAULT_SIZE 48   /* minimum 32 */
 
 
 ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = {
@@ -218,40 +227,39 @@ ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = {
 inline static void
 XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict key)
 {
-
 #if (XXH_VECTOR == XXH_AVX2)
 
     assert(((size_t)acc) & 31 == 0);
-
-                    __m256i* const xacc  =       (__m256i *) acc;
-              const __m256i* const xdata = (const __m256i *) data;
-    ALIGN(32) const __m256i* const xkey  = (const __m256i *) key;
-
-    for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
-        __m256i const d   = _mm256_loadu_si256 (xdata+i);
-        __m256i const k   = _mm256_loadu_si256 (xkey+i);
-        __m256i const dk  = _mm256_add_epi32 (d,k);                                  /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
-        __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk,0x31));   /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */
-        xacc[i]           = _mm256_add_epi64(res, xacc[i]);                          /* xacc must be aligned on 32 bytes boundaries */
+    {                   __m256i* const xacc  =       (__m256i *) acc;
+                  const __m256i* const xdata = (const __m256i *) data;
+        ALIGN(32) const __m256i* const xkey  = (const __m256i *) key;
+
+        for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+            __m256i const d   = _mm256_loadu_si256 (xdata+i);
+            __m256i const k   = _mm256_loadu_si256 (xkey+i);
+            __m256i const dk  = _mm256_add_epi32 (d,k);                                  /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
+            __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk,0x31));   /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */
+            xacc[i]           = _mm256_add_epi64(res, xacc[i]);                          /* xacc must be aligned on 32 bytes boundaries */
+        }
     }
 
 #elif (XXH_VECTOR == XXH_SSE2)
 
     assert(((size_t)acc) & 15 == 0);
-
-                    __m128i* const xacc  =       (__m128i *) acc;
-              const __m128i* const xdata = (const __m128i *) data;
-    ALIGN(16) const __m128i* const xkey  = (const __m128i *) key;
-
-    for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
-        __m128i const d   = _mm_loadu_si128 (xdata+i);
-        __m128i const k   = _mm_loadu_si128 (xkey+i);
-        __m128i const dk  = _mm_add_epi32 (d,k);                               /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
-        __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk,0x31));   /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
-        xacc[i]           = _mm_add_epi64(res, xacc[i]);                       /* xacc must be aligned on 16 bytes boundaries */
+    {                   __m128i* const xacc  =       (__m128i *) acc;
+                  const __m128i* const xdata = (const __m128i *) data;
+        ALIGN(16) const __m128i* const xkey  = (const __m128i *) key;
+
+        for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+            __m128i const d   = _mm_loadu_si128 (xdata+i);
+            __m128i const k   = _mm_loadu_si128 (xkey+i);
+            __m128i const dk  = _mm_add_epi32 (d,k);                               /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+            __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk,0x31));   /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
+            xacc[i]           = _mm_add_epi64(res, xacc[i]);                       /* xacc must be aligned on 16 bytes boundaries */
+        }
     }
 
-#else // scalar variant
+#else   /* scalar variant */
 
           U64* const xacc  =       (U64*) acc;
     const U32* const xdata = (const U32*) data;
@@ -271,55 +279,56 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
 {
 #if (XXH_VECTOR == XXH_AVX2)
 
-    __m256i const xor_p5 = _mm256_set1_epi64x(PRIME64_5);
-
     assert(((size_t)acc) & 31 == 0);
-    __m256i* const xacc = (__m256i*) acc;
-    const __m256i* const xkey  = (const __m256i *) key;
+    {   __m256i* const xacc = (__m256i*) acc;
+        const __m256i* const xkey  = (const __m256i *) key;
+
+        __m256i const xor_p5 = _mm256_set1_epi64x(PRIME64_5);
 
-    for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
-        __m256i data = xacc[i];
-        __m256i const shifted = _mm256_srli_epi64(data, 47);
-        data = _mm256_xor_si256(data, shifted);
-        data = _mm256_xor_si256(data, xor_p5);
+        for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+            __m256i data = xacc[i];
+            __m256i const shifted = _mm256_srli_epi64(data, 47);
+            data = _mm256_xor_si256(data, shifted);
+            data = _mm256_xor_si256(data, xor_p5);
 
-        __m256i const k   = _mm256_loadu_si256 (xkey+i);
-        __m256i const dk  = _mm256_mul_epu32 (data,k);                     /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+            {   __m256i const k   = _mm256_loadu_si256 (xkey+i);
+                __m256i const dk  = _mm256_mul_epu32 (data,k);          /* U32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
 
-        __m256i const d2  = _mm256_shuffle_epi32 (data,0x31);
-        __m256i const k2  = _mm256_shuffle_epi32 (k,0x31);
-        __m256i const dk2 = _mm256_mul_epu32 (d2,k2);                      /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+                __m256i const d2  = _mm256_shuffle_epi32 (data,0x31);
+                __m256i const k2  = _mm256_shuffle_epi32 (k,0x31);
+                __m256i const dk2 = _mm256_mul_epu32 (d2,k2);           /* U32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
 
-        xacc[i] = _mm256_xor_si256(dk, dk2);
+                xacc[i] = _mm256_xor_si256(dk, dk2);
+        }   }
     }
 
 #elif (XXH_VECTOR == XXH_SSE2)
 
-    __m128i const xor_p5 = _mm_set1_epi64((__m64)PRIME64_5);
-
     assert(((size_t)acc) & 15 == 0);
-    __m128i* const xacc = (__m128i*) acc;
-    const __m128i* const xkey  = (const __m128i *) key;
+    {   __m128i* const xacc = (__m128i*) acc;
+        const __m128i* const xkey  = (const __m128i *) key;
+        __m128i const xor_p5 = _mm_set1_epi64((__m64)PRIME64_5);
 
-    for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
-        __m128i data = xacc[i];
-        __m128i const shifted = _mm_srli_epi64(data, 47);
-        data = _mm_xor_si128(data, shifted);
-        data = _mm_xor_si128(data, xor_p5);
+        for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+            __m128i data = xacc[i];
+            __m128i const shifted = _mm_srli_epi64(data, 47);
+            data = _mm_xor_si128(data, shifted);
+            data = _mm_xor_si128(data, xor_p5);
 
-        __m128i const k   = _mm_loadu_si128 (xkey+i);
-        __m128i const dk  = _mm_mul_epu32 (data,k);                     /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+            {   __m128i const k   = _mm_loadu_si128 (xkey+i);
+                __m128i const dk  = _mm_mul_epu32 (data,k);          /* U32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
 
-        __m128i const d2  = _mm_shuffle_epi32 (data,0x31);
-        __m128i const k2  = _mm_shuffle_epi32 (k,0x31);
-        __m128i const dk2 = _mm_mul_epu32 (d2,k2);                      /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+                __m128i const d2  = _mm_shuffle_epi32 (data,0x31);
+                __m128i const k2  = _mm_shuffle_epi32 (k,0x31);
+                __m128i const dk2 = _mm_mul_epu32 (d2,k2);           /* U32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
 
-        xacc[i] = _mm_xor_si128(dk, dk2);
+                xacc[i] = _mm_xor_si128(dk, dk2);
+        }   }
     }
 
 #else   /* scalar variant */
 
-          U64* const xacc =       (U64*) acc
+          U64* const xacc =       (U64*) acc;
     const U32* const xkey = (const U32*) key;
 
     int i;
@@ -346,7 +355,7 @@ static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* rest
 }
 
 
-__attribute__((noinline)) static U64    // it seems better for XXH3_64b that hashLong is not inlined : may mess up the switch case ?
+__attribute__((noinline)) static U64    /* It seems better for XXH3_64b to have hashLong not inlined : may mess up the switch case ? */
 XXH3_hashLong(const void* data, size_t len)
 {
     ALIGN(64) U64 acc[ACC_NB] = { len, PRIME64_1, PRIME64_2, PRIME64_3, -len };
@@ -363,24 +372,24 @@ XXH3_hashLong(const void* data, size_t len)
 
     /* last partial block */
     assert(len > STRIPE_LEN);
-    size_t const nbStripes = (len % block_len) / STRIPE_LEN;
-    assert(nbStripes < NB_KEYS);
-    XXH3_accumulate(acc, (const BYTE*)data + nb_blocks*block_len, kKey, nbStripes);
-
-    /* last stripe */
-    if (len & (STRIPE_LEN - 1)) {
-        const BYTE* const p = (const BYTE*) data + len - STRIPE_LEN;
-        XXH3_accumulate_512(acc, p, kKey + nbStripes*2);
-    }
+    {   size_t const nbStripes = (len % block_len) / STRIPE_LEN;
+        assert(nbStripes < NB_KEYS);
+        XXH3_accumulate(acc, (const BYTE*)data + nb_blocks*block_len, kKey, nbStripes);
+
+        /* last stripe */
+        if (len & (STRIPE_LEN - 1)) {
+            const BYTE* const p = (const BYTE*) data + len - STRIPE_LEN;
+            XXH3_accumulate_512(acc, p, kKey + nbStripes*2);
+    }   }
 
     /* converge into final hash */
     return XXH3_finalMerge_8u64(acc[0], acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7], PRIME64_2);
 }
 
 
-// ==========================================
-// Public prototype
-// ==========================================
+/* ==========================================
+ * Public prototype
+ * ========================================== */
 
 XXH_PUBLIC_API U64 XXH3_64b(const void* data, size_t len)
 {
@@ -397,4 +406,6 @@ XXH_PUBLIC_API U64 XXH3_64b(const void* data, size_t len)
     return XXH3_hashLong(data, len);
 }
 
+
+
 #endif  /* XXH3_H */
diff --git a/xxhash.c b/xxhash.c
index 5b678313..9e598c5e 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -462,12 +462,12 @@ XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_end
     {   const BYTE* p = (const BYTE*)input;
         const BYTE* const bEnd = p + len;
 
-        state->total_len_32 += (unsigned)len;
-        state->large_len |= (len>=16) | (state->total_len_32>=16);
+        state->total_len_32 += (XXH32_hash_t)len;
+        state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));
 
         if (state->memsize + len < 16)  {   /* fill in tmp buffer */
             XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
-            state->memsize += (unsigned)len;
+            state->memsize += (XXH32_hash_t)len;
             return XXH_OK;
         }
 
diff --git a/xxhash.h b/xxhash.h
index 84942e8e..56b6aa55 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -159,7 +159,7 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 ***************************************/
 #define XXH_VERSION_MAJOR    0
 #define XXH_VERSION_MINOR    6
-#define XXH_VERSION_RELEASE  5
+#define XXH_VERSION_RELEASE  6
 #define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
 XXH_PUBLIC_API unsigned XXH_versionNumber (void);
 
@@ -247,6 +247,16 @@ XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
 typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
 XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+
+/*-**********************************************************************
+*  XXH3
+*  New experimental hash
+************************************************************************/
+
+XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len);
+
+
 #endif  /* XXH_NO_LONG_LONG */
 
 
@@ -289,33 +299,33 @@ struct XXH64_state_s {
    uint64_t v4;
    uint64_t mem64[4];
    uint32_t memsize;
-   uint32_t reserved[2];          /* never read nor write, might be removed in a future version */
+   uint32_t reserved[2];   /* never read nor write, might be removed in a future version */
 };   /* typedef'd to XXH64_state_t */
 
 # else
 
 struct XXH32_state_s {
-   unsigned total_len_32;
-   unsigned large_len;
-   unsigned v1;
-   unsigned v2;
-   unsigned v3;
-   unsigned v4;
-   unsigned mem32[4];
-   unsigned memsize;
-   unsigned reserved;   /* never read nor write, might be removed in a future version */
+   XXH32_hash_t total_len_32;
+   XXH32_hash_t large_len;
+   XXH32_hash_t v1;
+   XXH32_hash_t v2;
+   XXH32_hash_t v3;
+   XXH32_hash_t v4;
+   XXH32_hash_t mem32[4];
+   XXH32_hash_t memsize;
+   XXH32_hash_t reserved;   /* never read nor write, might be removed in a future version */
 };   /* typedef'd to XXH32_state_t */
 
 #   ifndef XXH_NO_LONG_LONG  /* remove 64-bit support */
 struct XXH64_state_s {
-   unsigned long long total_len;
-   unsigned long long v1;
-   unsigned long long v2;
-   unsigned long long v3;
-   unsigned long long v4;
-   unsigned long long mem64[4];
-   unsigned memsize;
-   unsigned reserved[2];     /* never read nor write, might be removed in a future version */
+   XXH64_hash_t total_len;
+   XXH64_hash_t v1;
+   XXH64_hash_t v2;
+   XXH64_hash_t v3;
+   XXH64_hash_t v4;
+   XXH64_hash_t mem64[4];
+   XXH32_hash_t memsize;
+   XXH32_hash_t reserved[2];     /* never read nor write, might be removed in a future version */
 };   /* typedef'd to XXH64_state_t */
 #    endif
 
diff --git a/xxhsum.c b/xxhsum.c
index da5a566d..af5e46f4 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -265,7 +265,6 @@ static U32 localXXH32(const void* buffer, size_t bufferSize, U32 seed) { return
 
 static U32 localXXH64(const void* buffer, size_t bufferSize, U32 seed) { return (U32)XXH64(buffer, bufferSize, seed); }
 
-U64 XXH3_64b(const void* data, size_t len);
 static U32 localXXH3_64b(const void* buffer, size_t bufferSize, U32 seed) { (void)seed; return (U32)XXH3_64b(buffer, bufferSize); }
 
 static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, size_t bufferSize)

From e0c6a9e8809b8a9f851e0f656aada697d7798c4d Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 26 Feb 2019 15:14:05 -0800
Subject: [PATCH 22/73] fixed xxh3 namespace issue

---
 xxhash.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/xxhash.h b/xxhash.h
index 56b6aa55..1782789e 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -254,6 +254,10 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
 *  New experimental hash
 ************************************************************************/
 
+#ifdef XXH_NAMESPACE
+#  define XXH3_64b XXH_NAME2(XXH_NAMESPACE, XXH3_64b)
+#endif
+
 XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len);
 
 

From 94bebd5b86ba76c52036376e603bbb6c103a476a Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 26 Feb 2019 15:24:59 -0800
Subject: [PATCH 23/73] xxh3: more C90 compatibility

---
 xxh3.h   | 35 ++++++++++++++++++++++++++---------
 xxhsum.c |  6 +++---
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index c8d70880..8239677a 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -2,6 +2,8 @@
 #define XXH3_H
 
 
+/* ===   Dependencies   === */
+
 #undef XXH_INLINE_ALL   /* in case it's already defined */
 #define XXH_INLINE_ALL
 #include "xxhash.h"
@@ -10,6 +12,14 @@
 #include <assert.h>
 
 
+/* ===   Compiler versions   === */
+
+#if !(defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)   /* C99+ */
+#  define restrict   /* disable */
+#endif
+
+
+
 /* ==========================================
  * Vectorization detection
  * ========================================== */
@@ -28,6 +38,7 @@
 #endif
 
 
+
 /* ==========================================
  * Short keys
  * ========================================== */
@@ -69,7 +80,7 @@ static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4,
 }
 
 
-static inline U64 XXH3_len_1to3_64b(const void* data, size_t len)
+XXH_FORCE_INLINE U64 XXH3_len_1to3_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len > 0 && len <= 3);
@@ -84,7 +95,7 @@ static inline U64 XXH3_len_1to3_64b(const void* data, size_t len)
 }
 
 
-static inline U64 XXH3_len_4to8_64b(const void* data, size_t len)
+XXH_FORCE_INLINE U64 XXH3_len_4to8_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len >= 4 && len <= 8);
@@ -95,7 +106,7 @@ static inline U64 XXH3_len_4to8_64b(const void* data, size_t len)
     }
 }
 
-static inline U64 XXH3_len_9to16_64b(const void* data, size_t len)
+XXH_FORCE_INLINE U64 XXH3_len_9to16_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len >= 9 && len <= 16);
@@ -108,7 +119,7 @@ static inline U64 XXH3_len_9to16_64b(const void* data, size_t len)
     }
 }
 
-static inline U64 XXH3_len_1to16_64b(const void* data, size_t len)
+XXH_FORCE_INLINE U64 XXH3_len_1to16_64b(const void* data, size_t len)
 {
     assert(data != NULL);
     assert(len > 0 && len <= 16);
@@ -187,6 +198,7 @@ static U64 XXH3_len_97to128_64b(const void* data, size_t len)
 }
 
 
+
 /* ==========================================
  * Long keys
  * ========================================== */
@@ -224,7 +236,7 @@ ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = {
 
 #define ACC_NB (STRIPE_LEN / sizeof(U64))
 
-inline static void
+XXH_FORCE_INLINE void
 XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict key)
 {
 #if (XXH_VECTOR == XXH_AVX2)
@@ -250,7 +262,8 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
                   const __m128i* const xdata = (const __m128i *) data;
         ALIGN(16) const __m128i* const xkey  = (const __m128i *) key;
 
-        for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
             __m128i const d   = _mm_loadu_si128 (xdata+i);
             __m128i const k   = _mm_loadu_si128 (xkey+i);
             __m128i const dk  = _mm_add_epi32 (d,k);                               /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
@@ -309,7 +322,8 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
         const __m128i* const xkey  = (const __m128i *) key;
         __m128i const xor_p5 = _mm_set1_epi64((__m64)PRIME64_5);
 
-        for (size_t i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
             __m128i data = xacc[i];
             __m128i const shifted = _mm_srli_epi64(data, 47);
             data = _mm_xor_si128(data, shifted);
@@ -348,7 +362,8 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
 
 static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* restrict key, size_t nbStripes)
 {
-    for (size_t n = 0; n < nbStripes; n++ ) {
+    size_t n;
+    for (n = 0; n < nbStripes; n++ ) {
         XXH3_accumulate_512(acc, (const BYTE*)data + n*STRIPE_LEN, key);
         key += 2;
     }
@@ -365,7 +380,8 @@ XXH3_hashLong(const void* data, size_t len)
     size_t const block_len = STRIPE_LEN * NB_KEYS;
     size_t const nb_blocks = len / block_len;
 
-    for (size_t n = 0; n < nb_blocks; n++) {
+    size_t n;
+    for (n = 0; n < nb_blocks; n++) {
         XXH3_accumulate(acc, (const BYTE*)data + n*block_len, kKey, NB_KEYS);
         XXH3_scrambleAcc(acc, kKey + (KEYSET_DEFAULT_SIZE - STRIPE_ELTS));
     }
@@ -387,6 +403,7 @@ XXH3_hashLong(const void* data, size_t len)
 }
 
 
+
 /* ==========================================
  * Public prototype
  * ========================================== */
diff --git a/xxhsum.c b/xxhsum.c
index af5e46f4..55ef4301 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -1239,17 +1239,17 @@ static int readU32FromCharChecked(const char** stringPtr, unsigned* value)
     static unsigned const max = (((unsigned)(-1)) / 10) - 1;
     unsigned result = 0;
     while ((**stringPtr >='0') && (**stringPtr <='9')) {
-        if (result > max) return 1; // overflow error
+        if (result > max) return 1; /* overflow error */
         result *= 10;
         result += (unsigned)(**stringPtr - '0');
         (*stringPtr)++ ;
     }
     if ((**stringPtr=='K') || (**stringPtr=='M')) {
         unsigned const maxK = ((unsigned)(-1)) >> 10;
-        if (result > maxK) return 1; // overflow error
+        if (result > maxK) return 1; /* overflow error */
         result <<= 10;
         if (**stringPtr=='M') {
-            if (result > maxK) return 1; // overflow error
+            if (result > maxK) return 1; /* overflow error */
             result <<= 10;
         }
         (*stringPtr)++;  /* skip `K` or `M` */

From 7784d41ce3345fc2e40e81991651555e62d1167e Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 26 Feb 2019 16:36:03 -0800
Subject: [PATCH 24/73] fixed ARM compilation error

---
 xxh3.h | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 8239677a..8133dde3 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -18,6 +18,18 @@
 #  define restrict   /* disable */
 #endif
 
+#if defined(__GNUC__)
+#  if defined(__SSE2__)
+#    include <x86intrin.h>
+#  endif
+#  define ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#  define ALIGN(n)      __declspec(align(n))
+#else
+#  define ALIGN(n)   // disabled
+#endif
+
 
 
 /* ==========================================
@@ -203,16 +215,6 @@ static U64 XXH3_len_97to128_64b(const void* data, size_t len)
  * Long keys
  * ========================================== */
 
-#if __GNUC__
-#include <x86intrin.h>
-#define ALIGN(n)      __attribute__ ((aligned(n)))
-#elif _MSC_VER
-#include <intrin.h>
-#define ALIGN(n)      __declspec(align(n))
-#else
-#define ALIGN(n)
-#endif
-
 #define STRIPE_LEN 64
 #define STRIPE_ELTS (STRIPE_LEN / sizeof(U32))
 #define KEYSET_DEFAULT_SIZE 48   /* minimum 32 */
@@ -408,7 +410,7 @@ XXH3_hashLong(const void* data, size_t len)
  * Public prototype
  * ========================================== */
 
-XXH_PUBLIC_API U64 XXH3_64b(const void* data, size_t len)
+XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len)
 {
     switch ((len-1) / 16) {  /* intentional underflow */
         case 0: return XXH3_len_1to16_64b(data, len);

From 2be95459cde521061573f7bd5b6df60d5b678769 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 26 Feb 2019 16:42:50 -0800
Subject: [PATCH 25/73] fixed minor c90 warning

---
 xxh3.h   | 8 ++++----
 xxhsum.c | 6 ++++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 8133dde3..52fcef4b 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -354,10 +354,10 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
         xacc[i] ^= xacc[i] >> 47;
         xacc[i] ^= PRIME64_5;
 
-        U64 p1 = (xacc[i] >> 32) * xkey[left];
-        U64 p2 = (xacc[i] & 0xFFFFFFFF) * xkey[right];
-        xacc[i] = p1 ^ p2;
-    }
+        {   U64 p1 = (xacc[i] >> 32) * xkey[left];
+            U64 p2 = (xacc[i] & 0xFFFFFFFF) * xkey[right];
+            xacc[i] = p1 ^ p2;
+    }   }
 
 #endif
 }
diff --git a/xxhsum.c b/xxhsum.c
index 55ef4301..657cf783 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -1266,9 +1266,11 @@ static int readU32FromCharChecked(const char** stringPtr, unsigned* value)
  *  Will also modify `*stringPtr`, advancing it to position where it stopped reading.
  *  Note : function will exit() program if digit sequence overflows */
 static unsigned readU32FromChar(const char** stringPtr) {
-    static const char errorMsg[] = "error: numeric value too large";
     unsigned result;
-    if (readU32FromCharChecked(stringPtr, &result)) { errorOut(errorMsg); }
+    if (readU32FromCharChecked(stringPtr, &result)) {
+        static const char errorMsg[] = "error: numeric value too large";
+        errorOut(errorMsg);
+    }
     return result;
 }
 

From c6c39030fba6acbb737b2622bdbae9912ad1cc7a Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 26 Feb 2019 16:49:23 -0800
Subject: [PATCH 26/73] ensure warnings are blocking during tests

added -Werror flag to target test-all
---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 4c426ab0..99996db6 100644
--- a/Makefile
+++ b/Makefile
@@ -226,6 +226,7 @@ preview-man: clean-man man
 
 test: all namespaceTest check test-xxhsum-c c90test
 
+test-all: CFLAGS += -Werror
 test-all: test test32 armtest clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze cppcheck
 
 .PHONY: listL120

From 5b827f538c5903c8200b22b098b90d95fe7c4c33 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 26 Feb 2019 18:38:20 -0800
Subject: [PATCH 27/73] improved 8-way mixer

---
 xxh3.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 52fcef4b..daebdbc9 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -27,7 +27,7 @@
 #  include <intrin.h>
 #  define ALIGN(n)      __declspec(align(n))
 #else
-#  define ALIGN(n)   // disabled
+#  define ALIGN(n)   /* disabled */
 #endif
 
 
@@ -79,16 +79,14 @@ static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4,
                                 U64 mul)
 {
     U64 const ll11 = XXH_rotl64(ll1 + ll7, 21) + (XXH_rotl64(ll2, 34) + ll3) * 9;
-    U64 const ll12 = ((ll1 + ll2) ^ ll4) + ll6 + 1;
+    U64 const ll12 = XXH_rotl64(((ll1 + ll2) ^ ll4), 17) + ll6 + 1;
     U64 const ll13 = XXH_rotl64(ll5 + ll6, 22) + ll3;
-    U64 const ll14 = ll5 + XXH_rotl64(ll8, 11) + ll3;
+    U64 const ll14 = ll5 + XXH_rotl64(ll8, 23) + ll7;
 
-    U64 const ll21 = XXH_swap64((ll11 + ll12) * mul) + ll8;
-    U64 const ll31 = (XXH_swap64((ll12 + ll21) * mul) + ll7) * mul;
-    U64 const ll41 = XXH_swap64((ll13 + ll14) * mul + ll31) + ll2;
-    U64 const ll51 = XXH3_mixHigh((ll14 + ll41) * mul + ll4 + ll8) * mul;
+    U64 const ll21 = (XXH_swap64((ll11 + ll12) * mul) + ll13) * mul + ll8;
+    U64 const ll22 = (XXH_swap64((ll12 + ll14) * mul) + ll4) * mul;
 
-    return ll51 + ll13;
+    return XXH3_finalMerge_2u64(ll21, ll22, mul);
 }
 
 
@@ -124,10 +122,10 @@ XXH_FORCE_INLINE U64 XXH3_len_9to16_64b(const void* data, size_t len)
     assert(len >= 9 && len <= 16);
     {   U64 const ll1 = XXH_read64(data) + PRIME64_1;
         U64 const ll2 = XXH_read64((const BYTE*)data + len - 8);
-        U64 const mul = PRIME64_2 + len * 2;  /* keep it odd */
-        U64 const llcomb3 = ll1 * mul + XXH_rotl64(ll2, 23);
-        U64 const llcomb4 = ll2 * mul + XXH_rotl64(ll1, 37);
-        return XXH3_finalMerge_2u64(llcomb3, llcomb4, mul);
+        U64 const mul = PRIME64_2 + (len * 2);  /* keep it odd */
+        U64 const ll11 = (ll1 * mul) + XXH_rotl64(ll2, 23);
+        U64 const ll12 = (ll2 * mul) + XXH_rotl64(ll1, 37);
+        return XXH3_finalMerge_2u64(ll11, ll12, mul);
     }
 }
 
@@ -407,7 +405,7 @@ XXH3_hashLong(const void* data, size_t len)
 
 
 /* ==========================================
- * Public prototype
+ * Public entry point
  * ========================================== */
 
 XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len)

From fa31d0b02f492a80cf9d972fde7464f19b6230a1 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 27 Feb 2019 15:03:23 -0800
Subject: [PATCH 28/73] xxh3: fixed last minor quality metric

in extended tests
---
 xxh3.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index daebdbc9..826f4c2f 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -69,7 +69,7 @@ static U64 XXH3_finalMerge_2u64(U64 ll1, U64 ll2, U64 mul)
 static U64 XXH3_finalMerge_4u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, U64 mul)
 {
     U64 const ll11 = XXH_rotl64(ll1 + ll2, 43) + XXH_rotl64(ll3, 30) + ll4;
-    U64 const ll12 = ll1 + XXH_rotl64(ll2 + PRIME64_3, 18) + ll3;
+    U64 const ll12 = ll1 + XXH_rotl64(ll2, 18) + ll3 + PRIME64_3;
 
     return XXH3_finalMerge_2u64(ll11, ll12, mul);
 }
@@ -79,12 +79,12 @@ static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4,
                                 U64 mul)
 {
     U64 const ll11 = XXH_rotl64(ll1 + ll7, 21) + (XXH_rotl64(ll2, 34) + ll3) * 9;
-    U64 const ll12 = XXH_rotl64(((ll1 + ll2) ^ ll4), 17) + ll6 + 1;
-    U64 const ll13 = XXH_rotl64(ll5 + ll6, 22) + ll3;
-    U64 const ll14 = ll5 + XXH_rotl64(ll8, 23) + ll7;
+    U64 const ll12 = XXH_rotl64(((ll1 + ll2) ^ ll4), 17) + ll6 + PRIME64_5;
+    U64 const ll13 = XXH_rotl64(ll5 * PRIME64_4 + ll6, 46) + ll3;
+    U64 const ll14 = XXH_rotl64(ll8, 23) + XXH_rotl64(ll5 + ll7, 12);
 
-    U64 const ll21 = (XXH_swap64((ll11 + ll12) * mul) + ll13) * mul + ll8;
-    U64 const ll22 = (XXH_swap64((ll12 + ll14) * mul) + ll4) * mul;
+    U64 const ll21 = (XXH_swap64((ll11 + ll12) * PRIME64_1) + ll13) * PRIME64_3 + ll8;
+    U64 const ll22 = (XXH_swap64((ll12 + ll14) * PRIME64_2) + ll4) * mul;
 
     return XXH3_finalMerge_2u64(ll21, ll22, mul);
 }
@@ -373,7 +373,7 @@ static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* rest
 __attribute__((noinline)) static U64    /* It seems better for XXH3_64b to have hashLong not inlined : may mess up the switch case ? */
 XXH3_hashLong(const void* data, size_t len)
 {
-    ALIGN(64) U64 acc[ACC_NB] = { len, PRIME64_1, PRIME64_2, PRIME64_3, -len };
+    ALIGN(64) U64 acc[ACC_NB] = { 0, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5 };
 
     #define NB_KEYS ((KEYSET_DEFAULT_SIZE - STRIPE_ELTS) / 2)
 
@@ -399,7 +399,7 @@ XXH3_hashLong(const void* data, size_t len)
     }   }
 
     /* converge into final hash */
-    return XXH3_finalMerge_8u64(acc[0], acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7], PRIME64_2);
+    return XXH3_finalMerge_8u64(acc[0] + len, acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7] - len, PRIME64_2 + len*2);
 }
 
 

From cd626c344b2fc571825e2b08dc974226b920dded Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 27 Feb 2019 16:05:20 -0800
Subject: [PATCH 29/73] Makefile : switch default optimization to -O3

because gcc is pretty bad at vectorization with -O2.

Also : documented the clang problem with XXH32 auto-vectorization
which must be prevented for better performance.
---
 Makefile |  2 +-
 xxhash.c | 12 +++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 99996db6..25a5dfdc 100644
--- a/Makefile
+++ b/Makefile
@@ -41,7 +41,7 @@ else
 NOSSE4 :=
 endif
 
-CFLAGS ?= -O2 $(NOSSE4) # disables potential auto-vectorization
+CFLAGS ?= -O3 $(NOSSE4) # disables potential auto-vectorization
 DEBUGFLAGS+=-Wall -Wextra -Wconversion -Wcast-qual -Wcast-align -Wshadow \
             -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
             -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
diff --git a/xxhash.c b/xxhash.c
index 9e598c5e..02f5cd53 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -348,7 +348,6 @@ XXH32_finalize(U32 h32, const void* ptr, size_t len,
     return h32;   /* reaching this point is deemed impossible */
 }
 
-
 XXH_FORCE_INLINE U32
 XXH32_endian_align(const void* input, size_t len, U32 seed,
                     XXH_endianess endian, XXH_alignment align)
@@ -371,6 +370,17 @@ XXH32_endian_align(const void* input, size_t len, U32 seed,
         U32 v3 = seed + 0;
         U32 v4 = seed - PRIME32_1;
 
+        /* note : clang will try to vectorize this loop, using pmulld instruction.
+         * This is a bad idea, and will result in substantial performance reduction.
+         * To prevent clang from "optimizing" this loop,
+         * it's necessary to disable SSE4 on command line (-mno-sse4).
+         * However, this is a build instruction, so it's outside of source code.
+         * Whenever xxhash.c is used in a different code base, build flags don't follow.
+         * It would be better to ensure vectorization is disabled from within the source code.
+         * Alas, so far, I've not found a working method.
+         * I tried both `#pragma` and `__attribute__`, but clang still vectorizes.
+         * Help welcomed.
+         * In the meantime, vectorization is prevented by the `Makefile` */
         do {
             v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4;
             v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;

From b348fa896a6a5fc30b67fa0ba1b2f184ea25d742 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Thu, 28 Feb 2019 16:43:44 -0800
Subject: [PATCH 30/73] restored 8-way mixer

---
 xxh3.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 826f4c2f..5e9f11ae 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -79,14 +79,16 @@ static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4,
                                 U64 mul)
 {
     U64 const ll11 = XXH_rotl64(ll1 + ll7, 21) + (XXH_rotl64(ll2, 34) + ll3) * 9;
-    U64 const ll12 = XXH_rotl64(((ll1 + ll2) ^ ll4), 17) + ll6 + PRIME64_5;
-    U64 const ll13 = XXH_rotl64(ll5 * PRIME64_4 + ll6, 46) + ll3;
-    U64 const ll14 = XXH_rotl64(ll8, 23) + XXH_rotl64(ll5 + ll7, 12);
+    U64 const ll12 = ((ll1 + ll2) ^ ll4) + ll6 + 1;
+    U64 const ll13 = XXH_rotl64(ll5 + ll6, 22) + ll3;
+    U64 const ll14 = ll5 + XXH_rotl64(ll8, 11) + ll3;
 
-    U64 const ll21 = (XXH_swap64((ll11 + ll12) * PRIME64_1) + ll13) * PRIME64_3 + ll8;
-    U64 const ll22 = (XXH_swap64((ll12 + ll14) * PRIME64_2) + ll4) * mul;
+    U64 const ll21 = XXH_swap64((ll11 + ll12) * mul) + ll8;
+    U64 const ll31 = (XXH_swap64((ll12 + ll21) * mul) + ll7) * mul;
+    U64 const ll41 = XXH_swap64((ll13 + ll14) * mul + ll31) + ll2;
+    U64 const ll51 = XXH3_mixHigh((ll14 + ll41) * mul + ll4 + ll8) * mul;
 
-    return XXH3_finalMerge_2u64(ll21, ll22, mul);
+    return ll51 + ll13;
 }
 
 

From 8d345470e6ef8d6ee80c91305a3f463021a27582 Mon Sep 17 00:00:00 2001
From: "easyaspi314 (Devin)" <easyaspi314@users.noreply.github.com>
Date: Thu, 28 Feb 2019 20:28:29 -0500
Subject: [PATCH 31/73] xxh3: add NEON support

Signed-off-by: easyaspi314 (Devin) <easyaspi314@users.noreply.github.com>
---
 xxh3.h | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/xxh3.h b/xxh3.h
index 826f4c2f..28fab7ab 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -21,6 +21,10 @@
 #if defined(__GNUC__)
 #  if defined(__SSE2__)
 #    include <x86intrin.h>
+#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#    define inline __inline__ /* clang bug */
+#    include <arm_neon.h>
+#    undef inline
 #  endif
 #  define ALIGN(n)      __attribute__ ((aligned(n)))
 #elif defined(_MSC_VER)
@@ -38,12 +42,16 @@
 #define XXH_SCALAR 0
 #define XXH_SSE2   1
 #define XXH_AVX2   2
+#define XXH_NEON   3
 
 #ifndef XXH_VECTOR    /* can be defined on command line */
 #  if defined(__AVX2__)
 #    define XXH_VECTOR XXH_AVX2
 #  elif defined(__SSE2__)
 #    define XXH_VECTOR XXH_SSE2
+/* msvc support maybe later */
+#  elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#    define XXH_VECTOR XXH_NEON
 #  else
 #    define XXH_VECTOR XXH_SCALAR
 #  endif
@@ -272,6 +280,59 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
         }
     }
 
+#elif (XXH_VECTOR == XXH_NEON)
+
+    assert(((size_t)acc) & 15 == 0);
+    {                 uint64x2_t* const xacc  = (uint64x2_t *)acc;
+                  const uint32_t* const xdata = (const uint32_t *)data;
+        ALIGN(16) const uint32_t* const xkey  = (const uint32_t *)key;
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
+#if !defined(__aarch64__) && !defined(__arm64__) && !defined(XXH_NO_ARM32_HACK)
+            /* On 32-bit ARM, we can take advantage of the packed registers.
+             * This is not portable to aarch64!
+             * Basically, on 32-bit NEON, registers are stored like so:
+             *  .----------------------------------.
+             *  |                q8                | // uint32x4_t
+             *  |-----------------.----------------|
+             *  |  d16 (.val[0])  |  d17 (.val[1]) | // uint32x2x2_t
+             *  '-----------------'----------------'
+             * vld2.32 will store its values into two double registers, returning
+             * a uint32x2_t. In NEON, this will be stored in, for example, d16 and d17.
+             * Reinterpret cast it to a uint32x4_t and you get q8 for free
+             *
+             * On aarch64, this was changed completely.
+             *
+             * aarch64 gave us 16 more quad registers, but they also removed this behavior,
+             * instead matching smaller registers to the lower sections of the higher
+             * registers and zeroing the rest.
+             *  .----------------------------------..---------------------------------.
+             *  |               v8.4s              |               v9.4s               |
+             *  |-----------------.----------------|-----------------.-----------------|
+             *  | v8.2s (.val[0]) |     <zero>     | v9.2s (.val[1]) |      <zero>     |
+             *  '-----------------'----------------'-----------------'-----------------'
+             * On aarch64, ld2 will put it into v8.2s and v9.2s. Reinterpreting
+             * is not going to help us here, as half of it will end up being zero. */
+
+            uint32x2x2_t d = vld2_u32(xdata + i * 4);     /* load and swap */
+            uint32x2x2_t k = vld2_u32(xkey + i * 4);
+            /* Not sorry about breaking the strict aliasing rule.
+             * Using a union causes GCC to spit out nonsense, but an alias cast
+             * does not. */
+            uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k);
+            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));
+#else
+            /* Portable, but slightly slower version */
+            uint32x2x2_t const d = vld2_u32(xdata + i * 4);
+            uint32x2x2_t const k = vld2_u32(xkey + i * 4);
+            uint32x2_t const dkL = vadd_u32(d.val[0], k.val[0]);
+            uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]);   /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+            /* xacc must be aligned on 16 bytes boundaries */
+            xacc[i] = vmlal_u32(xacc[i], dkL, dkH);                /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
+#endif
+        }
+    }
 #else   /* scalar variant */
 
           U64* const xacc  =       (U64*) acc;
@@ -340,6 +401,35 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
         }   }
     }
 
+#elif (XXH_VECTOR == XXH_NEON)
+
+    assert(((size_t)acc) & 15 == 0);
+    {       uint64x2_t* const xacc =       (uint64x2_t*) acc;
+        const uint32_t* const xkey  = (const uint32_t *) key;
+        uint64x2_t xor_p5 = vdupq_n_u64(PRIME64_5);
+        size_t i;
+        /* Clang and GCC like to put NEON constant loads into the loop. */
+        __asm__("" : "+w" (xor_p5));
+        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
+            uint64x2_t data = xacc[i];
+            uint64x2_t const shifted = vshrq_n_u64(data, 47);
+            data = veorq_u64(data, shifted);
+            data = veorq_u64(data, xor_p5);
+
+            {
+                /* shuffle: 0, 1, 2, 3 -> 0, 2, 1, 3 */
+                uint32x2x2_t const d =
+                    vzip_u32(
+                        vget_low_u32(vreinterpretq_u32_u64(data)),
+                        vget_high_u32(vreinterpretq_u32_u64(data))
+                    );
+                uint32x2x2_t const k = vld2_u32 (xkey+i*4);              /* load and swap */
+                uint64x2_t const dk  = vmull_u32(d.val[0],k.val[0]);     /* U64 dk[2]  = {d0 * k0, d2 * k2} */
+                uint64x2_t const dk2 = vmull_u32(d.val[1],k.val[1]);     /* U64 dk2[2] = {d1 * k1, d3 * k3} */
+                xacc[i] = veorq_u64(dk, dk2);                            /* xacc[i] = dk ^ dk2;             */
+        }   }
+    }
+
 #else   /* scalar variant */
 
           U64* const xacc =       (U64*) acc;

From d034ce8269f1bea127354685762bc0000a3ff134 Mon Sep 17 00:00:00 2001
From: "easyaspi314 (Devin)" <easyaspi314@users.noreply.github.com>
Date: Sat, 2 Mar 2019 01:00:28 -0500
Subject: [PATCH 32/73] Automatically warn + disable NEON implementation for
 aarch64 GCC < 7.

GCC < 7 on aarch64 generates code that runs at about 1.8 GB/s, whereas
Clang 3.8 generates code that runs at 5 GB/s on the same machine from
the same C source code.
---
 xxh3.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/xxh3.h b/xxh3.h
index e0d7c8e9..6d0c7dd3 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -49,6 +49,11 @@
 #    define XXH_VECTOR XXH_AVX2
 #  elif defined(__SSE2__)
 #    define XXH_VECTOR XXH_SSE2
+/* GCC < 7 for aarch64 generates unreasonably slow code for the NEON
+ * implementation. We fall back to the scalar version and emit a warning. */
+#  elif defined(__aarch64__) && !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 7
+#    warning Your GCC version has broken NEON support. Please use GCC 7+ or Clang.
+#    define XXH_VECTOR XXH_SCALAR
 /* msvc support maybe later */
 #  elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__ARM_NEON))
 #    define XXH_VECTOR XXH_NEON

From 8a08cbc10ce363b13ce2df4b3b430e2ea34a4660 Mon Sep 17 00:00:00 2001
From: "easyaspi314 (Devin)" <easyaspi314@users.noreply.github.com>
Date: Sat, 2 Mar 2019 19:25:31 -0500
Subject: [PATCH 33/73] Improve aarch64 code. There is no longer need to
 disable NEON on GCC 6

This new code is faster and vectorizes properly on GCC 6.

Apparently, aarch64 really hates shuffling.
---
 xxh3.h | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 6d0c7dd3..42c6e244 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -49,11 +49,6 @@
 #    define XXH_VECTOR XXH_AVX2
 #  elif defined(__SSE2__)
 #    define XXH_VECTOR XXH_SSE2
-/* GCC < 7 for aarch64 generates unreasonably slow code for the NEON
- * implementation. We fall back to the scalar version and emit a warning. */
-#  elif defined(__aarch64__) && !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 7
-#    warning Your GCC version has broken NEON support. Please use GCC 7+ or Clang.
-#    define XXH_VECTOR XXH_SCALAR
 /* msvc support maybe later */
 #  elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__ARM_NEON))
 #    define XXH_VECTOR XXH_NEON
@@ -320,23 +315,26 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
              *  | v8.2s (.val[0]) |     <zero>     | v9.2s (.val[1]) |      <zero>     |
              *  '-----------------'----------------'-----------------'-----------------'
              * On aarch64, ld2 will put it into v8.2s and v9.2s. Reinterpreting
-             * is not going to help us here, as half of it will end up being zero. */
+             * is not going to help us here, as half of it will end up being zero.
+             *
+             * Even if it did, aarch64 apparently does really bad with shuffling, so
+             * we use a different method. */
 
             uint32x2x2_t d = vld2_u32(xdata + i * 4);     /* load and swap */
             uint32x2x2_t k = vld2_u32(xkey + i * 4);
             /* Not sorry about breaking the strict aliasing rule.
              * Using a union causes GCC to spit out nonsense, but an alias cast
              * does not. */
-            uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k);
-            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));
+            uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k);  /* dk = d + k */
+            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));    /* xacc[i] += (U64)dkLo * (U64)dkHi; */
 #else
-            /* Portable, but slightly slower version */
-            uint32x2x2_t const d = vld2_u32(xdata + i * 4);
-            uint32x2x2_t const k = vld2_u32(xkey + i * 4);
-            uint32x2_t const dkL = vadd_u32(d.val[0], k.val[0]);
-            uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]);   /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
-            /* xacc must be aligned on 16 bytes boundaries */
-            xacc[i] = vmlal_u32(xacc[i], dkL, dkH);                /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
+            /* A portable and aarch64-friendly version. It is slower on ARMv7a, though. */
+            uint32x4_t d = vld1q_u32(xdata + i * 4);
+            uint32x4_t k = vld1q_u32(xkey + i * 4);
+            /* Add d and k, then reinterpret to a uint64x2_t. This is not a long add. */
+            uint64x2_t dk = vreinterpretq_u64_u32(vaddq_u32(d, k));           /* dk = (U64)(d[1] + k[1]) << 32) | (d[0] + k[0]); */
+            /* Long multiply high and low bits. */
+            xacc[i] = vmlal_u32(xacc[i], vmovn_u64(dk), vshrn_n_u64(dk, 32)); /* xacc[i] += (dk & 0xFFFFFFFF) * (dk >> 32); */
 #endif
         }
     }
@@ -424,6 +422,12 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
             data = veorq_u64(data, xor_p5);
 
             {
+#ifdef __aarch64__
+                /* aarch64 prefers this method, ARMv7a prefers the other. */
+                uint64x2_t k = *(uint64x2_t *)(xkey + i * 4);
+                uint64x2_t const dk = vmull_u32(vmovn_u64(data), vmovn_u64(k));
+                uint64x2_t const dk2 = vmull_u32(vshrn_n_u64(data, 32), vshrn_n_u64(k, 32));
+#else
                 /* shuffle: 0, 1, 2, 3 -> 0, 2, 1, 3 */
                 uint32x2x2_t const d =
                     vzip_u32(
@@ -433,6 +437,7 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
                 uint32x2x2_t const k = vld2_u32 (xkey+i*4);              /* load and swap */
                 uint64x2_t const dk  = vmull_u32(d.val[0],k.val[0]);     /* U64 dk[2]  = {d0 * k0, d2 * k2} */
                 uint64x2_t const dk2 = vmull_u32(d.val[1],k.val[1]);     /* U64 dk2[2] = {d1 * k1, d3 * k3} */
+#endif
                 xacc[i] = veorq_u64(dk, dk2);                            /* xacc[i] = dk ^ dk2;             */
         }   }
     }

From 982a3ab59dbe04b8241e0ce2e8043a2e699330f3 Mon Sep 17 00:00:00 2001
From: Yann Collet <Cyan4973@users.noreply.github.com>
Date: Sat, 2 Mar 2019 18:26:20 -0800
Subject: [PATCH 34/73] Revert "Improve aarch64 performance"

---
 xxh3.h | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 42c6e244..e0d7c8e9 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -315,26 +315,23 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
              *  | v8.2s (.val[0]) |     <zero>     | v9.2s (.val[1]) |      <zero>     |
              *  '-----------------'----------------'-----------------'-----------------'
              * On aarch64, ld2 will put it into v8.2s and v9.2s. Reinterpreting
-             * is not going to help us here, as half of it will end up being zero.
-             *
-             * Even if it did, aarch64 apparently does really bad with shuffling, so
-             * we use a different method. */
+             * is not going to help us here, as half of it will end up being zero. */
 
             uint32x2x2_t d = vld2_u32(xdata + i * 4);     /* load and swap */
             uint32x2x2_t k = vld2_u32(xkey + i * 4);
             /* Not sorry about breaking the strict aliasing rule.
              * Using a union causes GCC to spit out nonsense, but an alias cast
              * does not. */
-            uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k);  /* dk = d + k */
-            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));    /* xacc[i] += (U64)dkLo * (U64)dkHi; */
+            uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k);
+            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));
 #else
-            /* A portable and aarch64-friendly version. It is slower on ARMv7a, though. */
-            uint32x4_t d = vld1q_u32(xdata + i * 4);
-            uint32x4_t k = vld1q_u32(xkey + i * 4);
-            /* Add d and k, then reinterpret to a uint64x2_t. This is not a long add. */
-            uint64x2_t dk = vreinterpretq_u64_u32(vaddq_u32(d, k));           /* dk = (U64)(d[1] + k[1]) << 32) | (d[0] + k[0]); */
-            /* Long multiply high and low bits. */
-            xacc[i] = vmlal_u32(xacc[i], vmovn_u64(dk), vshrn_n_u64(dk, 32)); /* xacc[i] += (dk & 0xFFFFFFFF) * (dk >> 32); */
+            /* Portable, but slightly slower version */
+            uint32x2x2_t const d = vld2_u32(xdata + i * 4);
+            uint32x2x2_t const k = vld2_u32(xkey + i * 4);
+            uint32x2_t const dkL = vadd_u32(d.val[0], k.val[0]);
+            uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]);   /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+            /* xacc must be aligned on 16 bytes boundaries */
+            xacc[i] = vmlal_u32(xacc[i], dkL, dkH);                /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
 #endif
         }
     }
@@ -422,12 +419,6 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
             data = veorq_u64(data, xor_p5);
 
             {
-#ifdef __aarch64__
-                /* aarch64 prefers this method, ARMv7a prefers the other. */
-                uint64x2_t k = *(uint64x2_t *)(xkey + i * 4);
-                uint64x2_t const dk = vmull_u32(vmovn_u64(data), vmovn_u64(k));
-                uint64x2_t const dk2 = vmull_u32(vshrn_n_u64(data, 32), vshrn_n_u64(k, 32));
-#else
                 /* shuffle: 0, 1, 2, 3 -> 0, 2, 1, 3 */
                 uint32x2x2_t const d =
                     vzip_u32(
@@ -437,7 +428,6 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
                 uint32x2x2_t const k = vld2_u32 (xkey+i*4);              /* load and swap */
                 uint64x2_t const dk  = vmull_u32(d.val[0],k.val[0]);     /* U64 dk[2]  = {d0 * k0, d2 * k2} */
                 uint64x2_t const dk2 = vmull_u32(d.val[1],k.val[1]);     /* U64 dk2[2] = {d1 * k1, d3 * k3} */
-#endif
                 xacc[i] = veorq_u64(dk, dk2);                            /* xacc[i] = dk ^ dk2;             */
         }   }
     }

From 48e3d724d17877506fef39e331b80fa73b5c8679 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 6 Mar 2019 11:55:48 -0500
Subject: [PATCH 35/73] updated xxh3

---
 xxh3.h | 261 ++++++++++++++++++++++++---------------------------------
 1 file changed, 110 insertions(+), 151 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index e0d7c8e9..32846a63 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -59,191 +59,113 @@
 
 
 
+
 /* ==========================================
- * Short keys
+ * XXH3 default settings
  * ========================================== */
 
-static U64 XXH3_mixHigh(U64 val) {
-  return val ^ (val >> 47);
-}
+#define KEYSET_DEFAULT_SIZE 48   /* minimum 32 */
 
-static U64 XXH3_finalMerge_2u64(U64 ll1, U64 ll2, U64 mul)
-{
-    U64 const ll11 = XXH3_mixHigh((ll1 ^ ll2) * mul);
-    U64 const ll21 = XXH3_mixHigh((ll2 ^ ll11) * mul);
-    return ll21 * mul;
-}
 
-static U64 XXH3_finalMerge_4u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4, U64 mul)
-{
-    U64 const ll11 = XXH_rotl64(ll1 + ll2, 43) + XXH_rotl64(ll3, 30) + ll4;
-    U64 const ll12 = ll1 + XXH_rotl64(ll2, 18) + ll3 + PRIME64_3;
+ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = {
+    0xb8fe6c39,0x23a44bbe,0x7c01812c,0xf721ad1c,
+    0xded46de9,0x839097db,0x7240a4a4,0xb7b3671f,
+    0xcb79e64e,0xccc0e578,0x825ad07d,0xccff7221,
+    0xb8084674,0xf743248e,0xe03590e6,0x813a264c,
+    0x3c2852bb,0x91c300cb,0x88d0658b,0x1b532ea3,
+    0x71644897,0xa20df94e,0x3819ef46,0xa9deacd8,
+    0xa8fa763f,0xe39c343f,0xf9dcbbc7,0xc70b4f1d,
+    0x8a51e04b,0xcdb45931,0xc89f7ec9,0xd9787364,
 
-    return XXH3_finalMerge_2u64(ll11, ll12, mul);
-}
+    0xeac5ac83,0x34d3ebc3,0xc581a0ff,0xfa1363eb,
+    0x170ddd51,0xb7f0da49,0xd3165526,0x29d4689e,
+    0x2b16be58,0x7d47a1fc,0x8ff8b8d1,0x7ad031ce,
+    0x45cb3a8f,0x95160428,0xafd7fbca,0xbb4b407e,
+};
 
-static U64 XXH3_finalMerge_8u64(U64 ll1, U64 ll2, U64 ll3, U64 ll4,
-                                U64 ll5, U64 ll6, U64 ll7, U64 ll8,
-                                U64 mul)
+XXH_FORCE_INLINE U64
+XXH3_mul128(U64 ll1, U64 ll2)
 {
-    U64 const ll11 = XXH_rotl64(ll1 + ll7, 21) + (XXH_rotl64(ll2, 34) + ll3) * 9;
-    U64 const ll12 = ((ll1 + ll2) ^ ll4) + ll6 + 1;
-    U64 const ll13 = XXH_rotl64(ll5 + ll6, 22) + ll3;
-    U64 const ll14 = ll5 + XXH_rotl64(ll8, 11) + ll3;
-
-    U64 const ll21 = XXH_swap64((ll11 + ll12) * mul) + ll8;
-    U64 const ll31 = (XXH_swap64((ll12 + ll21) * mul) + ll7) * mul;
-    U64 const ll41 = XXH_swap64((ll13 + ll14) * mul + ll31) + ll2;
-    U64 const ll51 = XXH3_mixHigh((ll14 + ll41) * mul + ll4 + ll8) * mul;
-
-    return ll51 + ll13;
+  __uint128_t lll = (__uint128_t)ll1 * ll2;
+  return (U64)lll + (lll >> 64);
 }
 
+static U64 XXH64_avalanche2(U64 h64)
+{
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+    return h64;
+}
 
-XXH_FORCE_INLINE U64 XXH3_len_1to3_64b(const void* data, size_t len)
+/* ==========================================
+ * Short keys
+ * ========================================== */
+XXH_FORCE_INLINE U64
+XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr)
 {
     assert(data != NULL);
     assert(len > 0 && len <= 3);
-    {   BYTE const c1 = ((const BYTE*)data)[0];
+    assert(keyPtr != NULL);
+    {   const U32* const key32 = (const U32*) keyPtr;
+        BYTE const c1 = ((const BYTE*)data)[0];
         BYTE const c2 = ((const BYTE*)data)[len >> 1];
         BYTE const c3 = ((const BYTE*)data)[len - 1];
         U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
         U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
-        U64  const ll3 = (l1 * PRIME64_2) ^ (l2 * PRIME64_1);
-        return XXH3_mixHigh(ll3) * PRIME64_3;
+        U64  const ll3 = (U64)(l1 + key32[0]) * (l2 + key32[1]);
+        return XXH64_avalanche2(ll3);
     }
 }
 
 
-XXH_FORCE_INLINE U64 XXH3_len_4to8_64b(const void* data, size_t len)
+XXH_FORCE_INLINE U64
+XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr)
 {
     assert(data != NULL);
     assert(len >= 4 && len <= 8);
-    {   U64 const mul = PRIME64_2 + (len * 2);  /* keep it odd */
-        U64 const ll1 = XXH_read32(data);
-        U64 const ll2 = XXH_read32((const BYTE*)data + len - 4) + PRIME64_1;
-        return XXH3_finalMerge_2u64((len-1) + (ll1 << 3), ll2, mul);
+    {   const U32* const key32 = (const U32*) keyPtr;
+        U64 acc = PRIME64_1 * len;
+        U64 const l1 = XXH_read32(data) + key32[0];
+        U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1];
+        acc += (U64)l1 * l2;
+        return XXH64_avalanche2(acc);
     }
 }
 
-XXH_FORCE_INLINE U64 XXH3_len_9to16_64b(const void* data, size_t len)
+XXH_FORCE_INLINE U64
+XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr)
 {
     assert(data != NULL);
+    assert(key != NULL);
     assert(len >= 9 && len <= 16);
-    {   U64 const ll1 = XXH_read64(data) + PRIME64_1;
-        U64 const ll2 = XXH_read64((const BYTE*)data + len - 8);
-        U64 const mul = PRIME64_2 + (len * 2);  /* keep it odd */
-        U64 const ll11 = (ll1 * mul) + XXH_rotl64(ll2, 23);
-        U64 const ll12 = (ll2 * mul) + XXH_rotl64(ll1, 37);
-        return XXH3_finalMerge_2u64(ll11, ll12, mul);
+    {   const U64* const key64 = (const U64*) keyPtr;
+        U64 acc = PRIME64_1 * len;
+        U64 const ll1 = XXH_read64(data) + key64[0];
+        U64 const ll2 = XXH_read64((const BYTE*)data + len - 8) + key64[1];
+        acc += XXH3_mul128(ll1, ll2);
+        return XXH64_avalanche2(acc);
     }
 }
 
-XXH_FORCE_INLINE U64 XXH3_len_1to16_64b(const void* data, size_t len)
+XXH_FORCE_INLINE U64 XXH3_len_0to16_64b(const void* data, size_t len)
 {
     assert(data != NULL);
-    assert(len > 0 && len <= 16);
-    {   if (len > 8) return XXH3_len_9to16_64b(data, len);
-        if (len >= 4) return XXH3_len_4to8_64b(data, len);
-        return XXH3_len_1to3_64b(data, len);
-    }
-}
-
-
-static U64 XXH3_len_17to32_64b(const void* data, size_t len)
-{
-    assert(data != NULL);
-    assert(len > 16 && len <= 32);
-
-    {   const BYTE* const p = (const BYTE*)data;
-
-        U64 const mul = PRIME64_3 + len * 2;  /* keep it odd */
-        U64 const ll1 = XXH_read64(p) * PRIME64_1;
-        U64 const ll2 = XXH_read64(p + 8);
-        U64 const ll3 = XXH_read64(p + len - 8) * mul;
-        U64 const ll4 = XXH_read64(p + len - 16) * PRIME64_2;
-
-        return XXH3_finalMerge_4u64(ll1, ll2, ll3, ll4, mul);
+    assert(len <= 16);
+    {   if (len > 8) return XXH3_len_9to16_64b(data, len, kKey);
+        if (len >= 4) return XXH3_len_4to8_64b(data, len, kKey);
+        if (len) return XXH3_len_1to3_64b(data, len, kKey);
+        return 0;
     }
 }
 
 
-static U64 XXH3_len_33to64_64b(const void* data, size_t len)
-{
-    assert(data != NULL);
-    assert(len > 33 && len <= 64);
-
-    {   const BYTE* const p = (const BYTE*)data;
-
-        U64 const mul = PRIME64_2 + len * 2;   /* keep it odd */
-
-        U64 const ll1 = XXH_read64(p);
-        U64 const ll2 = XXH_read64(p + 8);
-        U64 const ll3 = XXH_read64(p + 16);
-        U64 const ll4 = XXH_read64(p + 24);
-        U64 const ll5 = XXH_read64(p + len - 32);
-        U64 const ll6 = XXH_read64(p + len - 24);
-        U64 const ll7 = XXH_read64(p + len - 16);
-        U64 const ll8 = XXH_read64(p + len - 8);
-
-        return XXH3_finalMerge_8u64(ll1, ll2, ll3, ll4, ll5, ll6, ll7, ll8, mul);
-    }
-}
-
-
-static U64 XXH3_len_65to96_64b(const void* data, size_t len)
-{
-    assert(data != NULL);
-    assert(len > 64 && len <= 96);
-
-    {   const BYTE* const p = (const BYTE*)data;
-
-        U64 const ll1 = XXH3_len_33to64_64b(data, 64);
-        U64 const ll2 = XXH3_len_17to32_64b(p + len - 32, 32);
-        return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len);
-    }
-}
-
-static U64 XXH3_len_97to128_64b(const void* data, size_t len)
-{
-    assert(data != NULL);
-    assert(len > 96 && len <= 128);
-
-    {   const BYTE* const p = (const BYTE*)data;
-
-        U64 const ll1 = XXH3_len_33to64_64b(data, 64);
-        U64 const ll2 = XXH3_len_33to64_64b(p + 64, len - 64);
-        return XXH3_finalMerge_2u64(ll1, ll2, PRIME64_1 + 2*len);
-    }
-}
-
-
-
 /* ==========================================
  * Long keys
  * ========================================== */
 
 #define STRIPE_LEN 64
 #define STRIPE_ELTS (STRIPE_LEN / sizeof(U32))
-#define KEYSET_DEFAULT_SIZE 48   /* minimum 32 */
-
-
-ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = {
-    0xb8fe6c39,0x23a44bbe,0x7c01812c,0xf721ad1c,
-    0xded46de9,0x839097db,0x7240a4a4,0xb7b3671f,
-    0xcb79e64e,0xccc0e578,0x825ad07d,0xccff7221,
-    0xb8084674,0xf743248e,0xe03590e6,0x813a264c,
-    0x3c2852bb,0x91c300cb,0x88d0658b,0x1b532ea3,
-    0x71644897,0xa20df94e,0x3819ef46,0xa9deacd8,
-    0xa8fa763f,0xe39c343f,0xf9dcbbc7,0xc70b4f1d,
-    0x8a51e04b,0xcdb45931,0xc89f7ec9,0xd9787364,
-
-    0xeac5ac83,0x34d3ebc3,0xc581a0ff,0xfa1363eb,
-    0x170ddd51,0xb7f0da49,0xd3165526,0x29d4689e,
-    0x2b16be58,0x7d47a1fc,0x8ff8b8d1,0x7ad031ce,
-    0x45cb3a8f,0x95160428,0xafd7fbca,0xbb4b407e,
-};
-
 #define ACC_NB (STRIPE_LEN / sizeof(U64))
 
 XXH_FORCE_INLINE void
@@ -461,8 +383,25 @@ static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* rest
     }
 }
 
+XXH_FORCE_INLINE U64 XXH3_mix16B(const void* data, const U64* key)
+{
+    return XXH3_mul128((XXH_read64(data) ^ key[0]), XXH_read64((const BYTE*)data+8) ^ key[1]);
+}
+
+static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 len)
+{
+    const U64* const key = (const U64*)keyVoid;  /* presumed aligned */
+
+    U64 acc = PRIME64_1 * len;
+    acc += XXH3_mix16B(data+0, key+0);
+    acc += XXH3_mix16B(data+2, key+2);
+    acc += XXH3_mix16B(data+4, key+4);
+    acc += XXH3_mix16B(data+6, key+6);
+
+    return XXH64_avalanche2(acc);
+}
 
-__attribute__((noinline)) static U64    /* It seems better for XXH3_64b to have hashLong not inlined : may mess up the switch case ? */
+__attribute__((noinline)) static U64    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
 XXH3_hashLong(const void* data, size_t len)
 {
     ALIGN(64) U64 acc[ACC_NB] = { 0, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5 };
@@ -491,7 +430,9 @@ XXH3_hashLong(const void* data, size_t len)
     }   }
 
     /* converge into final hash */
-    return XXH3_finalMerge_8u64(acc[0] + len, acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7] - len, PRIME64_2 + len*2);
+    //return XXH3_finalMerge_8u64(acc[0] + len, acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7] - len, PRIME64_2 + len*2);
+    assert(sizeof(acc) == 64);
+    return XXH3_merge64B(acc, kKey, len);
 }
 
 
@@ -502,17 +443,35 @@ XXH3_hashLong(const void* data, size_t len)
 
 XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len)
 {
-    switch ((len-1) / 16) {  /* intentional underflow */
-        case 0: return XXH3_len_1to16_64b(data, len);
-        case 1: return XXH3_len_17to32_64b(data, len);
-        case 2:
-        case 3: return XXH3_len_33to64_64b(data, len);  /* 33-64 */
-        default:;
+    const BYTE* const p = (const BYTE*)data;
+    const U64* const key = (const U64*)(const void*)kKey;
+
+    if (len <= 16) return XXH3_len_0to16_64b(data, len);
+
+    {   U64 acc = PRIME64_1 * len;
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    if (len > 128) return XXH3_hashLong(data, len);
+
+                    acc += XXH3_mix16B(p+48, key+12);
+                    acc += XXH3_mix16B(p+len-64, key+14);
+                }
+
+                acc += XXH3_mix16B(p+32, key+8);
+                acc += XXH3_mix16B(p+len-48, key+10);
+            }
+
+            acc += XXH3_mix16B(p+16, key+4);
+            acc += XXH3_mix16B(p+len-32, key+6);
+
+        }
+
+        acc += XXH3_mix16B(p+0, key+0);
+        acc += XXH3_mix16B(p+len-16, key+2);
+
+        return XXH64_avalanche2(acc);
     }
-    if (len==0) return 0;
-    if (len <= 96) return XXH3_len_65to96_64b(data, len);
-    if (len <= 128) return XXH3_len_97to128_64b(data, len);
-    return XXH3_hashLong(data, len);
 }
 
 

From 8d96de3e1ca60c0d29243a9118d8f30ec09fb276 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 6 Mar 2019 17:46:42 -0500
Subject: [PATCH 36/73] added variant with seed

---
 xxh3.h | 55 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 32846a63..e81a525b 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -87,10 +87,10 @@ XXH_FORCE_INLINE U64
 XXH3_mul128(U64 ll1, U64 ll2)
 {
   __uint128_t lll = (__uint128_t)ll1 * ll2;
-  return (U64)lll + (lll >> 64);
+  return (U64)lll + (U64)(lll >> 64);
 }
 
-static U64 XXH64_avalanche2(U64 h64)
+static XXH64_hash_t XXH64_avalanche2(U64 h64)
 {
     h64 ^= h64 >> 29;
     h64 *= PRIME64_3;
@@ -98,11 +98,12 @@ static U64 XXH64_avalanche2(U64 h64)
     return h64;
 }
 
+
 /* ==========================================
  * Short keys
  * ========================================== */
-XXH_FORCE_INLINE U64
-XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr)
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
 {
     assert(data != NULL);
     assert(len > 0 && len <= 3);
@@ -113,19 +114,19 @@ XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr)
         BYTE const c3 = ((const BYTE*)data)[len - 1];
         U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
         U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
-        U64  const ll3 = (U64)(l1 + key32[0]) * (l2 + key32[1]);
+        U64  const ll3 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]);
         return XXH64_avalanche2(ll3);
     }
 }
 
 
-XXH_FORCE_INLINE U64
-XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr)
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
 {
     assert(data != NULL);
     assert(len >= 4 && len <= 8);
     {   const U32* const key32 = (const U32*) keyPtr;
-        U64 acc = PRIME64_1 * len;
+        U64 acc = PRIME64_1 * (len + seed);
         U64 const l1 = XXH_read32(data) + key32[0];
         U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1];
         acc += (U64)l1 * l2;
@@ -133,14 +134,14 @@ XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr)
     }
 }
 
-XXH_FORCE_INLINE U64
-XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr)
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
 {
     assert(data != NULL);
     assert(key != NULL);
     assert(len >= 9 && len <= 16);
     {   const U64* const key64 = (const U64*) keyPtr;
-        U64 acc = PRIME64_1 * len;
+        U64 acc = PRIME64_1 * (len + seed);
         U64 const ll1 = XXH_read64(data) + key64[0];
         U64 const ll2 = XXH_read64((const BYTE*)data + len - 8) + key64[1];
         acc += XXH3_mul128(ll1, ll2);
@@ -148,14 +149,15 @@ XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr)
     }
 }
 
-XXH_FORCE_INLINE U64 XXH3_len_0to16_64b(const void* data, size_t len)
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_0to16_64b(const void* data, size_t len, XXH64_hash_t seed)
 {
     assert(data != NULL);
     assert(len <= 16);
-    {   if (len > 8) return XXH3_len_9to16_64b(data, len, kKey);
-        if (len >= 4) return XXH3_len_4to8_64b(data, len, kKey);
-        if (len) return XXH3_len_1to3_64b(data, len, kKey);
-        return 0;
+    {   if (len > 8) return XXH3_len_9to16_64b(data, len, kKey, seed);
+        if (len >= 4) return XXH3_len_4to8_64b(data, len, kKey, seed);
+        if (len) return XXH3_len_1to3_64b(data, len, kKey, seed);
+        return seed;
     }
 }
 
@@ -401,10 +403,10 @@ static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 len)
     return XXH64_avalanche2(acc);
 }
 
-__attribute__((noinline)) static U64    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
-XXH3_hashLong(const void* data, size_t len)
+__attribute__((noinline)) static XXH64_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong(const void* data, size_t len, XXH64_hash_t seed)
 {
-    ALIGN(64) U64 acc[ACC_NB] = { 0, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5 };
+    ALIGN(64) U64 acc[ACC_NB] = { seed, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5, -seed };
 
     #define NB_KEYS ((KEYSET_DEFAULT_SIZE - STRIPE_ELTS) / 2)
 
@@ -441,18 +443,19 @@ XXH3_hashLong(const void* data, size_t len)
  * Public entry point
  * ========================================== */
 
-XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len)
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
 {
     const BYTE* const p = (const BYTE*)data;
     const U64* const key = (const U64*)(const void*)kKey;
 
-    if (len <= 16) return XXH3_len_0to16_64b(data, len);
+    if (len <= 16) return XXH3_len_0to16_64b(data, len, seed);
 
-    {   U64 acc = PRIME64_1 * len;
+    {   U64 acc = PRIME64_1 * (len + seed);
         if (len > 32) {
             if (len > 64) {
                 if (len > 96) {
-                    if (len > 128) return XXH3_hashLong(data, len);
+                    if (len > 128) return XXH3_hashLong(data, len, seed);
 
                     acc += XXH3_mix16B(p+48, key+12);
                     acc += XXH3_mix16B(p+len-64, key+14);
@@ -475,5 +478,11 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len)
 }
 
 
+XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len)
+{
+    return XXH3_64b_withSeed(data, len, 0);
+}
+
+
 
 #endif  /* XXH3_H */

From a951c0aebaa0b77aed43ce22b7e506b71f59c4e1 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 6 Mar 2019 23:42:04 -0500
Subject: [PATCH 37/73] xxh3: updated mul128 with a 32-bit fallback path

also:
started XXH128 (not finished yet)
---
 xxh3.h | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 171 insertions(+), 2 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index e81a525b..3b634e34 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -83,13 +83,57 @@ ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = {
     0x45cb3a8f,0x95160428,0xafd7fbca,0xbb4b407e,
 };
 
+
+
+
 XXH_FORCE_INLINE U64
 XXH3_mul128(U64 ll1, U64 ll2)
 {
-  __uint128_t lll = (__uint128_t)ll1 * ll2;
-  return (U64)lll + (U64)(lll >> 64);
+#if defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t lll = (__uint128_t)ll1 * ll2;
+    return (U64)lll + (U64)(lll >> 64);
+
+#elif defined(_M_X64) || defined(_M_IA64)
+
+#   pragma intrinsic(_umul128)
+    U64 llhigh;
+    U64 const lllow = _umul128(ll1, ll2, &llhigh);
+    return lllow + llhigh;
+
+#elif defined(__aarch64__)
+
+    U64 const llow = ll1 * ll2;
+    U64 llhigh;
+    asm ("umulh %0, %1, %2" : "=r" (llhigh) : "r" (ll1), "r" (ll2));
+    return lllow + llhigh;
+
+#else
+
+    /* emulate 64x64x->128b multiplication, using four 32x32->64 */
+    U32 const h1 = ll1 >> 32;
+    U32 const h2 = ll2 >> 32;
+    U32 const l1 = (U32)ll1;
+    U32 const l2 = (U32)ll2;
+
+    U64 const llh  = (U64)h1 * h2;
+    U64 const llm1 = (U64)l1 * h2;
+    U64 const llm2 = (U64)l2 * h1;
+    U64 const lll  = (U64)l1 * l2;
+
+    U64 const t = lll + (llm1 << 32);
+    U64 const carry1 = t < lll;
+
+    U64 const lllow = t + (llm2 << 32);
+    U64 const carry2 = lllow < t;
+    U64 const llhigh = llh + (llm1 >> 32) + (llm2 >> 32) + carry1 + carry2;
+
+    return llhigh + lllow;
+
+#endif
 }
 
+
 static XXH64_hash_t XXH64_avalanche2(U64 h64)
 {
     h64 ^= h64 >> 29;
@@ -485,4 +529,129 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len)
 
 
 
+/* ==========================================
+ * XXH3 128 bits
+ * Not ready yet !
+ * ========================================== */
+
+typedef struct {
+    XXH64_hash_t ll1;
+    XXH64_hash_t ll2;
+} XXH128_hash_t;
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
+{
+    assert(data != NULL);
+    assert(len > 0 && len <= 3);
+    assert(keyPtr != NULL);
+    {   const U32* const key32 = (const U32*) keyPtr;
+        BYTE const c1 = ((const BYTE*)data)[0];
+        BYTE const c2 = ((const BYTE*)data)[len >> 1];
+        BYTE const c3 = ((const BYTE*)data)[len - 1];
+        U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
+        U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
+        U64  const ll1 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]);
+        U64  const ll2 = (U64)(l1 - seed + key32[2]) * (l2 + key32[3]);
+        return (XXH128_hash_t) { XXH64_avalanche2(ll1), XXH64_avalanche2(ll2) };
+    }
+}
+
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
+{
+    assert(data != NULL);
+    assert(len >= 4 && len <= 8);
+    {   const U32* const key32 = (const U32*) keyPtr;
+        U64 acc1 = PRIME64_1 * ((U64)len + seed);
+        U64 acc2 = PRIME64_2 * ((U64)len - seed);
+        U64 const l1 = XXH_read32(data) + key32[0];
+        U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1];
+        acc1 += (U64)(l1 + key32[0]) * (l2 + key32[1]);
+        acc2 += (U64)(l1 + key32[2]) * (l2 + key32[3]);
+        return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) };
+    }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_9to16_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
+{
+    assert(data != NULL);
+    assert(key != NULL);
+    assert(len >= 9 && len <= 16);
+    {   const U64* const key64 = (const U64*) keyPtr;
+        U64 acc1 = PRIME64_1 * ((U64)len + seed);
+        U64 acc2 = PRIME64_2 * ((U64)len - seed);
+        U64 const ll1 = XXH_read64(data);
+        U64 const ll2 = XXH_read64((const BYTE*)data + len - 8);
+        acc1 += XXH3_mul128(ll1 + key64[0], ll2 + key64[1]);
+        acc2 += XXH3_mul128(ll1 + key64[2], ll2 + key64[3]);
+        return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) };
+    }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_0to16_128b(const void* data, size_t len, XXH64_hash_t seed)
+{
+    assert(data != NULL);
+    assert(len <= 16);
+    {   if (len > 8) return XXH3_len_9to16_128b(data, len, kKey, seed);
+        if (len >= 4) return XXH3_len_4to8_128b(data, len, kKey, seed);
+        if (len) return XXH3_len_1to3_128b(data, len, kKey, seed);
+        return (XXH128_hash_t) { seed, -seed };
+    }
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
+{
+    if (len <= 16) return XXH3_len_0to16_128b(data, len, seed);
+
+#if 0
+
+    {   U64 acc = PRIME64_1 * (len + seed);
+        const BYTE* const p = (const BYTE*)data;
+        const U64* const key = (const U64*)(const void*)kKey;
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    if (len > 128) return XXH3_hashLong(data, len, seed);
+
+                    acc += XXH3_mix16B(p+48, key+12);
+                    acc += XXH3_mix16B(p+len-64, key+14);
+                }
+
+                acc += XXH3_mix16B(p+32, key+8);
+                acc += XXH3_mix16B(p+len-48, key+10);
+            }
+
+            acc += XXH3_mix16B(p+16, key+4);
+            acc += XXH3_mix16B(p+len-32, key+6);
+
+        }
+
+        acc += XXH3_mix16B(p+0, key+0);
+        acc += XXH3_mix16B(p+len-16, key+2);
+
+        return XXH64_avalanche2(acc);
+    }
+
+#else
+    return (XXH128_hash_t){ 0, 0 };
+#endif
+}
+
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128b(const void* data, size_t len)
+{
+    return XXH3_128b_withSeed(data, len, 0);
+}
+
+
+XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128b_withSeed(data, len, seed);
+}
+
 #endif  /* XXH3_H */

From 7558f18493484738ebcbbaa41fe0aa8d50038b51 Mon Sep 17 00:00:00 2001
From: "easyaspi314 (Devin)" <easyaspi314@users.noreply.github.com>
Date: Thu, 7 Mar 2019 17:26:49 -0500
Subject: [PATCH 38/73] Add improved 128-bit multiply routine for 32-bit
 targets and use long-multiply intrinsics

---
 xxh3.h | 147 ++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 109 insertions(+), 38 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 3b634e34..53785af7 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -57,7 +57,14 @@
 #  endif
 #endif
 
-
+/* U64 XXH_mult32to64(U32 a, U64 b) { return (U64)a * (U64)b; } */
+#ifdef _MSC_VER
+#   include <intrin.h>
+    /* MSVC doesn't do a good job with the mull detection. */
+#   define XXH_mult32to64 __emulu
+#else
+#   define XXH_mult32to64(x, y) ((U64)((x) & 0xFFFFFFFF) * (U64)((y) & 0xFFFFFFFF))
+#endif
 
 
 /* ==========================================
@@ -84,12 +91,15 @@ ALIGN(64) static const U32 kKey[KEYSET_DEFAULT_SIZE] = {
 };
 
 
-
-
-XXH_FORCE_INLINE U64
+#if defined(__GNUC__) && defined(__i386__)
+/* GCC is stupid and tries to vectorize this.
+ * This tells GCC that it is wrong. */
+__attribute__((__target__("no-sse")))
+#endif
+static U64
 XXH3_mul128(U64 ll1, U64 ll2)
 {
-#if defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+#if 0 && defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
 
     __uint128_t lll = (__uint128_t)ll1 * ll2;
     return (U64)lll + (U64)(lll >> 64);
@@ -101,34 +111,95 @@ XXH3_mul128(U64 ll1, U64 ll2)
     U64 const lllow = _umul128(ll1, ll2, &llhigh);
     return lllow + llhigh;
 
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) && defined(__GNUC__)
 
-    U64 const llow = ll1 * ll2;
+    U64 llow;
     U64 llhigh;
-    asm ("umulh %0, %1, %2" : "=r" (llhigh) : "r" (ll1), "r" (ll2));
-    return lllow + llhigh;
+    __asm__("umulh %0, %1, %2" : "=r" (llhigh) : "r" (ll1), "r" (ll2));
+    __asm__("madd  %0, %1, %2, %3" : "=r" (llow) : "r" (ll1), "r" (ll2), "r" (llhigh));
+    return lllow;
 
 #else
-
-    /* emulate 64x64x->128b multiplication, using four 32x32->64 */
-    U32 const h1 = ll1 >> 32;
-    U32 const h2 = ll2 >> 32;
-    U32 const l1 = (U32)ll1;
-    U32 const l2 = (U32)ll2;
-
-    U64 const llh  = (U64)h1 * h2;
-    U64 const llm1 = (U64)l1 * h2;
-    U64 const llm2 = (U64)l2 * h1;
-    U64 const lll  = (U64)l1 * l2;
-
-    U64 const t = lll + (llm1 << 32);
-    U64 const carry1 = t < lll;
-
-    U64 const lllow = t + (llm2 << 32);
-    U64 const carry2 = lllow < t;
-    U64 const llhigh = llh + (llm1 >> 32) + (llm2 >> 32) + carry1 + carry2;
-
-    return llhigh + lllow;
+    /* Do it out manually on 32-bit.
+     * This is a modified, unrolled, widened, and optimized version of the
+     * mulqdu routine from Hacker's Delight.
+     *
+     *   https://www.hackersdelight.org/hdcodetxt/mulqdu.c.txt
+     *
+     * This was modified to use U32->U64 multiplication instead
+     * of U16->U32, to add the high and low values in the end,
+     * be endian-independent, and I added a partial assembly
+     * implementation for ARM. */
+    U64 t;
+    U32 w[4] = { 0 };
+    U32 u[2] = { (U32)(ll1 >> 32), (U32)ll1 };
+    U32 v[2] = { (U32)(ll2 >> 32), (U32)ll2 };
+    U32 k;
+    /* An easy 128-bit folding multiply on ARMv6T2 and ARMv7-A/R can be done with
+     * the mighty umaal (Unsigned Multiply Accumulate Accumulate Long) which takes 4 cycles
+     * or less, doing a long multiply and adding two 32-bit integers:
+     *
+     *     void umaal(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm)
+     *     {
+     *         U64 prodAcc = (U64)Rn * (U64)Rm;
+     *         prodAcc += *RdLo;
+     *         prodAcc += *RdHi;
+     *         *RdLo = prodAcc & 0xFFFFFFFF;
+     *         *RdHi = prodAcc >> 32;
+     *     }
+     *
+     * This is compared to umlal which adds to a single 64-bit integer:
+     *
+     *     void umlal(U32 *RdLo, U32 *RdHi, U32 Rn, U32 Rm)
+     *     {
+     *         U64 prodAcc = (U64)Rn * (U64)Rm;
+     *         prodAcc += (*RdLo | ((U64)*RdHi << 32);
+     *         *RdLo = prodAcc & 0xFFFFFFFF;
+     *         *RdHi = prodAcc >> 32;
+     *     }
+     *
+     * Getting the compiler to emit them is like pulling teeth, and checking
+     * for it is annoying because ARMv7-M lacks this instruction. However, it
+     * is worth it, because this is an otherwise expensive operation. */
+
+     /* GCC-compatible, ARMv6t2 or ARMv7+, non-M variant, and 32-bit */
+#if defined(__GNUC__) /* GCC-compatible */ \
+    && defined(__ARM_ARCH) && !defined(__aarch64__) && !defined(__arm64__) /* 32-bit ARM */\
+    && !defined(__ARM_ARCH_7M__) /* <- Not ARMv7-M  vv*/ \
+        && !(defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM == 0 && __TARGET_ARCH_THUMB == 4) \
+    && (defined(__ARM_ARCH_6T2__) || __ARM_ARCH > 6) /* ARMv6T2 or later */
+    __asm__("umull %0, %1, %2, %3"
+            : "=r" (w[3]), "=r" (k)
+            : "r" (u[1]), "r" (v[1]));
+    __asm__("umaal %0, %1, %2, %3"
+            : "+r" (w[2]), "+r" (k)
+            : "r" (u[0]), "r" (v[1]));
+    w[1] = k;
+    k = 0;
+    __asm__("umaal %0, %1, %2, %3"
+            : "+r" (w[2]), "+r" (k)
+            : "r" (u[1]), "r" (v[0]));
+    __asm__("umaal %0, %1, %2, %3"
+            : "+r" (w[1]), "+r" (k)
+            : "r" (u[0]), "r" (v[0]));
+    w[0] = k;
+#else /* Portable scalar version */
+    k = 0;
+    t = XXH_mult32to64(u[1], v[1]);
+    w[3] = t & 0xFFFFFFFF;
+    k = t >> 32;
+    t = XXH_mult32to64(u[0], v[1]) + w[2] + k;
+    w[2] = t & 0xFFFFFFFF;
+    w[1] = t >> 32;
+
+    t = XXH_mult32to64(u[1], v[0]) + w[2];
+    w[2] = t & 0xFFFFFFFF;
+    k = t >> 32;
+    t = XXH_mult32to64(u[0], v[0]) + w[1] + k;
+    w[1] = t & 0xFFFFFFFF;
+    w[0] = t >> 32;
+#endif
+    return (w[1] | ((U64)w[0] << 32)) + (w[3] | ((U64)w[2] << 32));
 
 #endif
 }
@@ -158,7 +229,7 @@ XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t
         BYTE const c3 = ((const BYTE*)data)[len - 1];
         U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
         U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
-        U64  const ll3 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]);
+        U64  const ll3 = XXH_mult32to64((l1 + seed + key32[0]), (l2 + key32[1]));
         return XXH64_avalanche2(ll3);
     }
 }
@@ -173,7 +244,7 @@ XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t
         U64 acc = PRIME64_1 * (len + seed);
         U64 const l1 = XXH_read32(data) + key32[0];
         U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1];
-        acc += (U64)l1 * l2;
+        acc += XXH_mult32to64(l1, l2);
         return XXH64_avalanche2(acc);
     }
 }
@@ -313,7 +384,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
     for (i=0; i < (int)ACC_NB; i++) {
         int const left = 2*i;
         int const right= 2*i + 1;
-        xacc[i] += (xdata[left] + xkey[left]) * (U64)(xdata[right] + xkey[right]);
+        xacc[i] += XXH_mult32to64(xdata[left] + xkey[left], xdata[right] + xkey[right]);
     }
 
 #endif
@@ -412,8 +483,8 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
         xacc[i] ^= xacc[i] >> 47;
         xacc[i] ^= PRIME64_5;
 
-        {   U64 p1 = (xacc[i] >> 32) * xkey[left];
-            U64 p2 = (xacc[i] & 0xFFFFFFFF) * xkey[right];
+        {   U64 p1 = XXH_mult32to64(xacc[i] & 0xFFFFFFFF, xkey[left]);
+            U64 p2 = XXH_mult32to64(xacc[i] >> 32, xkey[right]);
             xacc[i] = p1 ^ p2;
     }   }
 
@@ -551,8 +622,8 @@ XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_
         BYTE const c3 = ((const BYTE*)data)[len - 1];
         U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
         U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
-        U64  const ll1 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]);
-        U64  const ll2 = (U64)(l1 - seed + key32[2]) * (l2 + key32[3]);
+        U64  const ll1 = XXH_mult32to64(l1 + seed + key32[0], l2 + key32[1]);
+        U64  const ll2 = XXH_mult32to64(l1 - seed + key32[2], l2 + key32[3]);
         return (XXH128_hash_t) { XXH64_avalanche2(ll1), XXH64_avalanche2(ll2) };
     }
 }
@@ -568,8 +639,8 @@ XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_
         U64 acc2 = PRIME64_2 * ((U64)len - seed);
         U64 const l1 = XXH_read32(data) + key32[0];
         U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1];
-        acc1 += (U64)(l1 + key32[0]) * (l2 + key32[1]);
-        acc2 += (U64)(l1 + key32[2]) * (l2 + key32[3]);
+        acc1 += XXH_mult32to64(l1 + key32[0], l2 + key32[1]);
+        acc2 += XXH_mult32to64(l1 + key32[2], l2 + key32[3]);
         return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) };
     }
 }

From 97952e90295885d9d8e127d91aa199a876c22981 Mon Sep 17 00:00:00 2001
From: "easyaspi314 (Devin)" <easyaspi314@users.noreply.github.com>
Date: Thu, 7 Mar 2019 17:29:26 -0500
Subject: [PATCH 39/73] Workaround for Clang vectorization bug

Inline assembly fences are the only thing I have found that will
prevent Clang from vectorizing XXH32. I explained it in a lot of
detail.
---
 Makefile | 10 +---------
 xxhash.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/Makefile b/Makefile
index 25a5dfdc..ddb2bb96 100644
--- a/Makefile
+++ b/Makefile
@@ -33,15 +33,7 @@ LIBVER_MINOR := $(shell echo $(LIBVER_MINOR_SCRIPT))
 LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT))
 LIBVER := $(LIBVER_MAJOR).$(LIBVER_MINOR).$(LIBVER_PATCH)
 
-# SSE4 detection
-HAVE_SSE4 := $(shell $(CC) -dM -E - < /dev/null | grep "SSE4" > /dev/null && echo 1 || echo 0)
-ifeq ($(HAVE_SSE4), 1)
-NOSSE4 := -mno-sse4
-else
-NOSSE4 :=
-endif
-
-CFLAGS ?= -O3 $(NOSSE4) # disables potential auto-vectorization
+CFLAGS ?= -O3
 DEBUGFLAGS+=-Wall -Wextra -Wconversion -Wcast-qual -Wcast-align -Wshadow \
             -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
             -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
diff --git a/xxhash.c b/xxhash.c
index 02f5cd53..cba16f07 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -267,12 +267,56 @@ static const U32 PRIME32_3 = 3266489917U;   /* 0b1100001010110010101011100011110
 static const U32 PRIME32_4 =  668265263U;   /* 0b00100111110101001110101100101111 */
 static const U32 PRIME32_5 =  374761393U;   /* 0b00010110010101100110011110110001 */
 
-static U32 XXH32_round(U32 seed, U32 input)
+static U32 XXH32_round(U32 acc, U32 input)
 {
-    seed += input * PRIME32_2;
-    seed  = XXH_rotl32(seed, 13);
-    seed *= PRIME32_1;
-    return seed;
+    acc += input * PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= PRIME32_1;
+#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /* UGLY HACK:
+     * This inline assembly hack forces acc into a normal register. This is the
+     * only thing that prevents GCC and Clang from autovectorizing the XXH32 loop
+     * (pragmas and attributes don't work for some reason) without globally
+     * disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on newer chips!)
+     *   making it slightly slower to multiply four integers at once compared to four
+     *   integers independently. Even when pmulld was fastest, Sandy/Ivy Bridge, it is
+     *   still not worth it to go into SSE just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp,  v // not required with VEX encoding
+     *      pslld  tmp, 13 // tmp <<= 13
+     *      psrld  v,   19 // v >>= 19
+     *      por    v,  tmp // v |= tmp
+     *   compared to one for scalar:
+     *      roll   v, 13    // reliably fast across the board
+     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because the
+     *   SIMD actually serializes this operation: While v1 is rotating, v2 can load data,
+     *   while v3 can multiply. SSE forces them to operate together.
+     *
+     * How this hack works:
+     * __asm__(""       // Declare an assembly block but don't declare any instructions
+     *          :       // However, as an Input/Output Operand,
+     *          "+r"    // constrain a read/write operand (+) as a general purpose register (r).
+     *          (acc)   // and set acc as the operand
+     * );
+     *
+     * Because of the 'r', the compiler has promised that acc will be in a
+     * general purpose register and the '+' says that it will be 'read/write',
+     * so it has to assume it has changed. It is like volatile without all the
+     * loads and stores.
+     *
+     * Since the argument has to be in a normal register (not an SSE register),
+     * each time XXH32_round is called, it is impossible to vectorize. */
+    __asm__("" : "+r" (acc));
+#endif
+    return acc;
 }
 
 /* mix all bits */

From 1b78d030aa7ee847a4b4234cef53a733e2e5a276 Mon Sep 17 00:00:00 2001
From: "easyaspi314 (Devin)" <easyaspi314@users.noreply.github.com>
Date: Thu, 7 Mar 2019 17:32:18 -0500
Subject: [PATCH 40/73] Remove comment about the bug because it is fixed now

---
 xxhash.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/xxhash.c b/xxhash.c
index cba16f07..c5fec9b0 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -414,17 +414,6 @@ XXH32_endian_align(const void* input, size_t len, U32 seed,
         U32 v3 = seed + 0;
         U32 v4 = seed - PRIME32_1;
 
-        /* note : clang will try to vectorize this loop, using pmulld instruction.
-         * This is a bad idea, and will result in substantial performance reduction.
-         * To prevent clang from "optimizing" this loop,
-         * it's necessary to disable SSE4 on command line (-mno-sse4).
-         * However, this is a build instruction, so it's outside of source code.
-         * Whenever xxhash.c is used in a different code base, build flags don't follow.
-         * It would be better to ensure vectorization is disabled from within the source code.
-         * Alas, so far, I've not found a working method.
-         * I tried both `#pragma` and `__attribute__`, but clang still vectorizes.
-         * Help welcomed.
-         * In the meantime, vectorization is prevented by the `Makefile` */
         do {
             v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4;
             v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;

From 02d0ba79a01384e887fe4d976c5a159b698b292a Mon Sep 17 00:00:00 2001
From: "easyaspi314 (Devin)" <easyaspi314@users.noreply.github.com>
Date: Thu, 7 Mar 2019 19:51:39 -0500
Subject: [PATCH 41/73] Remove preprocessor statement leftover from testing

What '0 &&' ? No idea what you are talking about...
---
 xxh3.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xxh3.h b/xxh3.h
index 53785af7..fac97808 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -99,7 +99,7 @@ __attribute__((__target__("no-sse")))
 static U64
 XXH3_mul128(U64 ll1, U64 ll2)
 {
-#if 0 && defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+#if defined(__SIZEOF_INT128__) || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
 
     __uint128_t lll = (__uint128_t)ll1 * ll2;
     return (U64)lll + (U64)(lll >> 64);

From 4f4f63c73b86e57f49ed89da08b0943f288ade58 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Fri, 8 Mar 2019 15:37:06 -0500
Subject: [PATCH 42/73] modified xxh128 so that low part == xxh3_64b

---
 xxh3.h | 96 ++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 56 insertions(+), 40 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 3b634e34..64fac8a4 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -110,7 +110,7 @@ XXH3_mul128(U64 ll1, U64 ll2)
 
 #else
 
-    /* emulate 64x64x->128b multiplication, using four 32x32->64 */
+    /* emulate 64x64->128b multiplication, using four 32x32->64 */
     U32 const h1 = ll1 >> 32;
     U32 const h2 = ll2 >> 32;
     U32 const l1 = (U32)ll1;
@@ -158,8 +158,8 @@ XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t
         BYTE const c3 = ((const BYTE*)data)[len - 1];
         U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
         U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
-        U64  const ll3 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]);
-        return XXH64_avalanche2(ll3);
+        U64  const ll11 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]);
+        return XXH64_avalanche2(ll11);
     }
 }
 
@@ -171,8 +171,8 @@ XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t
     assert(len >= 4 && len <= 8);
     {   const U32* const key32 = (const U32*) keyPtr;
         U64 acc = PRIME64_1 * (len + seed);
-        U64 const l1 = XXH_read32(data) + key32[0];
-        U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1];
+        U32 const l1 = XXH_read32(data) + key32[0];
+        U32 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1];
         acc += (U64)l1 * l2;
         return XXH64_avalanche2(acc);
     }
@@ -434,11 +434,11 @@ XXH_FORCE_INLINE U64 XXH3_mix16B(const void* data, const U64* key)
     return XXH3_mul128((XXH_read64(data) ^ key[0]), XXH_read64((const BYTE*)data+8) ^ key[1]);
 }
 
-static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 len)
+static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 start)
 {
     const U64* const key = (const U64*)keyVoid;  /* presumed aligned */
 
-    U64 acc = PRIME64_1 * len;
+    U64 acc = start;
     acc += XXH3_mix16B(data+0, key+0);
     acc += XXH3_mix16B(data+2, key+2);
     acc += XXH3_mix16B(data+4, key+4);
@@ -447,11 +447,9 @@ static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 len)
     return XXH64_avalanche2(acc);
 }
 
-__attribute__((noinline)) static XXH64_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
-XXH3_hashLong(const void* data, size_t len, XXH64_hash_t seed)
+static void
+XXH3_hashLong(U64* acc, const void* data, size_t len)
 {
-    ALIGN(64) U64 acc[ACC_NB] = { seed, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5, -seed };
-
     #define NB_KEYS ((KEYSET_DEFAULT_SIZE - STRIPE_ELTS) / 2)
 
     size_t const block_len = STRIPE_LEN * NB_KEYS;
@@ -474,15 +472,21 @@ XXH3_hashLong(const void* data, size_t len, XXH64_hash_t seed)
             const BYTE* const p = (const BYTE*) data + len - STRIPE_LEN;
             XXH3_accumulate_512(acc, p, kKey + nbStripes*2);
     }   }
+}
+
+__attribute__((noinline)) static XXH64_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_64b(const void* data, size_t len, XXH64_hash_t seed)
+{
+    ALIGN(64) U64 acc[ACC_NB] = { seed, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5, -seed, 0 };
+
+    XXH3_hashLong(acc, data, len);
 
     /* converge into final hash */
-    //return XXH3_finalMerge_8u64(acc[0] + len, acc[1], acc[2], acc[3], acc[4], acc[5], acc[6], acc[7] - len, PRIME64_2 + len*2);
     assert(sizeof(acc) == 64);
-    return XXH3_merge64B(acc, kKey, len);
+    return XXH3_merge64B(acc, kKey, (U64)len * PRIME64_1);
 }
 
 
-
 /* ==========================================
  * Public entry point
  * ========================================== */
@@ -499,7 +503,7 @@ XXH3_64b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
         if (len > 32) {
             if (len > 64) {
                 if (len > 96) {
-                    if (len > 128) return XXH3_hashLong(data, len, seed);
+                    if (len > 128) return XXH3_hashLong_64b(data, len, seed);
 
                     acc += XXH3_mix16B(p+48, key+12);
                     acc += XXH3_mix16B(p+len-64, key+14);
@@ -530,10 +534,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len)
 
 
 /* ==========================================
- * XXH3 128 bits
- * Not ready yet !
+ * XXH3 128 bits (=> XXH128)
  * ========================================== */
-
 typedef struct {
     XXH64_hash_t ll1;
     XXH64_hash_t ll2;
@@ -551,9 +553,9 @@ XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_
         BYTE const c3 = ((const BYTE*)data)[len - 1];
         U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
         U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
-        U64  const ll1 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]);
-        U64  const ll2 = (U64)(l1 - seed + key32[2]) * (l2 + key32[3]);
-        return (XXH128_hash_t) { XXH64_avalanche2(ll1), XXH64_avalanche2(ll2) };
+        U64  const ll11 = (U64)(l1 + seed + key32[0]) * (l2 + key32[1]);
+        U64  const ll12 = (U64)(l1 - seed + key32[2]) * (l2 + key32[3]);
+        return (XXH128_hash_t) { XXH64_avalanche2(ll11), XXH64_avalanche2(ll12) };
     }
 }
 
@@ -566,8 +568,8 @@ XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_
     {   const U32* const key32 = (const U32*) keyPtr;
         U64 acc1 = PRIME64_1 * ((U64)len + seed);
         U64 acc2 = PRIME64_2 * ((U64)len - seed);
-        U64 const l1 = XXH_read32(data) + key32[0];
-        U64 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1];
+        U32 const l1 = XXH_read32(data);
+        U32 const l2 = XXH_read32((const BYTE*)data + len - 4);
         acc1 += (U64)(l1 + key32[0]) * (l2 + key32[1]);
         acc2 += (U64)(l1 + key32[2]) * (l2 + key32[3]);
         return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) };
@@ -603,43 +605,57 @@ XXH3_len_0to16_128b(const void* data, size_t len, XXH64_hash_t seed)
     }
 }
 
+__attribute__((noinline)) static XXH128_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_128b(const void* data, size_t len, XXH64_hash_t seed)
+{
+    ALIGN(64) U64 acc[ACC_NB] = { seed, PRIME64_1, PRIME64_2, PRIME64_3, PRIME64_4, PRIME64_5, -seed, 0 };
+    assert(len > 128);
+
+    XXH3_hashLong(acc, data, len);
+
+    /* converge into final hash */
+    assert(sizeof(acc) == 64);
+    {   U64 const part1 = XXH3_merge64B(acc, kKey, (U64)len * PRIME64_1);
+        U64 const part2 = XXH3_merge64B(acc, kKey+16, ((U64)len+1) * PRIME64_2);
+        return (XXH128_hash_t) { part1, part2 };
+    }
+}
+
 XXH_PUBLIC_API XXH128_hash_t
 XXH3_128b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
 {
     if (len <= 16) return XXH3_len_0to16_128b(data, len, seed);
 
-#if 0
-
-    {   U64 acc = PRIME64_1 * (len + seed);
+    {   U64 acc1 = PRIME64_1 * (len + seed);
+        U64 acc2 = 0;
         const BYTE* const p = (const BYTE*)data;
         const U64* const key = (const U64*)(const void*)kKey;
         if (len > 32) {
             if (len > 64) {
                 if (len > 96) {
-                    if (len > 128) return XXH3_hashLong(data, len, seed);
+                    if (len > 128) return XXH3_hashLong_128b(data, len, seed);
 
-                    acc += XXH3_mix16B(p+48, key+12);
-                    acc += XXH3_mix16B(p+len-64, key+14);
+                    acc1 += XXH3_mix16B(p+48, key+12);
+                    acc2 += XXH3_mix16B(p+len-64, key+14);
                 }
 
-                acc += XXH3_mix16B(p+32, key+8);
-                acc += XXH3_mix16B(p+len-48, key+10);
+                acc1 += XXH3_mix16B(p+32, key+8);
+                acc2 += XXH3_mix16B(p+len-48, key+10);
             }
 
-            acc += XXH3_mix16B(p+16, key+4);
-            acc += XXH3_mix16B(p+len-32, key+6);
+            acc1 += XXH3_mix16B(p+16, key+4);
+            acc2 += XXH3_mix16B(p+len-32, key+6);
 
         }
 
-        acc += XXH3_mix16B(p+0, key+0);
-        acc += XXH3_mix16B(p+len-16, key+2);
+        acc1 += XXH3_mix16B(p+0, key+0);
+        acc2 += XXH3_mix16B(p+len-16, key+2);
 
-        return XXH64_avalanche2(acc);
+        {   U64 const part1 = acc1 + acc2;
+            U64 const part2 = (acc1 * PRIME64_3) + (acc2 * PRIME64_4) + ((len - seed) * PRIME64_2);
+            return (XXH128_hash_t) { XXH64_avalanche2(part1), -XXH64_avalanche2(part2) };
+        }
     }
-
-#else
-    return (XXH128_hash_t){ 0, 0 };
-#endif
 }
 
 

From 2afd24d8bb564eba44543f617118a5e85137b1a9 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Fri, 8 Mar 2019 16:03:24 -0500
Subject: [PATCH 43/73] xxh128: minor modifications to improve bias

1.4% => 0.6%
---
 xxh3.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 55bc12a0..40c1f78e 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -634,7 +634,7 @@ XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_
         U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
         U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
         U64  const ll11 = XXH_mult32to64(l1 + seed + key32[0], l2 + key32[1]);
-        U64  const ll12 = XXH_mult32to64(l1 - seed + key32[2], l2 + key32[3]);
+        U64  const ll12 = XXH_mult32to64(l1 + key32[2], l2 - seed + key32[3]);
         return (XXH128_hash_t) { XXH64_avalanche2(ll11), XXH64_avalanche2(ll12) };
     }
 }
@@ -651,7 +651,7 @@ XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_
         U32 const l1 = XXH_read32(data);
         U32 const l2 = XXH_read32((const BYTE*)data + len - 4);
         acc1 += XXH_mult32to64(l1 + key32[0], l2 + key32[1]);
-        acc2 += XXH_mult32to64(l1 + key32[2], l2 + key32[3]);
+        acc2 += XXH_mult32to64(l1 - key32[2], l2 + key32[3]);
         return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) };
     }
 }

From c5953f132c548a0413a4635c8ecc6d801f04f75a Mon Sep 17 00:00:00 2001
From: "easyaspi314 (Devin)" <easyaspi314@users.noreply.github.com>
Date: Fri, 8 Mar 2019 22:07:08 -0500
Subject: [PATCH 44/73] Add unroll pragma for Clang in XXH3_accumulate.

Clang doesn't unroll the XXH3_accumulate loop for some reason. Using
`#pragma clang loop unroll(enable)` to hint to Clang that it should
unroll results in a huge 1.4-1.5x speedup.

Before: 15 GB/s
After:  21 GB/s
---
 xxh3.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/xxh3.h b/xxh3.h
index 40c1f78e..ee636305 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -503,6 +503,11 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
 static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* restrict key, size_t nbStripes)
 {
     size_t n;
+
+/* Clang doesn't unroll this loop without the pragma. Unrolling results in code that is about 1.4x faster. */
+#if defined(__clang__) && !defined(__OPTIMIZE_SIZE__)
+#  pragma clang loop unroll(enable)
+#endif
     for (n = 0; n < nbStripes; n++ ) {
         XXH3_accumulate_512(acc, (const BYTE*)data + n*STRIPE_LEN, key);
         key += 2;

From 60215c5bfb81518eec16afa122af457681b68d03 Mon Sep 17 00:00:00 2001
From: "easyaspi314 (Devin)" <easyaspi314@users.noreply.github.com>
Date: Fri, 8 Mar 2019 22:26:25 -0500
Subject: [PATCH 45/73] Fix typo causing build failure on 32-bit

---
 xxh3.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index ee636305..724cb1a3 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -197,9 +197,9 @@ XXH3_mul128(U64 ll1, U64 ll2)
     U32 const l2 = (U32)ll2;
 
     U64 const llh  = XXH_mult32to64(h1, h2);
-    U64 const llm1 = XXH_mult32to64(l1, h2;
-    U64 const llm2 = XXH_mult32to64(h1, l2;
-    U64 const lll  = XXH_mult32to64(l1, l2;
+    U64 const llm1 = XXH_mult32to64(l1, h2);
+    U64 const llm2 = XXH_mult32to64(h1, l2);
+    U64 const lll  = XXH_mult32to64(l1, l2);
 
     U64 const t = lll + (llm1 << 32);
     U64 const carry1 = t < lll;

From a5d5bf778f631412fd14a466457878f1a53eace2 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Fri, 8 Mar 2019 22:32:11 -0500
Subject: [PATCH 46/73] improve algorithm by compensating UMAC deficiency

no longer possible to nullify one member through another
---
 xxh3.h | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 40c1f78e..50d7dbb5 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -197,9 +197,9 @@ XXH3_mul128(U64 ll1, U64 ll2)
     U32 const l2 = (U32)ll2;
 
     U64 const llh  = XXH_mult32to64(h1, h2);
-    U64 const llm1 = XXH_mult32to64(l1, h2;
-    U64 const llm2 = XXH_mult32to64(h1, l2;
-    U64 const lll  = XXH_mult32to64(l1, l2;
+    U64 const llm1 = XXH_mult32to64(l1, h2);
+    U64 const llm2 = XXH_mult32to64(h1, l2);
+    U64 const lll  = XXH_mult32to64(l1, l2);
 
     U64 const t = lll + (llm1 << 32);
     U64 const carry1 = t < lll;
@@ -308,8 +308,9 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
             __m256i const d   = _mm256_loadu_si256 (xdata+i);
             __m256i const k   = _mm256_loadu_si256 (xkey+i);
             __m256i const dk  = _mm256_add_epi32 (d,k);                                  /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
-            __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk,0x31));   /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */
-            xacc[i]           = _mm256_add_epi64(res, xacc[i]);                          /* xacc must be aligned on 32 bytes boundaries */
+            __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31));   /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */
+            xacc[i]  = _mm256_add_epi64(res, xacc[i]);
+            xacc[i]  = _mm256_add_epi32(d, xacc[i]);
         }
     }
 
@@ -324,13 +325,14 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
         for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
             __m128i const d   = _mm_loadu_si128 (xdata+i);
             __m128i const k   = _mm_loadu_si128 (xkey+i);
-            __m128i const dk  = _mm_add_epi32 (d,k);                               /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
-            __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk,0x31));   /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
-            xacc[i]           = _mm_add_epi64(res, xacc[i]);                       /* xacc must be aligned on 16 bytes boundaries */
+            __m128i const dk  = _mm_add_epi32 (d,k);                                 /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+            __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk, 0x31));    /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
+            xacc[i]  = _mm_add_epi64(res, xacc[i]);
+            xacc[i]  = _mm_add_epi32(d, xacc[i]);
         }
     }
 
-#elif (XXH_VECTOR == XXH_NEON)
+#elif (XXH_VECTOR == XXH_NEON)  /* note : no longer correct, must be updated to match new formula */
 
     assert(((size_t)acc) & 15 == 0);
     {                 uint64x2_t* const xacc  = (uint64x2_t *)acc;
@@ -394,6 +396,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
         int const left = 2*i;
         int const right= 2*i + 1;
         xacc[i] += XXH_mult32to64(xdata[left] + xkey[left], xdata[right] + xkey[right]);
+        xacc[i] += xdata[left] + ((U64)xdata[right] << 32);
     }
 
 #endif
@@ -407,13 +410,10 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
     {   __m256i* const xacc = (__m256i*) acc;
         const __m256i* const xkey  = (const __m256i *) key;
 
-        __m256i const xor_p5 = _mm256_set1_epi64x(PRIME64_5);
-
         for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
             __m256i data = xacc[i];
             __m256i const shifted = _mm256_srli_epi64(data, 47);
             data = _mm256_xor_si256(data, shifted);
-            data = _mm256_xor_si256(data, xor_p5);
 
             {   __m256i const k   = _mm256_loadu_si256 (xkey+i);
                 __m256i const dk  = _mm256_mul_epu32 (data,k);          /* U32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
@@ -422,7 +422,7 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
                 __m256i const k2  = _mm256_shuffle_epi32 (k,0x31);
                 __m256i const dk2 = _mm256_mul_epu32 (d2,k2);           /* U32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
 
-                xacc[i] = _mm256_xor_si256(dk, dk2);
+                xacc[i]  = _mm256_xor_si256(dk, dk2);
         }   }
     }
 
@@ -431,27 +431,25 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
     assert(((size_t)acc) & 15 == 0);
     {   __m128i* const xacc = (__m128i*) acc;
         const __m128i* const xkey  = (const __m128i *) key;
-        __m128i const xor_p5 = _mm_set1_epi64((__m64)PRIME64_5);
 
         size_t i;
         for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
             __m128i data = xacc[i];
             __m128i const shifted = _mm_srli_epi64(data, 47);
             data = _mm_xor_si128(data, shifted);
-            data = _mm_xor_si128(data, xor_p5);
 
             {   __m128i const k   = _mm_loadu_si128 (xkey+i);
-                __m128i const dk  = _mm_mul_epu32 (data,k);          /* U32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+                __m128i const dk  = _mm_mul_epu32 (data,k);
 
-                __m128i const d2  = _mm_shuffle_epi32 (data,0x31);
-                __m128i const k2  = _mm_shuffle_epi32 (k,0x31);
-                __m128i const dk2 = _mm_mul_epu32 (d2,k2);           /* U32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
+                __m128i const d2  = _mm_shuffle_epi32 (data, 0x31);
+                __m128i const k2  = _mm_shuffle_epi32 (k, 0x31);
+                __m128i const dk2 = _mm_mul_epu32 (d2,k2);
 
-                xacc[i] = _mm_xor_si128(dk, dk2);
+                xacc[i]  = _mm_xor_si128(dk, dk2);
         }   }
     }
 
-#elif (XXH_VECTOR == XXH_NEON)
+#elif (XXH_VECTOR == XXH_NEON)   /* note : no longer correct, must be updated to match new formula */
 
     assert(((size_t)acc) & 15 == 0);
     {       uint64x2_t* const xacc =       (uint64x2_t*) acc;

From ed0dbb8fdd97c7ef1c51b5338ac8086dc9d789c4 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Fri, 8 Mar 2019 23:59:02 -0500
Subject: [PATCH 47/73] ensure xxhash.c and xxhsum.c are recompiled

when their headers change
---
 Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile b/Makefile
index ddb2bb96..1b5ee6a4 100644
--- a/Makefile
+++ b/Makefile
@@ -82,6 +82,10 @@ xxhsum32: CFLAGS += -m32
 xxhsum32: xxhash.c xxhsum.c
 	$(CC) $(FLAGS) $^ $(LDFLAGS) -o $@$(EXT)
 
+xxhash.o: xxhash.h xxh3.h
+
+xxhsum.o: xxhash.h
+
 .PHONY: xxhsum_and_links
 xxhsum_and_links: xxhsum xxh32sum xxh64sum
 

From 2010b7e7de7a9ef540c43b0e9bdb8c5e773600aa Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Sat, 9 Mar 2019 00:19:40 -0500
Subject: [PATCH 48/73] fixed addition discrepancy between scalar and vector
 code

let's both have a 64-bit addition with carry
---
 xxh3.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 5dbec179..866d5131 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -310,7 +310,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
             __m256i const dk  = _mm256_add_epi32 (d,k);                                  /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
             __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31));   /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */
             xacc[i]  = _mm256_add_epi64(res, xacc[i]);
-            xacc[i]  = _mm256_add_epi32(d, xacc[i]);
+            xacc[i]  = _mm256_add_epi64(d, xacc[i]);
         }
     }
 
@@ -328,7 +328,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
             __m128i const dk  = _mm_add_epi32 (d,k);                                 /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
             __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk, 0x31));    /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
             xacc[i]  = _mm_add_epi64(res, xacc[i]);
-            xacc[i]  = _mm_add_epi32(d, xacc[i]);
+            xacc[i]  = _mm_add_epi64(d, xacc[i]);
         }
     }
 
@@ -501,11 +501,6 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
 static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* restrict key, size_t nbStripes)
 {
     size_t n;
-
-/* Clang doesn't unroll this loop without the pragma. Unrolling results in code that is about 1.4x faster. */
-#if defined(__clang__) && !defined(__OPTIMIZE_SIZE__)
-#  pragma clang loop unroll(enable)
-#endif
     for (n = 0; n < nbStripes; n++ ) {
         XXH3_accumulate_512(acc, (const BYTE*)data + n*STRIPE_LEN, key);
         key += 2;

From 638993f16b29c00346bfedf8e5b24d37dee50b12 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 11 Mar 2019 15:09:27 -0700
Subject: [PATCH 49/73] added consistency tests for XXH3_64b

validated against SSE2 path
---
 xxh3.h   | 14 +++++-----
 xxhash.h | 38 +++++++++++++++++----------
 xxhsum.c | 78 ++++++++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 90 insertions(+), 40 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 866d5131..63a279fb 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -570,7 +570,7 @@ XXH3_hashLong_64b(const void* data, size_t len, XXH64_hash_t seed)
  * ========================================== */
 
 XXH_PUBLIC_API XXH64_hash_t
-XXH3_64b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
+XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
 {
     const BYTE* const p = (const BYTE*)data;
     const U64* const key = (const U64*)(const void*)kKey;
@@ -604,9 +604,9 @@ XXH3_64b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
 }
 
 
-XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len)
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len)
 {
-    return XXH3_64b_withSeed(data, len, 0);
+    return XXH3_64bits_withSeed(data, len, 0);
 }
 
 
@@ -700,7 +700,7 @@ XXH3_hashLong_128b(const void* data, size_t len, XXH64_hash_t seed)
 }
 
 XXH_PUBLIC_API XXH128_hash_t
-XXH3_128b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
+XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
 {
     if (len <= 16) return XXH3_len_0to16_128b(data, len, seed);
 
@@ -737,15 +737,15 @@ XXH3_128b_withSeed(const void* data, size_t len, XXH64_hash_t seed)
 }
 
 
-XXH_PUBLIC_API XXH128_hash_t XXH3_128b(const void* data, size_t len)
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len)
 {
-    return XXH3_128b_withSeed(data, len, 0);
+    return XXH3_128bits_withSeed(data, len, 0);
 }
 
 
 XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed)
 {
-    return XXH3_128b_withSeed(data, len, seed);
+    return XXH3_128bits_withSeed(data, len, seed);
 }
 
 #endif  /* XXH3_H */
diff --git a/xxhash.h b/xxhash.h
index 1782789e..5b887223 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -158,8 +158,8 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 *  Version
 ***************************************/
 #define XXH_VERSION_MAJOR    0
-#define XXH_VERSION_MINOR    6
-#define XXH_VERSION_RELEASE  6
+#define XXH_VERSION_MINOR    7
+#define XXH_VERSION_RELEASE  0
 #define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
 XXH_PUBLIC_API unsigned XXH_versionNumber (void);
 
@@ -249,18 +249,6 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t
 XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
 
 
-/*-**********************************************************************
-*  XXH3
-*  New experimental hash
-************************************************************************/
-
-#ifdef XXH_NAMESPACE
-#  define XXH3_64b XXH_NAME2(XXH_NAMESPACE, XXH3_64b)
-#endif
-
-XXH_PUBLIC_API XXH64_hash_t XXH3_64b(const void* data, size_t len);
-
-
 #endif  /* XXH_NO_LONG_LONG */
 
 
@@ -336,10 +324,32 @@ struct XXH64_state_s {
 # endif
 
 
+/*-**********************************************************************
+*  XXH3
+*  New experimental hash
+************************************************************************/
+
+#ifdef XXH_NAMESPACE
+#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+#endif
+
+/* note : variant without seed produces same result as variant with seed == 0 */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, unsigned long long seed);
+
+
+
+
+/*-**********************************************************************
+*  XXH_INLINE_ALL
+************************************************************************/
 #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
 #  include "xxhash.c"   /* include xxhash function bodies as `static`, for inlining */
 #endif
 
+
+
 #endif /* XXH_STATIC_LINKING_ONLY */
 
 
diff --git a/xxhsum.c b/xxhsum.c
index 657cf783..7428b62d 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -265,7 +265,7 @@ static U32 localXXH32(const void* buffer, size_t bufferSize, U32 seed) { return
 
 static U32 localXXH64(const void* buffer, size_t bufferSize, U32 seed) { return (U32)XXH64(buffer, bufferSize, seed); }
 
-static U32 localXXH3_64b(const void* buffer, size_t bufferSize, U32 seed) { (void)seed; return (U32)XXH3_64b(buffer, bufferSize); }
+static U32 localXXH3_64b(const void* buffer, size_t bufferSize, U32 seed) { (void)seed; return (U32)XXH3_64bits(buffer, bufferSize); }
 
 static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, size_t bufferSize)
 {
@@ -406,7 +406,6 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific
 }
 
 
-
 static int BMK_benchInternal(size_t keySize, U32 specificTest)
 {
     void* const buffer = calloc(keySize+16+3, 1);
@@ -434,32 +433,34 @@ static int BMK_benchInternal(size_t keySize, U32 specificTest)
 }
 
 
-static void BMK_checkResult(U32 r1, U32 r2)
+/* ************************************************
+ * Self-test :
+ * ensure results consistency accross platforms
+ *********************************************** */
+
+static void BMK_checkResult32(U32 r1, U32 r2)
 {
     static int nbTests = 1;
-    if (r1==r2) {
-        DISPLAYLEVEL(3, "\rTest%3i : %08X == %08X   ok   ", nbTests, r1, r2);
-    } else {
-        DISPLAY("\rERROR : Test%3i : %08X <> %08X   !!!!!   \n", nbTests, r1, r2);
+    if (r1!=r2) {
+        DISPLAY("\rERROR : Test%3i : 0x%08X <> 0x%08X   !!!!!   \n", nbTests, r1, r2);
         exit(1);
     }
     nbTests++;
 }
 
-
 static void BMK_checkResult64(U64 r1, U64 r2)
 {
     static int nbTests = 1;
     if (r1!=r2) {
         DISPLAY("\rERROR : Test%3i : 64-bit values non equals   !!!!!   \n", nbTests);
-        DISPLAY("\r %08X%08X != %08X%08X \n", (U32)(r1>>32), (U32)r1, (U32)(r2>>32), (U32)r2);
+        DISPLAY("\r 0x%08X%08XULL != 0x%08X%08XULL \n", (U32)(r1>>32), (U32)r1, (U32)(r2>>32), (U32)r2);
         exit(1);
     }
     nbTests++;
 }
 
 
-static void BMK_testSequence64(void* sentence, size_t len, U64 seed, U64 Nresult)
+static void BMK_testSequence64(const void* sentence, size_t len, U64 seed, U64 Nresult)
 {
     XXH64_state_t state;
     U64 Dresult;
@@ -475,11 +476,23 @@ static void BMK_testSequence64(void* sentence, size_t len, U64 seed, U64 Nresult
 
     (void)XXH64_reset(&state, seed);
     for (pos=0; pos<len; pos++)
-        (void)XXH64_update(&state, ((char*)sentence)+pos, 1);
+        (void)XXH64_update(&state, ((const char*)sentence)+pos, 1);
     Dresult = XXH64_digest(&state);
     BMK_checkResult64(Dresult, Nresult);
 }
 
+static void BMK_testXXH3(const void* data, size_t len, U64 seed, U64 Nresult)
+{
+    {   U64 const Dresult = XXH3_64bits_withSeed(data, len, seed);
+        BMK_checkResult64(Dresult, Nresult);
+    }
+
+    /* check that the no-seed variant produces same result as seed==0 */
+    if (seed == 0) {
+        U64 const Dresult = XXH3_64bits(data, len);
+        BMK_checkResult64(Dresult, Nresult);
+    }
+}
 
 static void BMK_testSequence(const void* sequence, size_t len, U32 seed, U32 Nresult)
 {
@@ -488,22 +501,22 @@ static void BMK_testSequence(const void* sequence, size_t len, U32 seed, U32 Nre
     size_t pos;
 
     Dresult = XXH32(sequence, len, seed);
-    BMK_checkResult(Dresult, Nresult);
+    BMK_checkResult32(Dresult, Nresult);
 
     (void)XXH32_reset(&state, seed);
     (void)XXH32_update(&state, sequence, len);
     Dresult = XXH32_digest(&state);
-    BMK_checkResult(Dresult, Nresult);
+    BMK_checkResult32(Dresult, Nresult);
 
     (void)XXH32_reset(&state, seed);
     for (pos=0; pos<len; pos++)
         (void)XXH32_update(&state, ((const char*)sequence)+pos, 1);
     Dresult = XXH32_digest(&state);
-    BMK_checkResult(Dresult, Nresult);
+    BMK_checkResult32(Dresult, Nresult);
 }
 
 
-#define SANITY_BUFFER_SIZE 101
+#define SANITY_BUFFER_SIZE 2243
 static void BMK_sanityCheck(void)
 {
     static const U32 prime = 2654435761U;
@@ -522,8 +535,8 @@ static void BMK_sanityCheck(void)
     BMK_testSequence(sanityBuffer,  1, prime, 0xD5845D64);
     BMK_testSequence(sanityBuffer, 14, 0,     0xE5AA0AB4);
     BMK_testSequence(sanityBuffer, 14, prime, 0x4481951D);
-    BMK_testSequence(sanityBuffer, SANITY_BUFFER_SIZE, 0,     0x1F1AA412);
-    BMK_testSequence(sanityBuffer, SANITY_BUFFER_SIZE, prime, 0x498EC8E2);
+    BMK_testSequence(sanityBuffer,222, 0,     0xC8070816);
+    BMK_testSequence(sanityBuffer,222, prime, 0xF3CFC852);
 
     BMK_testSequence64(NULL        ,  0, 0,     0xEF46DB3751D8E999ULL);
     BMK_testSequence64(NULL        ,  0, prime, 0xAC75FDA2929B17EFULL);
@@ -531,8 +544,35 @@ static void BMK_sanityCheck(void)
     BMK_testSequence64(sanityBuffer,  1, prime, 0x739840CB819FA723ULL);
     BMK_testSequence64(sanityBuffer, 14, 0,     0xCFFA8DB881BC3A3DULL);
     BMK_testSequence64(sanityBuffer, 14, prime, 0x5B9611585EFCC9CBULL);
-    BMK_testSequence64(sanityBuffer, SANITY_BUFFER_SIZE, 0,     0x0EAB543384F878ADULL);
-    BMK_testSequence64(sanityBuffer, SANITY_BUFFER_SIZE, prime, 0xCAA65939306F1E21ULL);
+    BMK_testSequence64(sanityBuffer,222, 0,     0x9DD507880DEBB03DULL);
+    BMK_testSequence64(sanityBuffer,222, prime, 0xDC515172B8EE0600ULL);
+
+    BMK_testXXH3(NULL,           0, 0,     0);                      /* zero-length hash is the seed == 0 by default */
+    BMK_testXXH3(NULL,           0, prime, prime);
+    BMK_testXXH3(sanityBuffer,   1, 0,     0xE2C6D3B40D6F9203ULL);  /*  1 -  3 */
+    BMK_testXXH3(sanityBuffer,   1, prime, 0xCEE5DF124E6135DCULL);  /*  1 -  3 */
+    BMK_testXXH3(sanityBuffer,   6, 0,     0x585D6F8D1AAD96A2ULL);  /*  4 -  8 */
+    BMK_testXXH3(sanityBuffer,   6, prime, 0x133EC8CA1739250FULL);  /*  4 -  8 */
+    BMK_testXXH3(sanityBuffer,  12, 0,     0x0E85E122FE5356ACULL);  /*  9 - 16 */
+    BMK_testXXH3(sanityBuffer,  12, prime, 0xE0DB5E70DA67EB16ULL);  /*  9 - 16 */
+    BMK_testXXH3(sanityBuffer,  24, 0,     0x6C213B15B89230C9ULL);  /* 17 - 32 */
+    BMK_testXXH3(sanityBuffer,  24, prime, 0x71892DB847A8F53CULL);  /* 17 - 32 */
+    BMK_testXXH3(sanityBuffer,  48, 0,     0xECED834E8E99DA1EULL);  /* 33 - 64 */
+    BMK_testXXH3(sanityBuffer,  48, prime, 0xA901250B336F9133ULL);  /* 33 - 64 */
+    BMK_testXXH3(sanityBuffer,  80, 0,     0xC67B3A9C6D69E022ULL);  /* 65 - 96 */
+    BMK_testXXH3(sanityBuffer,  80, prime, 0x5054F266D6A65EE4ULL);  /* 65 - 96 */
+    BMK_testXXH3(sanityBuffer, 112, 0,     0x84B99B2137A264A5ULL);  /* 97 -128 */
+    BMK_testXXH3(sanityBuffer, 112, prime, 0xD6BF88A668E69F2AULL);  /* 97 -128 */
+    BMK_testXXH3(sanityBuffer, 192, 0,     0x6D96AC3F415CFCFEULL);  /* one block, finishing at stripe boundary */
+    BMK_testXXH3(sanityBuffer, 192, prime, 0xE4BD30AA1673B966ULL);  /* one block, finishing at stripe boundary */
+    BMK_testXXH3(sanityBuffer, 222, 0,     0xB62929C362EF3BF5ULL);  /* one block, last stripe is overlapping */
+    BMK_testXXH3(sanityBuffer, 222, prime, 0x2782C3C49E3FD25EULL);  /* one block, last stripe is overlapping */
+    BMK_testXXH3(sanityBuffer,2048, 0,     0x802EB54C97564FD7ULL);  /* 2 blocks, finishing at block boundary */
+    BMK_testXXH3(sanityBuffer,2048, prime, 0xC9F188CFAFDA22CDULL);  /* 2 blocks, finishing at block boundary */
+    BMK_testXXH3(sanityBuffer,2240, 0,     0x16B0035F6ABC1F46ULL);  /* 3 blocks, finishing at stripe boundary */
+    BMK_testXXH3(sanityBuffer,2240, prime, 0x389E68C2348B9161ULL);  /* 3 blocks, finishing at stripe boundary */
+    BMK_testXXH3(sanityBuffer,2243, 0,     0xE7C1890BDBD2B245ULL);  /* 3 blocks, last stripe is overlapping */
+    BMK_testXXH3(sanityBuffer,2243, prime, 0x3A68386AED0C50A7ULL);  /* 3 blocks, last stripe is overlapping */
 
     DISPLAYLEVEL(3, "\r%70s\r", "");       /* Clean display line */
     DISPLAYLEVEL(3, "Sanity check -- all tests ok\n");

From 405e49403cd6c53f1ece36307ce84c0cf22dc9ce Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 11 Mar 2019 15:40:01 -0700
Subject: [PATCH 50/73] xxh3: fixed scalar variant

scrambling stage wasn't updated to match new formula
---
 xxh3.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/xxh3.h b/xxh3.h
index 63a279fb..88b5fdf3 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -488,7 +488,6 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
         int const left = 2*i;
         int const right= 2*i + 1;
         xacc[i] ^= xacc[i] >> 47;
-        xacc[i] ^= PRIME64_5;
 
         {   U64 p1 = XXH_mult32to64(xacc[i] & 0xFFFFFFFF, xkey[left]);
             U64 p2 = XXH_mult32to64(xacc[i] >> 32, xkey[right]);

From feedac5ccab06168bb02e74051e5b502e18cffd2 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 12 Mar 2019 11:31:57 -0700
Subject: [PATCH 51/73] updated travis tests

to ensure results consistency across scalar/sse2/avx2 on x64/x86
---
 .travis.yml | 34 +++++++++++++++++++++++++---------
 Makefile    |  2 ++
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 3c37a826..29923cd2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,10 +1,26 @@
 language: c
-compiler: gcc
-script: make -B test-all
-before_install:
-  - sudo apt-get update  -qq
-  - sudo apt-get install -qq gcc-arm-linux-gnueabi
-  - sudo apt-get install -qq clang
-  - sudo apt-get install -qq g++-multilib
-  - sudo apt-get install -qq gcc-multilib
-  - sudo apt-get install -qq cppcheck
+
+matrix:
+  fast_finish: true
+  include:
+
+    - name: General linux tests (Xenial)
+      dist: xenial
+      script:
+        - make -B test-all
+      install:
+        - sudo apt-get update  -qq
+        - sudo apt-get install -qq
+            gcc-arm-linux-gnueabi
+            clang
+            g++-multilib
+            gcc-multilib
+            cppcheck
+
+    - name: check results consistency
+      script:
+        - CPPFLAGS=-DXXH_VECTOR=0 make check   # Scalar code
+        - make clean
+        - CPPFLAGS=-DXXH_VECTOR=1 make check   # SSE2 code path
+        - make clean
+        - CPPFLAGS="-mavx2 -DXXH_VECTOR=2" make check   # AVX2 code path
diff --git a/Makefile b/Makefile
index 1b5ee6a4..c0422d07 100644
--- a/Makefile
+++ b/Makefile
@@ -118,7 +118,9 @@ libxxhash : $(LIBXXH)
 lib: libxxhash.a libxxhash
 
 
+# =================================================
 # tests
+# =================================================
 
 .PHONY: check
 check: xxhsum

From c76d96454b5ec417872db364883014b61ca6f105 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 12 Mar 2019 11:44:44 -0700
Subject: [PATCH 52/73] xxh3: fixed declaration after statement in AVX2 path

also :
- added header license
- fixed alignment declaration
---
 xxh3.h | 92 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 67 insertions(+), 25 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 88b5fdf3..420c463e 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -1,3 +1,42 @@
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Development source file for `xxh3`
+   Copyright (C) 2019-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Note :
+   This file is separated for development purposes.
+   It will be integrated into `xxhash.c` when development phase is complete.
+*/
+
 #ifndef XXH3_H
 #define XXH3_H
 
@@ -300,26 +339,27 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
 #if (XXH_VECTOR == XXH_AVX2)
 
     assert(((size_t)acc) & 31 == 0);
-    {                   __m256i* const xacc  =       (__m256i *) acc;
-                  const __m256i* const xdata = (const __m256i *) data;
-        ALIGN(32) const __m256i* const xkey  = (const __m256i *) key;
+    {   ALIGN(32) __m256i* const xacc  =       (__m256i *) acc;
+        const     __m256i* const xdata = (const __m256i *) data;
+        const     __m256i* const xkey  = (const __m256i *) key;
 
-        for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
             __m256i const d   = _mm256_loadu_si256 (xdata+i);
             __m256i const k   = _mm256_loadu_si256 (xkey+i);
             __m256i const dk  = _mm256_add_epi32 (d,k);                                  /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
-            __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31));   /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */
-            xacc[i]  = _mm256_add_epi64(res, xacc[i]);
-            xacc[i]  = _mm256_add_epi64(d, xacc[i]);
+            __m256i const res = _mm256_mul_epu32 (dk, _mm256_shuffle_epi32 (dk, 0x31));  /* uint64 res[4] = {dk0*dk1, dk2*dk3, ...} */
+            __m256i const add = _mm256_add_epi64(d, xacc[i]);
+            xacc[i]  = _mm256_add_epi64(res, add);
         }
     }
 
 #elif (XXH_VECTOR == XXH_SSE2)
 
     assert(((size_t)acc) & 15 == 0);
-    {                   __m128i* const xacc  =       (__m128i *) acc;
-                  const __m128i* const xdata = (const __m128i *) data;
-        ALIGN(16) const __m128i* const xkey  = (const __m128i *) key;
+    {   ALIGN(16) __m128i* const xacc  =       (__m128i *) acc;
+        const     __m128i* const xdata = (const __m128i *) data;
+        const     __m128i* const xkey  = (const __m128i *) key;
 
         size_t i;
         for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
@@ -327,21 +367,21 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
             __m128i const k   = _mm_loadu_si128 (xkey+i);
             __m128i const dk  = _mm_add_epi32 (d,k);                                 /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
             __m128i const res = _mm_mul_epu32 (dk, _mm_shuffle_epi32 (dk, 0x31));    /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
-            xacc[i]  = _mm_add_epi64(res, xacc[i]);
-            xacc[i]  = _mm_add_epi64(d, xacc[i]);
+            __m128i const add = _mm_add_epi64(d, xacc[i]);
+            xacc[i]  = _mm_add_epi64(res, add);
         }
     }
 
 #elif (XXH_VECTOR == XXH_NEON)  /* note : no longer correct, must be updated to match new formula */
 
     assert(((size_t)acc) & 15 == 0);
-    {                 uint64x2_t* const xacc  = (uint64x2_t *)acc;
-                  const uint32_t* const xdata = (const uint32_t *)data;
-        ALIGN(16) const uint32_t* const xkey  = (const uint32_t *)key;
+    {         uint64x2_t* const xacc  =     (uint64x2_t *)acc;
+        const uint32_t* const xdata = (const uint32_t *)data;
+        const uint32_t* const xkey  = (const uint32_t *)key;
 
         size_t i;
         for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
-#if !defined(__aarch64__) && !defined(__arm64__) && !defined(XXH_NO_ARM32_HACK)
+#  if !defined(__aarch64__) && !defined(__arm64__) && !defined(XXH_NO_ARM32_HACK)
             /* On 32-bit ARM, we can take advantage of the packed registers.
              * This is not portable to aarch64!
              * Basically, on 32-bit NEON, registers are stored like so:
@@ -374,7 +414,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
              * does not. */
             uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k);
             xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));
-#else
+#  else
             /* Portable, but slightly slower version */
             uint32x2x2_t const d = vld2_u32(xdata + i * 4);
             uint32x2x2_t const k = vld2_u32(xkey + i * 4);
@@ -382,10 +422,11 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
             uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]);   /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
             /* xacc must be aligned on 16 bytes boundaries */
             xacc[i] = vmlal_u32(xacc[i], dkL, dkH);                /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
-#endif
+#  endif
         }
     }
-#else   /* scalar variant */
+
+#else   /* scalar variant - universal */
 
           U64* const xacc  =       (U64*) acc;
     const U32* const xdata = (const U32*) data;
@@ -407,10 +448,11 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
 #if (XXH_VECTOR == XXH_AVX2)
 
     assert(((size_t)acc) & 31 == 0);
-    {   __m256i* const xacc = (__m256i*) acc;
-        const __m256i* const xkey  = (const __m256i *) key;
+    {   ALIGN(32) __m256i* const xacc = (__m256i*) acc;
+        const     __m256i* const xkey  = (const __m256i *) key;
 
-        for (size_t i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
             __m256i data = xacc[i];
             __m256i const shifted = _mm256_srli_epi64(data, 47);
             data = _mm256_xor_si256(data, shifted);
@@ -429,8 +471,8 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
 #elif (XXH_VECTOR == XXH_SSE2)
 
     assert(((size_t)acc) & 15 == 0);
-    {   __m128i* const xacc = (__m128i*) acc;
-        const __m128i* const xkey  = (const __m128i *) key;
+    {   ALIGN(16) __m128i* const xacc = (__m128i*) acc;
+        const     __m128i* const xkey  = (const __m128i *) key;
 
         size_t i;
         for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
@@ -478,7 +520,7 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
         }   }
     }
 
-#else   /* scalar variant */
+#else   /* scalar variant - universal */
 
           U64* const xacc =       (U64*) acc;
     const U32* const xkey = (const U32*) key;

From b74c215b363362cd94a031dc0eedd92e109eae50 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 12 Mar 2019 12:00:30 -0700
Subject: [PATCH 53/73] try to fix travis install script

---
 .travis.yml | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 29923cd2..4a53c846 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,16 +6,15 @@ matrix:
 
     - name: General linux tests (Xenial)
       dist: xenial
+      before_install:
+        - sudo apt-get update  -qq
+        - sudo apt-get install -qq gcc-arm-linux-gnueabi
+        - sudo apt-get install -qq clang
+        - sudo apt-get install -qq g++-multilib
+        - sudo apt-get install -qq gcc-multilib
+        - sudo apt-get install -qq cppcheck
       script:
         - make -B test-all
-      install:
-        - sudo apt-get update  -qq
-        - sudo apt-get install -qq
-            gcc-arm-linux-gnueabi
-            clang
-            g++-multilib
-            gcc-multilib
-            cppcheck
 
     - name: check results consistency
       script:

From 30c8fb59c55ccfdcd08bfcd447df53e8b2de485e Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 12 Mar 2019 12:44:42 -0700
Subject: [PATCH 54/73] added ARM tests on travis

---
 .travis.yml | 22 +++++++++++++++++++++-
 Makefile    | 10 ++++++----
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 4a53c846..f93b240d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,10 +16,30 @@ matrix:
       script:
         - make -B test-all
 
-    - name: check results consistency
+    - name: Check results consistency on x64
       script:
         - CPPFLAGS=-DXXH_VECTOR=0 make check   # Scalar code
         - make clean
         - CPPFLAGS=-DXXH_VECTOR=1 make check   # SSE2 code path
         - make clean
         - CPPFLAGS="-mavx2 -DXXH_VECTOR=2" make check   # AVX2 code path
+
+    - name: ARM + aarch64 compilation
+      install:
+        - sudo apt-get install -qq
+            qemu-system-arm
+            qemu-user-static
+            gcc-arm-linux-gnueabi
+            libc6-dev-armel-cross
+            gcc-aarch64-linux-gnu
+            libc6-dev-arm64-cross
+      script:
+        - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static make check   # Scalar code path
+        - make clean
+        - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check   # NEON code path
+        - make clean
+        # aarch64
+        - CC=aarch64-linux-gnu-gcc RUN_ENV=qemu-aarch64-static CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static make check   # Scalar code path
+        - make clean
+        - CC=aarch64-linux-gnu-gcc RUN_ENV=qemu-aarch64-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check   # NEON code path
+        - make clean
diff --git a/Makefile b/Makefile
index c0422d07..df4319ab 100644
--- a/Makefile
+++ b/Makefile
@@ -122,16 +122,18 @@ lib: libxxhash.a libxxhash
 # tests
 # =================================================
 
+# make check can be run with cross-compiled binaries on emulated environments (qemu user mode)
+# by setting $(RUN_ENV) to the target emulation environment
 .PHONY: check
 check: xxhsum
 	# stdin
-	./xxhsum < xxhash.c
+	$(RUN_ENV) ./xxhsum < xxhash.c
 	# multiple files
-	./xxhsum xxhash.* xxhsum.*
+	$(RUN_ENV) ./xxhsum xxhash.* xxhsum.*
 	# internal bench
-	./xxhsum -bi1
+	$(RUN_ENV) ./xxhsum -bi1
 	# file bench
-	./xxhsum -bi1 xxhash.c
+	$(RUN_ENV) ./xxhsum -bi1 xxhash.c
 
 .PHONY: test-mem
 test-mem: xxhsum

From 51ac7dc7e9289bd782201e48c8c3534fcf0f0555 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 12 Mar 2019 12:56:52 -0700
Subject: [PATCH 55/73] fixed minor conversion warning

detected on ARM 32-bit
---
 .travis.yml | 5 +++--
 xxh3.h      | 6 +++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index f93b240d..7598ae86 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,13 +18,13 @@ matrix:
 
     - name: Check results consistency on x64
       script:
-        - CPPFLAGS=-DXXH_VECTOR=0 make check   # Scalar code
+        - CPPFLAGS=-DXXH_VECTOR=0 make check   # Scalar code path
         - make clean
         - CPPFLAGS=-DXXH_VECTOR=1 make check   # SSE2 code path
         - make clean
         - CPPFLAGS="-mavx2 -DXXH_VECTOR=2" make check   # AVX2 code path
 
-    - name: ARM + aarch64 compilation
+    - name: ARM + aarch64 compilation and consistency checks
       install:
         - sudo apt-get install -qq
             qemu-system-arm
@@ -34,6 +34,7 @@ matrix:
             gcc-aarch64-linux-gnu
             libc6-dev-arm64-cross
       script:
+        # arm (32-bit)
         - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static make check   # Scalar code path
         - make clean
         - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check   # NEON code path
diff --git a/xxh3.h b/xxh3.h
index 420c463e..a019ce0e 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -230,8 +230,8 @@ XXH3_mul128(U64 ll1, U64 ll2)
 #else /* Portable scalar version */
 
     /* emulate 64x64->128b multiplication, using four 32x32->64 */
-    U32 const h1 = ll1 >> 32;
-    U32 const h2 = ll2 >> 32;
+    U32 const h1 = (U32)(ll1 >> 32);
+    U32 const h2 = (U32)(ll2 >> 32);
     U32 const l1 = (U32)ll1;
     U32 const l2 = (U32)ll2;
 
@@ -375,7 +375,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
 #elif (XXH_VECTOR == XXH_NEON)  /* note : no longer correct, must be updated to match new formula */
 
     assert(((size_t)acc) & 15 == 0);
-    {         uint64x2_t* const xacc  =     (uint64x2_t *)acc;
+    {       uint64x2_t* const xacc  =     (uint64x2_t *)acc;
         const uint32_t* const xdata = (const uint32_t *)data;
         const uint32_t* const xkey  = (const uint32_t *)key;
 

From a767eaa074dffedeba62cebc4bc2f98872c0e711 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 12 Mar 2019 13:58:26 -0700
Subject: [PATCH 56/73] added PowerPC tests

on TravisCI

will be useful to check endianness.
---
 .travis.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 7598ae86..d0cd4e70 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -44,3 +44,12 @@ matrix:
         - make clean
         - CC=aarch64-linux-gnu-gcc RUN_ENV=qemu-aarch64-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check   # NEON code path
         - make clean
+
+    - name: PowerPC + PPC64 compilation and consistency checks
+      install:
+        - sudo apt-get install -qq qemu-system-ppc qemu-user-static gcc-powerpc-linux-gnu
+      script:
+        - CC=powerpc-linux-gnu-gcc RUN_ENV=qemu-ppc-static CPPFLAGS=-m32 LDFLAGS=-static make check   # Only scalar code path available
+        - make clean
+        - CC=powerpc-linux-gnu-gcc RUN_ENV=qemu-ppc64-static CPPFLAGS=-m64 LDFLAGS=-static make check   # Only scalar code path available
+        - make clean

From 3fe53a4ab97df94fdb9d97f2b0b77649e321a604 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 12 Mar 2019 14:21:24 -0700
Subject: [PATCH 57/73] fixed endianness issue

---
 .travis.yml |   2 +-
 xxh3.h      | 125 ++++++++++++++++++++++++++--------------------
 xxhash.c    | 139 +++++++++++++++++++++++-----------------------------
 3 files changed, 135 insertions(+), 131 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index d0cd4e70..075a947a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -51,5 +51,5 @@ matrix:
       script:
         - CC=powerpc-linux-gnu-gcc RUN_ENV=qemu-ppc-static CPPFLAGS=-m32 LDFLAGS=-static make check   # Only scalar code path available
         - make clean
-        - CC=powerpc-linux-gnu-gcc RUN_ENV=qemu-ppc64-static CPPFLAGS=-m64 LDFLAGS=-static make check   # Only scalar code path available
+        - CC=powerpc-linux-gnu-gcc RUN_ENV=qemu-ppc64-static CFLAGS="-O3 -m64" LDFLAGS="-static -m64" make check   # Only scalar code path available
         - make clean
diff --git a/xxh3.h b/xxh3.h
index a019ce0e..3ffa6825 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -253,7 +253,7 @@ XXH3_mul128(U64 ll1, U64 ll2)
 }
 
 
-static XXH64_hash_t XXH64_avalanche2(U64 h64)
+static XXH64_hash_t XXH3_avalanche(U64 h64)
 {
     h64 ^= h64 >> 29;
     h64 *= PRIME64_3;
@@ -265,6 +265,7 @@ static XXH64_hash_t XXH64_avalanche2(U64 h64)
 /* ==========================================
  * Short keys
  * ========================================== */
+
 XXH_FORCE_INLINE XXH64_hash_t
 XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
 {
@@ -278,11 +279,10 @@ XXH3_len_1to3_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t
         U32  const l1 = (U32)(c1) + ((U32)(c2) << 8);
         U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
         U64  const ll11 = XXH_mult32to64((l1 + seed + key32[0]), (l2 + key32[1]));
-        return XXH64_avalanche2(ll11);
+        return XXH3_avalanche(ll11);
     }
 }
 
-
 XXH_FORCE_INLINE XXH64_hash_t
 XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
 {
@@ -290,10 +290,22 @@ XXH3_len_4to8_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t
     assert(len >= 4 && len <= 8);
     {   const U32* const key32 = (const U32*) keyPtr;
         U64 acc = PRIME64_1 * (len + seed);
-        U32 const l1 = XXH_read32(data) + key32[0];
-        U32 const l2 = XXH_read32((const BYTE*)data + len - 4) + key32[1];
+        U32 const l1 = XXH_readLE32(data) + key32[0];
+        U32 const l2 = XXH_readLE32((const BYTE*)data + len - 4) + key32[1];
         acc += XXH_mult32to64(l1, l2);
-        return XXH64_avalanche2(acc);
+        return XXH3_avalanche(acc);
+    }
+}
+
+XXH_FORCE_INLINE U64
+XXH3_readKey64(const void* ptr)
+{
+    assert(((size_t)ptr & 7) == 0);   /* aligned on 8-bytes boundaries */
+    if (XXH_CPU_LITTLE_ENDIAN) {
+        return *(const U64*)ptr;
+    } else {
+        const U32* const ptr32 = (const U32*)ptr;
+        return (U64)ptr32[0] + (((U64)ptr32[1]) << 32);
     }
 }
 
@@ -305,10 +317,10 @@ XXH3_len_9to16_64b(const void* data, size_t len, const void* keyPtr, XXH64_hash_
     assert(len >= 9 && len <= 16);
     {   const U64* const key64 = (const U64*) keyPtr;
         U64 acc = PRIME64_1 * (len + seed);
-        U64 const ll1 = XXH_read64(data) + key64[0];
-        U64 const ll2 = XXH_read64((const BYTE*)data + len - 8) + key64[1];
+        U64 const ll1 = XXH_readLE64(data) + XXH3_readKey64(key64);
+        U64 const ll2 = XXH_readLE64((const BYTE*)data + len - 8) + XXH3_readKey64(key64+1);
         acc += XXH3_mul128(ll1, ll2);
-        return XXH64_avalanche2(acc);
+        return XXH3_avalanche(acc);
     }
 }
 
@@ -325,9 +337,7 @@ XXH3_len_0to16_64b(const void* data, size_t len, XXH64_hash_t seed)
 }
 
 
-/* ==========================================
- * Long keys
- * ========================================== */
+/* ===    Long Keys    === */
 
 #define STRIPE_LEN 64
 #define STRIPE_ELTS (STRIPE_LEN / sizeof(U32))
@@ -428,7 +438,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
 
 #else   /* scalar variant - universal */
 
-          U64* const xacc  =       (U64*) acc;
+          U64* const xacc  =       (U64*) acc;   /* presumed aligned */
     const U32* const xdata = (const U32*) data;
     const U32* const xkey  = (const U32*) key;
 
@@ -436,8 +446,10 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
     for (i=0; i < (int)ACC_NB; i++) {
         int const left = 2*i;
         int const right= 2*i + 1;
-        xacc[i] += XXH_mult32to64(xdata[left] + xkey[left], xdata[right] + xkey[right]);
-        xacc[i] += xdata[left] + ((U64)xdata[right] << 32);
+        U32 const dataLeft  = XXH_readLE32(xdata + left);
+        U32 const dataRight = XXH_readLE32(xdata + right);
+        xacc[i] += XXH_mult32to64(dataLeft + xkey[left], dataRight + xkey[right]);
+        xacc[i] += dataLeft + ((U64)dataRight << 32);
     }
 
 #endif
@@ -531,8 +543,8 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
         int const right= 2*i + 1;
         xacc[i] ^= xacc[i] >> 47;
 
-        {   U64 p1 = XXH_mult32to64(xacc[i] & 0xFFFFFFFF, xkey[left]);
-            U64 p2 = XXH_mult32to64(xacc[i] >> 32, xkey[right]);
+        {   U64 const p1 = XXH_mult32to64(xacc[i] & 0xFFFFFFFF, xkey[left]);
+            U64 const p2 = XXH_mult32to64(xacc[i] >> 32,        xkey[right]);
             xacc[i] = p1 ^ p2;
     }   }
 
@@ -548,24 +560,6 @@ static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* rest
     }
 }
 
-XXH_FORCE_INLINE U64 XXH3_mix16B(const void* data, const U64* key)
-{
-    return XXH3_mul128((XXH_read64(data) ^ key[0]), XXH_read64((const BYTE*)data+8) ^ key[1]);
-}
-
-static XXH64_hash_t XXH3_merge64B(const U64* data, const void* keyVoid, U64 start)
-{
-    const U64* const key = (const U64*)keyVoid;  /* presumed aligned */
-
-    U64 acc = start;
-    acc += XXH3_mix16B(data+0, key+0);
-    acc += XXH3_mix16B(data+2, key+2);
-    acc += XXH3_mix16B(data+4, key+4);
-    acc += XXH3_mix16B(data+6, key+6);
-
-    return XXH64_avalanche2(acc);
-}
-
 static void
 XXH3_hashLong(U64* acc, const void* data, size_t len)
 {
@@ -593,6 +587,35 @@ XXH3_hashLong(U64* acc, const void* data, size_t len)
     }   }
 }
 
+
+XXH_FORCE_INLINE U64 XXH3_mix16B(const void* data, const void* key)
+{
+    const U64* const key64 = (const U64*)key;
+    return XXH3_mul128(
+               XXH_readLE64(data) ^ XXH3_readKey64(key64),
+               XXH_readLE64((const BYTE*)data+8) ^ XXH3_readKey64(key64+1) );
+}
+
+XXH_FORCE_INLINE U64 XXH3_mix2Accs(const U64* acc, const void* key)
+{
+    const U64* const key64 = (const U64*)key;
+    return XXH3_mul128(
+               acc[0] ^ XXH3_readKey64(key64),
+               acc[1] ^ XXH3_readKey64(key64+1) );
+}
+
+static XXH64_hash_t XXH3_mergeAccs(const U64* acc, const U32* key, U64 start)
+{
+    U64 result64 = start;
+
+    result64 += XXH3_mix2Accs(acc+0, key+0);
+    result64 += XXH3_mix2Accs(acc+2, key+4);
+    result64 += XXH3_mix2Accs(acc+4, key+8);
+    result64 += XXH3_mix2Accs(acc+6, key+12);
+
+    return XXH3_avalanche(result64);
+}
+
 __attribute__((noinline)) static XXH64_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
 XXH3_hashLong_64b(const void* data, size_t len, XXH64_hash_t seed)
 {
@@ -602,13 +625,11 @@ XXH3_hashLong_64b(const void* data, size_t len, XXH64_hash_t seed)
 
     /* converge into final hash */
     assert(sizeof(acc) == 64);
-    return XXH3_merge64B(acc, kKey, (U64)len * PRIME64_1);
+    return XXH3_mergeAccs(acc, kKey, (U64)len * PRIME64_1);
 }
 
 
-/* ==========================================
- * Public entry point
- * ========================================== */
+/* ===   Public entry point   === */
 
 XXH_PUBLIC_API XXH64_hash_t
 XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
@@ -640,7 +661,7 @@ XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
         acc += XXH3_mix16B(p+0, key+0);
         acc += XXH3_mix16B(p+len-16, key+2);
 
-        return XXH64_avalanche2(acc);
+        return XXH3_avalanche(acc);
     }
 }
 
@@ -674,7 +695,7 @@ XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_
         U32  const l2 = (U32)(len) + ((U32)(c3) << 2);
         U64  const ll11 = XXH_mult32to64(l1 + seed + key32[0], l2 + key32[1]);
         U64  const ll12 = XXH_mult32to64(l1 + key32[2], l2 - seed + key32[3]);
-        return (XXH128_hash_t) { XXH64_avalanche2(ll11), XXH64_avalanche2(ll12) };
+        return (XXH128_hash_t) { XXH3_avalanche(ll11), XXH3_avalanche(ll12) };
     }
 }
 
@@ -687,11 +708,11 @@ XXH3_len_4to8_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_
     {   const U32* const key32 = (const U32*) keyPtr;
         U64 acc1 = PRIME64_1 * ((U64)len + seed);
         U64 acc2 = PRIME64_2 * ((U64)len - seed);
-        U32 const l1 = XXH_read32(data);
-        U32 const l2 = XXH_read32((const BYTE*)data + len - 4);
+        U32 const l1 = XXH_readLE32(data);
+        U32 const l2 = XXH_readLE32((const BYTE*)data + len - 4);
         acc1 += XXH_mult32to64(l1 + key32[0], l2 + key32[1]);
         acc2 += XXH_mult32to64(l1 - key32[2], l2 + key32[3]);
-        return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) };
+        return (XXH128_hash_t){ XXH3_avalanche(acc1), XXH3_avalanche(acc2) };
     }
 }
 
@@ -704,11 +725,11 @@ XXH3_len_9to16_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash
     {   const U64* const key64 = (const U64*) keyPtr;
         U64 acc1 = PRIME64_1 * ((U64)len + seed);
         U64 acc2 = PRIME64_2 * ((U64)len - seed);
-        U64 const ll1 = XXH_read64(data);
-        U64 const ll2 = XXH_read64((const BYTE*)data + len - 8);
-        acc1 += XXH3_mul128(ll1 + key64[0], ll2 + key64[1]);
-        acc2 += XXH3_mul128(ll1 + key64[2], ll2 + key64[3]);
-        return (XXH128_hash_t){ XXH64_avalanche2(acc1), XXH64_avalanche2(acc2) };
+        U64 const ll1 = XXH_readLE64(data);
+        U64 const ll2 = XXH_readLE64((const BYTE*)data + len - 8);
+        acc1 += XXH3_mul128(ll1 + XXH3_readKey64(key64+0), ll2 + XXH3_readKey64(key64+1));
+        acc2 += XXH3_mul128(ll1 + XXH3_readKey64(key64+2), ll2 + XXH3_readKey64(key64+3));
+        return (XXH128_hash_t){ XXH3_avalanche(acc1), XXH3_avalanche(acc2) };
     }
 }
 
@@ -734,8 +755,8 @@ XXH3_hashLong_128b(const void* data, size_t len, XXH64_hash_t seed)
 
     /* converge into final hash */
     assert(sizeof(acc) == 64);
-    {   U64 const part1 = XXH3_merge64B(acc, kKey, (U64)len * PRIME64_1);
-        U64 const part2 = XXH3_merge64B(acc, kKey+16, ((U64)len+1) * PRIME64_2);
+    {   U64 const part1 = XXH3_mergeAccs(acc, kKey, (U64)len * PRIME64_1);
+        U64 const part2 = XXH3_mergeAccs(acc, kKey+16, ((U64)len+1) * PRIME64_2);
         return (XXH128_hash_t) { part1, part2 };
     }
 }
@@ -772,7 +793,7 @@ XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
 
         {   U64 const part1 = acc1 + acc2;
             U64 const part2 = (acc1 * PRIME64_3) + (acc2 * PRIME64_4) + ((len - seed) * PRIME64_2);
-            return (XXH128_hash_t) { XXH64_avalanche2(part1), -XXH64_avalanche2(part2) };
+            return (XXH128_hash_t) { XXH3_avalanche(part1), -XXH3_avalanche(part2) };
         }
     }
 }
diff --git a/xxhash.c b/xxhash.c
index c5fec9b0..82ee887b 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -154,6 +154,9 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcp
 # endif
 #endif
 
+
+/* ===   Memory access   === */
+
 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
 
 /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
@@ -181,6 +184,22 @@ static U32 XXH_read32(const void* memPtr)
 #endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
 
 
+/* ===   Endianess   === */
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+static int XXH_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
+#endif
+
+
+
+
 /* ****************************************
 *  Compiler-specific Functions and Macros
 ******************************************/
@@ -210,44 +229,29 @@ static U32 XXH_swap32 (U32 x)
 #endif
 
 
-/* *************************************
-*  Architecture Macros
-***************************************/
-typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
-
-/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
-#ifndef XXH_CPU_LITTLE_ENDIAN
-static int XXH_isLittleEndian(void)
-{
-    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
-    return one.c[0];
-}
-#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
-#endif
-
-
 /* ***************************
 *  Memory reads
 *****************************/
 typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
 
-XXH_FORCE_INLINE U32
-XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+XXH_FORCE_INLINE U32 XXH_readLE32(const void* ptr)
 {
-    if (align==XXH_unaligned)
-        return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
-    else
-        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
 }
 
-XXH_FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+static U32 XXH_readBE32(const void* ptr)
 {
-    return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
 }
 
-static U32 XXH_readBE32(const void* ptr)
+XXH_FORCE_INLINE U32
+XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
 {
-    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+    if (align==XXH_unaligned) {
+        return XXH_readLE32(ptr);
+    } else {
+        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+    }
 }
 
 
@@ -492,8 +496,8 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int s
 }
 
 
-XXH_FORCE_INLINE XXH_errorcode
-XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
+XXH_PUBLIC_API XXH_errorcode
+XXH32_update(XXH32_state_t* state, const void* input, size_t len)
 {
     if (input==NULL)
 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
@@ -517,10 +521,10 @@ XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_end
         if (state->memsize) {   /* some data left from previous update */
             XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
             {   const U32* p32 = state->mem32;
-                state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++;
-                state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++;
-                state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++;
-                state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian));
+                state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
+                state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
+                state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
+                state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
             }
             p += 16-state->memsize;
             state->memsize = 0;
@@ -534,10 +538,10 @@ XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_end
             U32 v4 = state->v4;
 
             do {
-                v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4;
-                v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4;
-                v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4;
-                v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4;
+                v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
+                v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
+                v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
+                v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
             } while (p<=limit);
 
             state->v1 = v1;
@@ -556,17 +560,6 @@ XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_end
 }
 
 
-XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
-{
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
-    else
-        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
-}
-
-
 XXH_FORCE_INLINE U32
 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
 {
@@ -686,23 +679,23 @@ static U64 XXH_swap64 (U64 x)
 }
 #endif
 
-XXH_FORCE_INLINE U64
-XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+XXH_FORCE_INLINE U64 XXH_readLE64(const void* ptr)
 {
-    if (align==XXH_unaligned)
-        return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
-    else
-        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
 }
 
-XXH_FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+static U64 XXH_readBE64(const void* ptr)
 {
-    return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
 }
 
-static U64 XXH_readBE64(const void* ptr)
+XXH_FORCE_INLINE U64
+XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
 {
-    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+    if (align==XXH_unaligned)
+        return XXH_readLE64(ptr);
+    else
+        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
 }
 
 
@@ -953,8 +946,8 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long
     return XXH_OK;
 }
 
-XXH_FORCE_INLINE XXH_errorcode
-XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH64_state_t* state, const void* input, size_t len)
 {
     if (input==NULL)
 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
@@ -976,10 +969,10 @@ XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_en
 
         if (state->memsize) {   /* tmp buffer is full */
             XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
-            state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian));
-            state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian));
-            state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian));
-            state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian));
+            state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
+            state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
+            state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
+            state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
             p += 32-state->memsize;
             state->memsize = 0;
         }
@@ -992,10 +985,10 @@ XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_en
             U64 v4 = state->v4;
 
             do {
-                v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8;
-                v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8;
-                v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8;
-                v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8;
+                v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
+                v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
+                v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
+                v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
             } while (p<=limit);
 
             state->v1 = v1;
@@ -1013,16 +1006,6 @@ XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_en
     return XXH_OK;
 }
 
-XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
-{
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
-    else
-        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
-}
-
 XXH_FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
 {
     U64 h64;
@@ -1077,7 +1060,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
 /* *******************************************************************
 *  XXH3
 *  New generation hash designed for speed on small keys and vectorization
-*********************************************************************/
+********************************************************************** */
 
 #include "xxh3.h"
 

From e6433e8dfda583809bbc7f9ce733331827a62c63 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 12 Mar 2019 17:36:37 -0700
Subject: [PATCH 58/73] restored clang #pragma unroll statement

The statement had been accidentally lost in an update.
---
 .travis.yml | 8 ++++----
 xxh3.h      | 5 ++++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 075a947a..78d918d7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -35,14 +35,14 @@ matrix:
             libc6-dev-arm64-cross
       script:
         # arm (32-bit)
-        - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static make check   # Scalar code path
+        - CC=arm-linux-gnueabi-gcc CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static RUN_ENV=qemu-arm-static make check   # Scalar code path
         - make clean
-        - CC=arm-linux-gnueabi-gcc RUN_ENV=qemu-arm-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check   # NEON code path
+        - CC=arm-linux-gnueabi-gcc CPPFLAGS=-DXXH_VECTOR=3 CFLAGS="-O3 -march=armv7-a -mfloat-abi=hard -mfpu=neon" LDFLAGS=-static RUN_ENV=qemu-arm-static make check   # NEON code path
         - make clean
         # aarch64
-        - CC=aarch64-linux-gnu-gcc RUN_ENV=qemu-aarch64-static CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static make check   # Scalar code path
+        - CC=aarch64-linux-gnu-gcc CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static RUN_ENV=qemu-aarch64-static make check   # Scalar code path
         - make clean
-        - CC=aarch64-linux-gnu-gcc RUN_ENV=qemu-aarch64-static CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static make check   # NEON code path
+        - CC=aarch64-linux-gnu-gcc CPPFLAGS=-DXXH_VECTOR=3 LDFLAGS=-static RUN_ENV=qemu-aarch64-static make check   # NEON code path
         - make clean
 
     - name: PowerPC + PPC64 compilation and consistency checks
diff --git a/xxh3.h b/xxh3.h
index 3ffa6825..0a5c4ccf 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -203,7 +203,6 @@ XXH3_mul128(U64 ll1, U64 ll2)
         && !(defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM == 0 && __TARGET_ARCH_THUMB == 4) \
     && (defined(__ARM_ARCH_6T2__) || __ARM_ARCH > 6) /* ARMv6T2 or later */
 
-    U64 t;
     U32 w[4] = { 0 };
     U32 u[2] = { (U32)(ll1 >> 32), (U32)ll1 };
     U32 v[2] = { (U32)(ll2 >> 32), (U32)ll2 };
@@ -554,6 +553,10 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
 static void XXH3_accumulate(U64* acc, const void* restrict data, const U32* restrict key, size_t nbStripes)
 {
     size_t n;
+    /* Clang doesn't unroll this loop without the pragma. Unrolling can be up to 1.4x faster. */
+#if defined(__clang__) && !defined(__OPTIMIZE_SIZE__)
+#  pragma clang loop unroll(enable)
+#endif
     for (n = 0; n < nbStripes; n++ ) {
         XXH3_accumulate_512(acc, (const BYTE*)data + n*STRIPE_LEN, key);
         key += 2;

From af852ac75212661ceb764ee50081b6fe5ddddf5e Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 12 Mar 2019 17:48:59 -0700
Subject: [PATCH 59/73] fixed last strict aliasing issues

---
 xxh3.h | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 0a5c4ccf..df1b04cf 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -638,7 +638,7 @@ XXH_PUBLIC_API XXH64_hash_t
 XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
 {
     const BYTE* const p = (const BYTE*)data;
-    const U64* const key = (const U64*)(const void*)kKey;
+    const char* const key = (const char*)kKey;
 
     if (len <= 16) return XXH3_len_0to16_64b(data, len, seed);
 
@@ -648,21 +648,21 @@ XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
                 if (len > 96) {
                     if (len > 128) return XXH3_hashLong_64b(data, len, seed);
 
-                    acc += XXH3_mix16B(p+48, key+12);
-                    acc += XXH3_mix16B(p+len-64, key+14);
+                    acc += XXH3_mix16B(p+48, key+96);
+                    acc += XXH3_mix16B(p+len-64, key+112);
                 }
 
-                acc += XXH3_mix16B(p+32, key+8);
-                acc += XXH3_mix16B(p+len-48, key+10);
+                acc += XXH3_mix16B(p+32, key+64);
+                acc += XXH3_mix16B(p+len-48, key+80);
             }
 
-            acc += XXH3_mix16B(p+16, key+4);
-            acc += XXH3_mix16B(p+len-32, key+6);
+            acc += XXH3_mix16B(p+16, key+32);
+            acc += XXH3_mix16B(p+len-32, key+48);
 
         }
 
         acc += XXH3_mix16B(p+0, key+0);
-        acc += XXH3_mix16B(p+len-16, key+2);
+        acc += XXH3_mix16B(p+len-16, key+16);
 
         return XXH3_avalanche(acc);
     }
@@ -772,27 +772,27 @@ XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed)
     {   U64 acc1 = PRIME64_1 * (len + seed);
         U64 acc2 = 0;
         const BYTE* const p = (const BYTE*)data;
-        const U64* const key = (const U64*)(const void*)kKey;
+        const char* const key = (const char*)kKey;
         if (len > 32) {
             if (len > 64) {
                 if (len > 96) {
                     if (len > 128) return XXH3_hashLong_128b(data, len, seed);
 
-                    acc1 += XXH3_mix16B(p+48, key+12);
-                    acc2 += XXH3_mix16B(p+len-64, key+14);
+                    acc1 += XXH3_mix16B(p+48, key+96);
+                    acc2 += XXH3_mix16B(p+len-64, key+112);
                 }
 
-                acc1 += XXH3_mix16B(p+32, key+8);
-                acc2 += XXH3_mix16B(p+len-48, key+10);
+                acc1 += XXH3_mix16B(p+32, key+64);
+                acc2 += XXH3_mix16B(p+len-48, key+80);
             }
 
-            acc1 += XXH3_mix16B(p+16, key+4);
-            acc2 += XXH3_mix16B(p+len-32, key+6);
+            acc1 += XXH3_mix16B(p+16, key+32);
+            acc2 += XXH3_mix16B(p+len-32, key+48);
 
         }
 
         acc1 += XXH3_mix16B(p+0, key+0);
-        acc2 += XXH3_mix16B(p+len-16, key+2);
+        acc2 += XXH3_mix16B(p+len-16, key+16);
 
         {   U64 const part1 = acc1 + acc2;
             U64 const part2 = (acc1 * PRIME64_3) + (acc2 * PRIME64_4) + ((len - seed) * PRIME64_2);

From 8423e82ef82b8fe1ecd290d8d2ffa66b82c11524 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 12 Mar 2019 18:13:46 -0700
Subject: [PATCH 60/73] fixed last integration issues

---
 xxh3.h   |  4 ----
 xxhash.h | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index df1b04cf..39bc09d1 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -679,10 +679,6 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len)
 /* ==========================================
  * XXH3 128 bits (=> XXH128)
  * ========================================== */
-typedef struct {
-    XXH64_hash_t ll1;
-    XXH64_hash_t ll2;
-} XXH128_hash_t;
 
 XXH_FORCE_INLINE XXH128_hash_t
 XXH3_len_1to3_128b(const void* data, size_t len, const void* keyPtr, XXH64_hash_t seed)
diff --git a/xxhash.h b/xxhash.h
index 5b887223..d12ba0b5 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -328,17 +328,31 @@ struct XXH64_state_s {
 *  XXH3
 *  New experimental hash
 ************************************************************************/
+#ifndef XXH_NO_LONG_LONG
+
+typedef struct {
+    XXH64_hash_t ll1;
+    XXH64_hash_t ll2;
+} XXH128_hash_t;
+
 
 #ifdef XXH_NAMESPACE
+#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
 #  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
 #  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
 #endif
 
 /* note : variant without seed produces same result as variant with seed == 0 */
+XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, unsigned long long seed);
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, unsigned long long seed);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, unsigned long long seed);
 
 
+#endif  /* XXH_NO_LONG_LONG */
 
 
 /*-**********************************************************************

From 79014872e9b8d7165b8c88b412bcc880f21370a5 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 12 Mar 2019 18:27:32 -0700
Subject: [PATCH 61/73] separating ARM tests

---
 .travis.yml | 1 -
 Makefile    | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 78d918d7..ce23c918 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,7 +8,6 @@ matrix:
       dist: xenial
       before_install:
         - sudo apt-get update  -qq
-        - sudo apt-get install -qq gcc-arm-linux-gnueabi
         - sudo apt-get install -qq clang
         - sudo apt-get install -qq g++-multilib
         - sudo apt-get install -qq gcc-multilib
diff --git a/Makefile b/Makefile
index df4319ab..72e04f82 100644
--- a/Makefile
+++ b/Makefile
@@ -227,7 +227,7 @@ preview-man: clean-man man
 test: all namespaceTest check test-xxhsum-c c90test
 
 test-all: CFLAGS += -Werror
-test-all: test test32 armtest clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze cppcheck
+test-all: test test32 clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze cppcheck
 
 .PHONY: listL120
 listL120:  # extract lines >= 120 characters in *.{c,h}, by Takayuki Matsuoka (note : $$, for Makefile compatibility)

From c1ae3287a17d1a123e9bd717806cba10a90420ee Mon Sep 17 00:00:00 2001
From: "easyaspi314 (Devin)" <easyaspi314@users.noreply.github.com>
Date: Tue, 12 Mar 2019 22:20:45 -0400
Subject: [PATCH 62/73] Update ARM NEON code

The NEON algorithms have now been updated to match the SSE2 algorithm.
---
 xxh3.h | 108 +++++++++++++++++++++++++++------------------------------
 1 file changed, 51 insertions(+), 57 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 39bc09d1..2dc61a35 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -208,17 +208,32 @@ XXH3_mul128(U64 ll1, U64 ll2)
     U32 v[2] = { (U32)(ll2 >> 32), (U32)ll2 };
     U32 k;
 
+    /* U64 t = (U64)u[1] * (U64)v[1];
+     * w[3] = t & 0xFFFFFFFF;
+     * k = t >> 32; */
     __asm__("umull %0, %1, %2, %3"
             : "=r" (w[3]), "=r" (k)
             : "r" (u[1]), "r" (v[1]));
+
+    /* t = (U64)u[0] * (U64)v[1] + w[2] + k;
+     * w[2] = t & 0xFFFFFFFF;
+     * k = t >> 32; */
     __asm__("umaal %0, %1, %2, %3"
             : "+r" (w[2]), "+r" (k)
             : "r" (u[0]), "r" (v[1]));
     w[1] = k;
     k = 0;
+
+    /* t = (U64)u[1] * (U64)v[0] + w[2] + k;
+     * w[2] = t & 0xFFFFFFFF;
+     * k = t >> 32; */
     __asm__("umaal %0, %1, %2, %3"
             : "+r" (w[2]), "+r" (k)
             : "r" (u[1]), "r" (v[0]));
+
+    /* t = (U64)u[0] * (U64)v[0] + w[1] + k;
+     * w[1] = t & 0xFFFFFFFF;
+     * k = t >> 32; */
     __asm__("umaal %0, %1, %2, %3"
             : "+r" (w[1]), "+r" (k)
             : "r" (u[0]), "r" (v[0]));
@@ -381,7 +396,7 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
         }
     }
 
-#elif (XXH_VECTOR == XXH_NEON)  /* note : no longer correct, must be updated to match new formula */
+#elif (XXH_VECTOR == XXH_NEON)
 
     assert(((size_t)acc) & 15 == 0);
     {       uint64x2_t* const xacc  =     (uint64x2_t *)acc;
@@ -390,48 +405,31 @@ XXH3_accumulate_512(void* acc, const void *restrict data, const void *restrict k
 
         size_t i;
         for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
-#  if !defined(__aarch64__) && !defined(__arm64__) && !defined(XXH_NO_ARM32_HACK)
-            /* On 32-bit ARM, we can take advantage of the packed registers.
-             * This is not portable to aarch64!
-             * Basically, on 32-bit NEON, registers are stored like so:
-             *  .----------------------------------.
-             *  |                q8                | // uint32x4_t
-             *  |-----------------.----------------|
-             *  |  d16 (.val[0])  |  d17 (.val[1]) | // uint32x2x2_t
-             *  '-----------------'----------------'
-             * vld2.32 will store its values into two double registers, returning
-             * a uint32x2_t. In NEON, this will be stored in, for example, d16 and d17.
-             * Reinterpret cast it to a uint32x4_t and you get q8 for free
-             *
-             * On aarch64, this was changed completely.
-             *
-             * aarch64 gave us 16 more quad registers, but they also removed this behavior,
-             * instead matching smaller registers to the lower sections of the higher
-             * registers and zeroing the rest.
-             *  .----------------------------------..---------------------------------.
-             *  |               v8.4s              |               v9.4s               |
-             *  |-----------------.----------------|-----------------.-----------------|
-             *  | v8.2s (.val[0]) |     <zero>     | v9.2s (.val[1]) |      <zero>     |
-             *  '-----------------'----------------'-----------------'-----------------'
-             * On aarch64, ld2 will put it into v8.2s and v9.2s. Reinterpreting
-             * is not going to help us here, as half of it will end up being zero. */
-
-            uint32x2x2_t d = vld2_u32(xdata + i * 4);     /* load and swap */
-            uint32x2x2_t k = vld2_u32(xkey + i * 4);
-            /* Not sorry about breaking the strict aliasing rule.
-             * Using a union causes GCC to spit out nonsense, but an alias cast
-             * does not. */
-            uint32x4_t const dk = vaddq_u32(*(uint32x4_t*)&d, *(uint32x4_t*)&k);
-            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));
-#  else
-            /* Portable, but slightly slower version */
-            uint32x2x2_t const d = vld2_u32(xdata + i * 4);
-            uint32x2x2_t const k = vld2_u32(xkey + i * 4);
-            uint32x2_t const dkL = vadd_u32(d.val[0], k.val[0]);
-            uint32x2_t const dkH = vadd_u32(d.val[1], k.val[1]);   /* uint32 dk[4]  = {d0+k0, d1+k1, d2+k2, d3+k3} */
-            /* xacc must be aligned on 16 bytes boundaries */
-            xacc[i] = vmlal_u32(xacc[i], dkL, dkH);                /* uint64 res[2] = {dk0*dk1,dk2*dk3} */
-#  endif
+            uint32x4_t const d = vld1q_u32(xdata+i*4);                           /* U32 d[4] = xdata[i]; */
+            uint32x4_t const k = vld1q_u32(xkey+i*4);                            /* U32 k[4] = xkey[i]; */
+            uint32x4_t dk = vaddq_u32(d, k);                                     /* U32 dk[4] = {d0+k0, d1+k1, d2+k2, d3+k3} */
+#if !defined(__aarch64__) && !defined(__arm64__) /* ARM32-specific hack */
+            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
+             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
+             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
+             * a lot.
+             * aarch64 not only uses an entirely different syntax, but it requires three
+             * instructions...
+             *    ext    v1.16B, v0.16B, #8    // select high bits because aarch64 can't address them directly
+             *    zip1   v3.2s, v0.2s, v1.2s   // first zip
+             *    zip2   v2.2s, v0.2s, v1.2s   // second zip
+             * ...to do what ARM does in one:
+             *    vzip.32 d0, d1               // Interleave high and low bits and overwrite. */
+            __asm__("vzip.32 %e0, %f0" : "+w" (dk));                             /* dk = { dk0, dk2, dk1, dk3 }; */
+            xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u32(d));              /* xacc[i] += (U64x2)d; */
+            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(dk), vget_high_u32(dk));   /* xacc[i] += { (U64)dk0*dk1, (U64)dk2*dk3 }; */
+#else
+            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */
+            uint32x2_t dkL = vmovn_u64(vreinterpretq_u64_u32(dk));               /* U32 dkL[2] = dk & 0xFFFFFFFF; */
+            uint32x2_t dkH = vshrn_n_u64(vreinterpretq_u64_u32(dk), 32);         /* U32 dkH[2] = dk >> 32; */
+            xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u32(d));              /* xacc[i] += (U64x2)d; */
+            xacc[i] = vmlal_u32(xacc[i], dkL, dkH);                              /* xacc[i] += (U64x2)dkL*(U64x2)dkH; */
+#endif
         }
     }
 
@@ -502,21 +500,17 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
         }   }
     }
 
-#elif (XXH_VECTOR == XXH_NEON)   /* note : no longer correct, must be updated to match new formula */
+#elif (XXH_VECTOR == XXH_NEON)
 
     assert(((size_t)acc) & 15 == 0);
-    {       uint64x2_t* const xacc =       (uint64x2_t*) acc;
-        const uint32_t* const xkey  = (const uint32_t *) key;
-        uint64x2_t xor_p5 = vdupq_n_u64(PRIME64_5);
+    {       uint64x2_t* const xacc =     (uint64x2_t*) acc;
+        const uint32_t* const xkey = (const uint32_t*) key;
         size_t i;
-        /* Clang and GCC like to put NEON constant loads into the loop. */
-        __asm__("" : "+w" (xor_p5));
+
         for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
             uint64x2_t data = xacc[i];
-            uint64x2_t const shifted = vshrq_n_u64(data, 47);
-            data = veorq_u64(data, shifted);
-            data = veorq_u64(data, xor_p5);
-
+            uint64x2_t const shifted = vshrq_n_u64(data, 47);          /* uint64 shifted[2] = data >> 47; */
+            data = veorq_u64(data, shifted);                           /* data ^= shifted; */
             {
                 /* shuffle: 0, 1, 2, 3 -> 0, 2, 1, 3 */
                 uint32x2x2_t const d =
@@ -524,10 +518,10 @@ static void XXH3_scrambleAcc(void* acc, const void* key)
                         vget_low_u32(vreinterpretq_u32_u64(data)),
                         vget_high_u32(vreinterpretq_u32_u64(data))
                     );
-                uint32x2x2_t const k = vld2_u32 (xkey+i*4);              /* load and swap */
-                uint64x2_t const dk  = vmull_u32(d.val[0],k.val[0]);     /* U64 dk[2]  = {d0 * k0, d2 * k2} */
-                uint64x2_t const dk2 = vmull_u32(d.val[1],k.val[1]);     /* U64 dk2[2] = {d1 * k1, d3 * k3} */
-                xacc[i] = veorq_u64(dk, dk2);                            /* xacc[i] = dk ^ dk2;             */
+                uint32x2x2_t const k = vld2_u32(xkey+i*4);               /* load and swap */
+                uint64x2_t const dk  = vmull_u32(d.val[0],k.val[0]);     /* U64 dk[2]  = {(U64)d0*k0, (U64)d2*k2} */
+                uint64x2_t const dk2 = vmull_u32(d.val[1],k.val[1]);     /* U64 dk2[2] = {(U64)d1*k1, (U64)d3*k3} */
+                xacc[i] = veorq_u64(dk, dk2);                            /* xacc[i] = dk^dk2;             */
         }   }
     }
 

From ba14aed723deb8adc701cb09fa5e92d63e8bd6e7 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 13 Mar 2019 10:42:08 -0700
Subject: [PATCH 63/73] removed cppcheck from test-all

this test is unreliable:
dubious warning messages,
and results vary depending on version.
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 72e04f82..79c0ca64 100644
--- a/Makefile
+++ b/Makefile
@@ -227,7 +227,7 @@ preview-man: clean-man man
 test: all namespaceTest check test-xxhsum-c c90test
 
 test-all: CFLAGS += -Werror
-test-all: test test32 clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze cppcheck
+test-all: test test32 clangtest cxxtest usan listL120 trailingWhitespace staticAnalyze
 
 .PHONY: listL120
 listL120:  # extract lines >= 120 characters in *.{c,h}, by Takayuki Matsuoka (note : $$, for Makefile compatibility)

From d7419363d3003e78e332d237b4cb3cdee2d9d131 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 13 Mar 2019 10:47:41 -0700
Subject: [PATCH 64/73] travis: moved ARM tests to Xenial

in an effort to replicate success on local Xenial VM
---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index ce23c918..25f89735 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -24,6 +24,7 @@ matrix:
         - CPPFLAGS="-mavx2 -DXXH_VECTOR=2" make check   # AVX2 code path
 
     - name: ARM + aarch64 compilation and consistency checks
+      dist: xenial
       install:
         - sudo apt-get install -qq
             qemu-system-arm

From 2e86e206963e147b8d3bf0f60877963f252b035a Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 13 Mar 2019 12:14:21 -0700
Subject: [PATCH 65/73] added list of open questions for xxh3

---
 xxhash.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/xxhash.h b/xxhash.h
index d12ba0b5..6a42838a 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -330,12 +330,67 @@ struct XXH64_state_s {
 ************************************************************************/
 #ifndef XXH_NO_LONG_LONG
 
+
+/* ============================================
+ * XXH3 is a new hash algorithm,
+ * featuring vastly improved speed performance
+ * for both small and large inputs.
+ * A full speed analysis will be published,
+ * it requires a lot more space than this comment can handle.
+ * In general, expect XXH3 to run about ~2x faster on large inputs,
+ * and >3x faster on small ones, though exact difference depend on platform.
+ *
+ * The algorithm is portable, will generate the same hash on all platforms.
+ * It benefits greatly from vectorization units, but does not require it.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ * The low 64-bits of the _128bits variant are the same as the _64bits variant.
+ * However, if only 64-bits are needed, prefer calling the _64bits variant.
+ * It reduces the amount of mixing, resulting in faster speed on small inputs.
+ *
+ * The XXH3 algorithm is still considered experimental.
+ * It's possible to use it for ephemeral data, but avoid storing long-term values for later re-use.
+ * While labelled experimental, the produced result can still change between versions.
+ *
+ * The API currently supports one-shot hashing only.
+ * The full version will include streaming capability, and canonical representation
+ * Long term optional feature may include custom secret keys, and secret key generation.
+ *
+ * There are still a number of opened questions that community can influence during the experimental period.
+ * I'm trying to list a few of them below, though don't consider this list as complete.
+ *
+ * - 128-bits output type : currently defined as a structure of 2 64-bits fields.
+ *                          That's because 128-bits values do not exist in C standard.
+ *                          Note that it means that, at byte level, result is not identical depending on endianess.
+ *                          However, at field level, they are identical on all platforms.
+ *                          The canonical representation will solve the issue of identical byte-level representation across platforms,
+ *                          which is necessary for serialization.
+ *
+ * - Canonical representation : for the 64-bits variant, it's the same as XXH64() (aka big-endian).
+ *                          What should it be for the 128-bits variant ?
+ *                          Since it's no longer a scalar value, big-endian representation is no longer an obvious choice.
+ *                          One possibility : represent it as the concatenation of two 64-bits canonical representation (aka 2x big-endian)
+ *                          Another one : represent it in the same order as natural order for little-endian platforms.
+ *                                        Less consistent with existing convention for XXH32/XXH64, but may be more natural for little-endian platforms.
+ *
+ * - Seed type for 128-bits variant : currently, it's a single 64-bit value, like the 64-bits variant.
+ *                          It could be argued that it's more logical to offer a 128-bit seed capability for a 128-bit hash.
+ *                          Although it's also more difficult to use, since it requires to declare and pass a structure instead of a value.
+ *                          It would either replace current choice, or add a new one.
+ *                          Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`).
+ *
+ * - Result for len==0 : Currently, the result of hashing a zero-length input is the seed.
+ *                          This mimics the behavior of a crc : in which case, a seed is effectively an accumulator, so it's not updated if input is empty.
+ *                          Consequently, by default, when no seed specified, it returns zero. That part seems okay (it used to be a request for XXH32/XXH64).
+ *                          But is it still fine to return the seed when the seed is non-zero ?
+ *                          Are there use case which would depend on this behavior, or would prefer a mixing of the seed ?
+ */
+
 typedef struct {
     XXH64_hash_t ll1;
     XXH64_hash_t ll2;
 } XXH128_hash_t;
 
-
 #ifdef XXH_NAMESPACE
 #  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
 #  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)

From aaea63b97921a3c17c2e6bbb981053c0fc13d7f3 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 13 Mar 2019 14:44:41 -0700
Subject: [PATCH 66/73] added XXH128 consistency tests

---
 xxhash.h |  26 ++++++++----
 xxhsum.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 136 insertions(+), 9 deletions(-)

diff --git a/xxhash.h b/xxhash.h
index 6a42838a..1dc86536 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -344,7 +344,7 @@ struct XXH64_state_s {
  * It benefits greatly from vectorization units, but does not require it.
  *
  * XXH3 offers 2 variants, _64bits and _128bits.
- * The low 64-bits of the _128bits variant are the same as the _64bits variant.
+ * The first 64-bits field of the _128bits variant is the same as _64bits result.
  * However, if only 64-bits are needed, prefer calling the _64bits variant.
  * It reduces the amount of mixing, resulting in faster speed on small inputs.
  *
@@ -360,21 +360,33 @@ struct XXH64_state_s {
  * I'm trying to list a few of them below, though don't consider this list as complete.
  *
  * - 128-bits output type : currently defined as a structure of 2 64-bits fields.
- *                          That's because 128-bits values do not exist in C standard.
+ *                          That's because 128-bit values do not exist in C standard.
  *                          Note that it means that, at byte level, result is not identical depending on endianess.
  *                          However, at field level, they are identical on all platforms.
  *                          The canonical representation will solve the issue of identical byte-level representation across platforms,
  *                          which is necessary for serialization.
+ *                          Would there be a better representation for a 128-bit hash result ?
+ *                          Are the names of the inner 64-bit fields important ? Should they be changed ?
  *
- * - Canonical representation : for the 64-bits variant, it's the same as XXH64() (aka big-endian).
- *                          What should it be for the 128-bits variant ?
+ * - Canonical representation : for the 64-bit variant, canonical representation is the same as XXH64() (aka big-endian).
+ *                          What should it be for the 128-bit variant ?
  *                          Since it's no longer a scalar value, big-endian representation is no longer an obvious choice.
  *                          One possibility : represent it as the concatenation of two 64-bits canonical representation (aka 2x big-endian)
- *                          Another one : represent it in the same order as natural order for little-endian platforms.
+ *                          Another one : represent it in the same order as natural order in the struct for little-endian platforms.
  *                                        Less consistent with existing convention for XXH32/XXH64, but may be more natural for little-endian platforms.
  *
- * - Seed type for 128-bits variant : currently, it's a single 64-bit value, like the 64-bits variant.
- *                          It could be argued that it's more logical to offer a 128-bit seed capability for a 128-bit hash.
+ * - Associated functions for 128-bit hash : simple things, such as checking if 2 hashes are equal, become more difficult with struct.
+ *                          Granted, it's not terribly difficult to create a comparator, but it's still a workload.
+ *                          Would it be beneficial to declare and define a comparator function for XXH128_hash_t ?
+ *                          Are there other operations on XXH128_hash_t which would be desirable ?
+ *
+ * - Variant compatibility : The first 64-bit field of the _128bits variant is the same as the result of _64bits.
+ *                          This is not a compulsory behavior. It just felt that it "wouldn't hurt", and might even help in some (unidentified) cases.
+ *                          But it might influence the design of XXH128_hash_t, in ways which may block other possibilities.
+ *                          Good idea, bad idea ?
+ *
+ * - Seed type for 128-bits variant : currently, it's a single 64-bit value, like the 64-bit variant.
+ *                          It could be argued that it's more logical to offer a 128-bit seed input parameter for a 128-bit hash.
  *                          Although it's also more difficult to use, since it requires to declare and pass a structure instead of a value.
  *                          It would either replace current choice, or add a new one.
  *                          Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`).
diff --git a/xxhsum.c b/xxhsum.c
index 7428b62d..1b661cd5 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -438,7 +438,7 @@ static int BMK_benchInternal(size_t keySize, U32 specificTest)
  * ensure results consistency accross platforms
  *********************************************** */
 
-static void BMK_checkResult32(U32 r1, U32 r2)
+static void BMK_checkResult32(XXH32_hash_t r1, XXH32_hash_t r2)
 {
     static int nbTests = 1;
     if (r1!=r2) {
@@ -448,7 +448,7 @@ static void BMK_checkResult32(U32 r1, U32 r2)
     nbTests++;
 }
 
-static void BMK_checkResult64(U64 r1, U64 r2)
+static void BMK_checkResult64(XXH64_hash_t r1, XXH64_hash_t r2)
 {
     static int nbTests = 1;
     if (r1!=r2) {
@@ -459,6 +459,19 @@ static void BMK_checkResult64(U64 r1, U64 r2)
     nbTests++;
 }
 
+static void BMK_checkResult128(XXH128_hash_t r1, XXH128_hash_t r2)
+{   /* Compare two 128-bit hashes field by field; on mismatch, print both values and exit(1). */
+    static int nbTests = 1;   /* 1-based index of the current comparison, reported in the error message */
+    if ((r1.ll1 != r2.ll1) || (r1.ll2 != r2.ll2)) {
+        DISPLAY("\rERROR : Test%3i : 128-bit values non equals   !!!!!   \n", nbTests);
+        DISPLAY("\r { 0x%08X%08XULL, 0x%08X%08XULL } != { 0x%08X%08XULL, 0x%08X%08XULL } \n",   /* each 64-bit field is printed as two 32-bit halves */
+                (U32)(r1.ll1>>32), (U32)r1.ll1, (U32)(r1.ll2>>32), (U32)r1.ll2,
+                (U32)(r2.ll1>>32), (U32)r2.ll1, (U32)(r2.ll2>>32), (U32)r2.ll2 );
+        exit(1);
+    }
+    nbTests++;
+}
+
 
 static void BMK_testSequence64(const void* sentence, size_t len, U64 seed, U64 Nresult)
 {
@@ -494,6 +507,28 @@ static void BMK_testXXH3(const void* data, size_t len, U64 seed, U64 Nresult)
     }
 }
 
+static void BMK_testXXH128(const void* data, size_t len, U64 seed, XXH128_hash_t Nresult)
+{   /* Nresult : expected 128-bit hash of `data` (len bytes) with `seed`; verified through BMK_checkResult128() / BMK_checkResult64() */
+    {   XXH128_hash_t const Dresult = XXH3_128bits_withSeed(data, len, seed);
+        BMK_checkResult128(Dresult, Nresult);
+
+        /* XXH128() must produce the same expected value as XXH3_128bits_withSeed() */
+        {   XXH128_hash_t const Dresult2 = XXH128(data, len, seed);
+            BMK_checkResult128(Dresult2, Nresult);
+        }
+
+        /* the first 64-bit field (ll1) must equal the XXH3_64bits_withSeed() result */
+        {   U64 const result64 = XXH3_64bits_withSeed(data, len, seed);
+            BMK_checkResult64(result64, Nresult.ll1);
+    }   }
+
+    /* when seed==0, the seedless XXH3_128bits() must produce the same expected value */
+    if (seed == 0) {
+        XXH128_hash_t const Dresult = XXH3_128bits(data, len);
+        BMK_checkResult128(Dresult, Nresult);
+    }
+}
+
 static void BMK_testSequence(const void* sequence, size_t len, U32 seed, U32 Nresult)
 {
     XXH32_state_t state;
@@ -574,6 +609,86 @@ static void BMK_sanityCheck(void)
     BMK_testXXH3(sanityBuffer,2243, 0,     0xE7C1890BDBD2B245ULL);  /* 3 blocks, last stripe is overlapping */
     BMK_testXXH3(sanityBuffer,2243, prime, 0x3A68386AED0C50A7ULL);  /* 3 blocks, last stripe is overlapping */
 
+    {   XXH128_hash_t const expected = { 0, 0 };
+        BMK_testXXH128(NULL,           0, 0,     expected);         /* zero-length hash is { seed, -seed } by default */
+    }
+    {   XXH128_hash_t const expected = { prime, -(U64)prime };
+        BMK_testXXH128(NULL,           0, prime, expected);
+    }
+    {   XXH128_hash_t const expected = { 0xE2C6D3B40D6F9203ULL, 0x82895983D246CA74ULL };
+        BMK_testXXH128(sanityBuffer,   1, 0,     expected);         /* 1-3 */
+    }
+    {   XXH128_hash_t const expected = { 0xCEE5DF124E6135DCULL, 0xFA2DA0269396F32DULL };
+        BMK_testXXH128(sanityBuffer,   1, prime, expected);         /* 1-3 */
+    }
+    {   XXH128_hash_t const expected = { 0x585D6F8D1AAD96A2ULL, 0x2791F3B193F0AB86ULL };
+        BMK_testXXH128(sanityBuffer,   6, 0,     expected);         /* 4-8 */
+    }
+    {   XXH128_hash_t const expected = { 0x133EC8CA1739250FULL, 0xDF3F422D70BDE07FULL };
+        BMK_testXXH128(sanityBuffer,   6, prime, expected);         /* 4-8 */
+    }
+    {   XXH128_hash_t const expected = { 0x0E85E122FE5356ACULL, 0xD933CC7EDF4D95DAULL };
+        BMK_testXXH128(sanityBuffer,  12, 0,     expected);         /* 9-16 */
+    }
+    {   XXH128_hash_t const expected = { 0xE0DB5E70DA67EB16ULL, 0x114C8C76E74C669FULL };
+        BMK_testXXH128(sanityBuffer,  12, prime, expected);         /* 9-16 */
+    }
+    {   XXH128_hash_t const expected = { 0x6C213B15B89230C9ULL, 0x3F3AACF5F277AC02ULL };
+        BMK_testXXH128(sanityBuffer,  24, 0,     expected);         /* 17-32 */
+    }
+    {   XXH128_hash_t const expected = { 0x71892DB847A8F53CULL, 0xD11561AC7D0F5ECDULL };
+        BMK_testXXH128(sanityBuffer,  24, prime, expected);         /* 17-32 */
+    }
+    {   XXH128_hash_t const expected = { 0xECED834E8E99DA1EULL, 0x0F85E76A60898313ULL };
+        BMK_testXXH128(sanityBuffer,  48, 0,     expected);         /* 33-64 */
+    }
+    {   XXH128_hash_t const expected = { 0xA901250B336F9133ULL, 0xA35D3FB395E1DDE0ULL };
+        BMK_testXXH128(sanityBuffer,  48, prime, expected);         /* 33-64 */
+    }
+    {   XXH128_hash_t const expected = { 0x338B2F6E103D5B4EULL, 0x5DD1777C8FA671ABULL };
+        BMK_testXXH128(sanityBuffer,  81, 0,     expected);         /* 65-96 */
+    }
+    {   XXH128_hash_t const expected = { 0x0718382B6D4264C3ULL, 0x1D542DAFEFA1790EULL };
+        BMK_testXXH128(sanityBuffer,  81, prime, expected);         /* 65-96 */
+    }
+    {   XXH128_hash_t const expected = { 0x7DE871A4FE41C90EULL, 0x786CB41C46C6B7B6ULL };
+        BMK_testXXH128(sanityBuffer, 103, 0,     expected);         /* 97-128 */
+    }
+    {   XXH128_hash_t const expected = { 0xAD8B0B428C940A2CULL, 0xF8BA6D8B8CB05EB7ULL };
+        BMK_testXXH128(sanityBuffer, 103, prime, expected);         /* 97-128 */
+    }
+    {   XXH128_hash_t const expected = { 0x6D96AC3F415CFCFEULL, 0x947EDFA54DD68990ULL };
+        BMK_testXXH128(sanityBuffer, 192, 0,     expected);         /* one block, ends at stripe boundary */
+    }
+    {   XXH128_hash_t const expected = { 0xE4BD30AA1673B966ULL, 0x8132EF45FF3D51F2ULL };
+        BMK_testXXH128(sanityBuffer, 192, prime, expected);         /* one block, ends at stripe boundary */
+    }
+    {   XXH128_hash_t const expected = { 0xB62929C362EF3BF5ULL, 0x1946A7A9E6DD3032ULL };
+        BMK_testXXH128(sanityBuffer, 222, 0,     expected);         /* one block, last stripe is overlapping */
+    }
+    {   XXH128_hash_t const expected = { 0x2782C3C49E3FD25EULL, 0x98CE16C40C2D59F6ULL };
+        BMK_testXXH128(sanityBuffer, 222, prime, expected);         /* one block, last stripe is overlapping */
+    }
+    {   XXH128_hash_t const expected = { 0x802EB54C97564FD7ULL, 0x384AADF242348D00ULL };
+        BMK_testXXH128(sanityBuffer,2048, 0,     expected);         /* two blocks, finishing at block boundary */
+    }
+    {   XXH128_hash_t const expected = { 0xC9F188CFAFDA22CDULL, 0x7936B69445BE9EEDULL };
+        BMK_testXXH128(sanityBuffer,2048, prime, expected);         /* two blocks, finishing at block boundary */
+    }
+    {   XXH128_hash_t const expected = { 0x16B0035F6ABC1F46ULL, 0x1F6602850A1AA7EEULL };
+        BMK_testXXH128(sanityBuffer,2240, 0,     expected);         /* two blocks, ends at stripe boundary */
+    }
+    {   XXH128_hash_t const expected = { 0x389E68C2348B9161ULL, 0xA7D1E8C96586A052ULL };
+        BMK_testXXH128(sanityBuffer,2240, prime, expected);         /* two blocks, ends at stripe boundary */
+    }
+    {   XXH128_hash_t const expected = { 0x8B1DE79158C397D3ULL, 0x9B6B2EEFAC2DE0ADULL };
+        BMK_testXXH128(sanityBuffer,2237, 0,     expected);         /* two blocks, last stripe is overlapping */
+    }
+    {   XXH128_hash_t const expected = { 0x9DDF09ABA2B93DD6ULL, 0xB9CEDBE2582CA371ULL };
+        BMK_testXXH128(sanityBuffer,2237, prime, expected);         /* two blocks, last stripe is overlapping */
+    }
+
+
     DISPLAYLEVEL(3, "\r%70s\r", "");       /* Clean display line */
     DISPLAYLEVEL(3, "Sanity check -- all tests ok\n");
 }

From 70f9d859594f8ead8105e2264182cb47e691dc03 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 13 Mar 2019 15:08:04 -0700
Subject: [PATCH 67/73] minor doc edits

---
 xxhash.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/xxhash.h b/xxhash.h
index 1dc86536..cab61fdf 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -390,6 +390,7 @@ struct XXH64_state_s {
  *                          Although it's also more difficult to use, since it requires to declare and pass a structure instead of a value.
  *                          It would either replace current choice, or add a new one.
  *                          Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`).
+ *                          If both 64-bit and 128-bit seeds are possible, which variant should be called XXH128 ?
  *
  * - Result for len==0 : Currently, the result of hashing a zero-length input is the seed.
  *                          This mimics the behavior of a crc : in which case, a seed is effectively an accumulator, so it's not updated if input is empty.
@@ -411,12 +412,13 @@ typedef struct {
 #  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
 #endif
 
-/* note : variant without seed produces same result as variant with seed == 0 */
 XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, unsigned long long seed);
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, unsigned long long seed);
+
+/* note : variants without seed produce same result as variant with seed == 0 */
+XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits(const void* data, size_t len);
+XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_withSeed(const void* data, size_t len, unsigned long long seed);
 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, unsigned long long seed);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, unsigned long long seed);  /* == XXH128() */
 
 
 #endif  /* XXH_NO_LONG_LONG */

From 2b8b68cee527f53e5bb668238bd8dc7cd484b59b Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 13 Mar 2019 15:15:37 -0700
Subject: [PATCH 68/73] disable ARM 32-bit + NEON tests

does not work (yet) on Travis CI
---
 .travis.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 25f89735..fab28664 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -37,7 +37,10 @@ matrix:
         # arm (32-bit)
         - CC=arm-linux-gnueabi-gcc CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static RUN_ENV=qemu-arm-static make check   # Scalar code path
         - make clean
-        - CC=arm-linux-gnueabi-gcc CPPFLAGS=-DXXH_VECTOR=3 CFLAGS="-O3 -march=armv7-a -mfloat-abi=hard -mfpu=neon" LDFLAGS=-static RUN_ENV=qemu-arm-static make check   # NEON code path
+        # Note : the following test (ARM 32-bit + NEON) is disabled for the time being.
+        # I haven't yet found a way to make it link on Travis CI using gcc cross-compilation.
+        # NEON code path is fortunately validated through `aarch64` below.
+        # - CC=arm-linux-gnueabi-gcc CPPFLAGS=-DXXH_VECTOR=3 CFLAGS="-O3 -march=armv7-a -mfloat-abi=hard -mfpu=neon" LDFLAGS=-static RUN_ENV=qemu-arm-static make check   # NEON code path
         - make clean
         # aarch64
         - CC=aarch64-linux-gnu-gcc CPPFLAGS=-DXXH_VECTOR=0 LDFLAGS=-static RUN_ENV=qemu-aarch64-static make check   # Scalar code path

From f622c806ef89352c3c91170253daa3164550e4da Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Wed, 13 Mar 2019 15:55:24 -0700
Subject: [PATCH 69/73] xxhsum: fixed benchmark on low resolution timers

triggered an assert when time measured == 0
---
 xxhsum.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/xxhsum.c b/xxhsum.c
index 1b661cd5..12b89fb2 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -284,20 +284,29 @@ static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer,
         while (clock() == cStart);   /* starts clock() at its exact beginning */
         cStart = clock();
 
-        {   U32 i;
-            for (i=0; i<nbh_perIteration; i++)
-                r += h(buffer, bufferSize, i);
+        {   U32 u;
+            for (u=0; u<nbh_perIteration; u++)
+                r += h(buffer, bufferSize, u);
         }
-        if (r==0) DISPLAYLEVEL(3,".\r");  /* do something with r to avoid compiler "optimizing" away hash function */
-        {   double const timeS = ((double)BMK_clockSpan(cStart) / CLOCKS_PER_SEC) / nbh_perIteration;
+        if (r==0) DISPLAYLEVEL(3,".\r");  /* do something with r to defeat compiler "optimizing" away hash */
+
+        {   clock_t const nbTicks = BMK_clockSpan(cStart);
+            double const timeS = ((double)nbTicks / CLOCKS_PER_SEC) / nbh_perIteration;
+            if (nbTicks == 0) { /* faster than resolution timer */
+                nbh_perIteration *= 100;
+                iterationNb--;   /* try again */
+                continue;
+            }
             if (timeS < fastestH) fastestH = timeS;
             DISPLAYLEVEL(2, "%1u-%-17.17s : %10u -> %8.0f it/s (%7.1f MB/s) \r",
                     iterationNb, hName, (U32)bufferSize,
                     (double)1 / fastestH,
                     ((double)bufferSize / (1<<20)) / fastestH );
         }
-        assert(fastestH > 1./2000000000);  /* avoid U32 overflow */
-        nbh_perIteration = (U32)(1 / fastestH) + 1;  /* adjust nbh_perIteration to last roughtly one second */
+        {   double nbh_perSecond = (1 / fastestH) + 1;
+            if (nbh_perSecond > (double)(4000U<<20)) nbh_perSecond = (double)(4000U<<20);
+            nbh_perIteration = (U32)nbh_perSecond;
+        }
     }
     DISPLAYLEVEL(1, "%-19.19s : %10u -> %8.0f it/s (%7.1f MB/s) \n", hName, (U32)bufferSize,
         (double)1 / fastestH,

From 40dbf78fa950069ff02089fb9f9961ad01a1a46c Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Thu, 14 Mar 2019 13:08:38 -0700
Subject: [PATCH 70/73] renamed XXH128_hash_t members to low64 and high64

---
 xxh3.h   |  6 +++---
 xxhash.h | 20 +++++++++++---------
 xxhsum.c |  8 ++++----
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 2dc61a35..9197b680 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -748,9 +748,9 @@ XXH3_hashLong_128b(const void* data, size_t len, XXH64_hash_t seed)
 
     /* converge into final hash */
     assert(sizeof(acc) == 64);
-    {   U64 const part1 = XXH3_mergeAccs(acc, kKey, (U64)len * PRIME64_1);
-        U64 const part2 = XXH3_mergeAccs(acc, kKey+16, ((U64)len+1) * PRIME64_2);
-        return (XXH128_hash_t) { part1, part2 };
+    {   U64 const low64 = XXH3_mergeAccs(acc, kKey, (U64)len * PRIME64_1);
+        U64 const high64 = XXH3_mergeAccs(acc, kKey+16, ((U64)len+1) * PRIME64_2);
+        return (XXH128_hash_t) { low64, high64 };
     }
 }
 
diff --git a/xxhash.h b/xxhash.h
index cab61fdf..7f3d6603 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -344,9 +344,10 @@ struct XXH64_state_s {
  * It benefits greatly from vectorization units, but does not require it.
  *
  * XXH3 offers 2 variants, _64bits and _128bits.
- * The first 64-bits field of the _128bits variant is the same as _64bits result.
- * However, if only 64-bits are needed, prefer calling the _64bits variant.
- * It reduces the amount of mixing, resulting in faster speed on small inputs.
+ * When only 64 bits are needed, prefer calling the _64bits variant :
+ * it reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar type than a struct.
+ * Note : the low 64-bit field of the _128bits variant is the same as _64bits result.
  *
  * The XXH3 algorithm is still considered experimental.
  * It's possible to use it for ephemeral data, but avoid storing long-term values for later re-use.
@@ -380,7 +381,7 @@ struct XXH64_state_s {
  *                          Would it be beneficial to declare and define a comparator function for XXH128_hash_t ?
  *                          Are there other operations on XXH128_hash_t which would be desirable ?
  *
- * - Variant compatibility : The first 64-bit field of the _128bits variant is the same as the result of _64bits.
+ * - Variant compatibility : The low 64-bit field of the _128bits variant is the same as the result of _64bits.
  *                          This is not a compulsory behavior. It just felt that it "wouldn't hurt", and might even help in some (unidentified) cases.
  *                          But it might influence the design of XXH128_hash_t, in ways which may block other possibilities.
  *                          Good idea, bad idea ?
@@ -399,11 +400,6 @@ struct XXH64_state_s {
  *                          Are there use case which would depend on this behavior, or would prefer a mixing of the seed ?
  */
 
-typedef struct {
-    XXH64_hash_t ll1;
-    XXH64_hash_t ll2;
-} XXH128_hash_t;
-
 #ifdef XXH_NAMESPACE
 #  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
 #  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
@@ -412,6 +408,12 @@ typedef struct {
 #  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
 #endif
 
+
+typedef struct {
+    XXH64_hash_t low64;
+    XXH64_hash_t high64;
+} XXH128_hash_t;
+
 XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, unsigned long long seed);
 
 /* note : variants without seed produce same result as variant with seed == 0 */
diff --git a/xxhsum.c b/xxhsum.c
index 12b89fb2..07b5ff29 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -471,11 +471,11 @@ static void BMK_checkResult64(XXH64_hash_t r1, XXH64_hash_t r2)
 static void BMK_checkResult128(XXH128_hash_t r1, XXH128_hash_t r2)
 {
     static int nbTests = 1;
-    if ((r1.ll1 != r2.ll1) || (r1.ll2 != r2.ll2)) {
+    if ((r1.low64 != r2.low64) || (r1.high64 != r2.high64)) {
         DISPLAY("\rERROR : Test%3i : 128-bit values non equals   !!!!!   \n", nbTests);
         DISPLAY("\r { 0x%08X%08XULL, 0x%08X%08XULL } != { 0x%08X%08XULL, %08X%08XULL } \n",
-                (U32)(r1.ll1>>32), (U32)r1.ll1, (U32)(r1.ll2>>32), (U32)r1.ll2,
-                (U32)(r2.ll1>>32), (U32)r2.ll1, (U32)(r2.ll2>>32), (U32)r2.ll2 );
+                (U32)(r1.low64>>32), (U32)r1.low64, (U32)(r1.high64>>32), (U32)r1.high64,
+                (U32)(r2.low64>>32), (U32)r2.low64, (U32)(r2.high64>>32), (U32)r2.high64 );
         exit(1);
     }
     nbTests++;
@@ -528,7 +528,7 @@ static void BMK_testXXH128(const void* data, size_t len, U64 seed, XXH128_hash_t
 
         /* check that first field is equal to _64bits variant */
         {   U64 const result64 = XXH3_64bits_withSeed(data, len, seed);
-            BMK_checkResult64(result64, Nresult.ll1);
+            BMK_checkResult64(result64, Nresult.low64);
     }   }
 
     /* check that the no-seed variant produces same result as seed==0 */

From cf5694603db5df13450918c9904108bfb7826aed Mon Sep 17 00:00:00 2001
From: "easyaspi314 (Devin)" <easyaspi314@users.noreply.github.com>
Date: Fri, 15 Mar 2019 11:56:58 -0400
Subject: [PATCH 71/73] Improve xxhsum output message quality

- xxhsum now prints more professional-looking error messages:
Before:
    Pb opening foo
After:
    Error: Could not open 'foo': No such file or directory.

- xxhsum will now attempt to display the architecture and the compiler
  version in the benchmark WELCOME_MESSAGE.
  It detects the following compilers:
    - Clang
    - GCC
    - Intel Compiler
    - MSVC
    - tcc
  and it should detect the following architectures:
    - x86 (+SSE2/AVX/AVX2)
    - x86_64 (+SSE2/AVX/AVX2)
    - ARM (+NEON)
    - aarch64
    - PowerPC 64
    - PowerPC
    - AVR
    - MIPS 64
    - MIPS
Before:
    ./xxhsum 0.7.0 (64-bits little endian), by Yann Collet
After:
    ./xxhsum 0.7.0 (64-bits x86_64 + SSE2 little endian), GCC 8.3.0, by Yann Collet

- Sanity checks are consistent now and give better warning messages:
Before:
    ERROR : Test  1 : 0x12345678 <> 0x02CC5D05   !!!!!

    ERROR : Test  1 : 64-bit values non equals   !!!!!
     0x1234567890ABCDEFULL != 0xEF46DB3751D8E999ULL
After:
    Error: 32-bit hash test 1: Internal sanity check failed!
    Got 0x12345678, expected 0x02CC5D05.
    Note: If you modified the hash functions, make sure to either update the values
    or temporarily comment out the tests in BMK_sanityCheck.

...and the 64-bit and 128-bit messages now match. I eventually want to name the
tests instead of just using the test number, but this is still better than before.

- xxhsum now displays "stdin" instead of "-" when reading from stdin.
---
 xxhsum.c | 151 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 118 insertions(+), 33 deletions(-)

diff --git a/xxhsum.c b/xxhsum.c
index 07b5ff29..0ec11c01 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -44,7 +44,6 @@
 #  define _LARGEFILE64_SOURCE
 #endif
 
-
 /* ************************************
  *  Includes
  **************************************/
@@ -55,6 +54,7 @@
 #include <sys/stat.h>   /* stat, stat64, _stat64 */
 #include <time.h>       /* clock_t, clock, CLOCKS_PER_SEC */
 #include <assert.h>     /* assert */
+#include <errno.h>      /* errno */
 
 #define XXH_STATIC_LINKING_ONLY   /* *_state_t */
 #include "xxhash.h"
@@ -164,13 +164,86 @@ static unsigned BMK_isLittleEndian(void)
 #define QUOTE(str) #str
 #define EXPAND_AND_QUOTE(str) QUOTE(str)
 #define PROGRAM_VERSION EXPAND_AND_QUOTE(LIB_VERSION)
+
+/* Show compiler versions in WELCOME_MESSAGE. VERSION_FMT will return the printf specifiers,
+ * and VERSION will contain the comma separated list of arguments to the VERSION_FMT string. */
+#if defined(__clang_version__)
+/* Clang does its own thing. */
+#  ifdef __apple_build_version__
+#    define VERSION_FMT ", Apple Clang %s"
+#  else
+#    define VERSION_FMT ", Clang %s"
+#  endif
+#  define VERSION  __clang_version__
+#elif defined(__VERSION__)
+/* GCC and ICC */
+#  define VERSION_FMT ", %s"
+#  ifdef __INTEL_COMPILER /* icc adds its prefix; must define VERSION, which WELCOME_MESSAGE expands */
+#    define VERSION __VERSION__
+#  else /* assume GCC */
+#    define VERSION "GCC " __VERSION__
+#  endif
+#elif defined(_MSC_FULL_VER) && defined(_MSC_BUILD)
+/* "For example, if the version number of the Visual C++ compiler is 15.00.20706.01, the _MSC_FULL_VER macro
+ * evaluates to 150020706." https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=vs-2017 */
+#  define VERSION  _MSC_FULL_VER / 10000000 % 100, _MSC_FULL_VER / 100000 % 100, _MSC_FULL_VER % 100000, _MSC_BUILD
+#  define VERSION_FMT ", MSVC %02i.%02i.%05i.%02i"
+#elif defined(__TINYC__)
+/* tcc stores its version in the __TINYC__ macro. */
+#  define VERSION_FMT ", tcc %i.%i.%i"
+#  define VERSION __TINYC__ / 10000 % 100, __TINYC__ / 100 % 100, __TINYC__ % 100
+#else
+#  define VERSION_FMT "%s"
+#  define VERSION ""
+#endif
+
+/* Name the x86 flavor once so the ARCH selection below can append the SIMD level (MSVC spells 32-bit x86 _M_IX86). */
+#if defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
+#   define ARCH_X86 "x86_64"
+#elif defined(__i386__) || defined(_M_IX86) || defined(_M_IX86_FP)
+#   define ARCH_X86 "i386"
+#endif
+
+/* Try to detect the architecture. */
+#if defined(ARCH_X86)
+#  if defined(__AVX2__)
+#    define ARCH ARCH_X86 " + AVX2"
+#  elif defined(__AVX__)
+#    define ARCH ARCH_X86 " + AVX"
+#  elif defined(__SSE2__)
+#     define ARCH ARCH_X86 " + SSE2"
+#  else
+#      define ARCH ARCH_X86
+#  endif
+#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
+#  define ARCH "aarch64"
+#elif defined(__arm__) || defined(__thumb__) || defined(__thumb2__) || defined(_M_ARM)
+#  if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#    define ARCH "arm + NEON"
+#  else
+#    define ARCH "arm"
+#  endif
+#elif defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__)
+#  define ARCH "ppc64"
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
+#  define ARCH "ppc"
+#elif defined(__AVR)
+#  define ARCH "AVR"
+#elif defined(__mips64)
+#  define ARCH "mips64"
+#elif defined(__mips)
+#  define ARCH "mips"
+#else
+#  define ARCH "unknown"
+#endif
+
 static const int g_nbBits = (int)(sizeof(void*)*8);
 static const char g_lename[] = "little endian";
 static const char g_bename[] = "big endian";
 #define ENDIAN_NAME (BMK_isLittleEndian() ? g_lename : g_bename)
 static const char author[] = "Yann Collet";
-#define WELCOME_MESSAGE(exename) "%s %s (%i-bits %s), by %s \n", \
-                    exename, PROGRAM_VERSION, g_nbBits, ENDIAN_NAME, author
+#define WELCOME_MESSAGE(exename) "%s %s (%i-bits %s %s)" VERSION_FMT ", by %s \n", \
+                    exename, PROGRAM_VERSION, g_nbBits, ARCH, ENDIAN_NAME, VERSION, author
 
 #define KB *( 1<<10)
 #define MB *( 1<<20)
@@ -350,7 +423,7 @@ static int BMK_benchMem(const void* buffer, size_t bufferSize, U32 specificTest)
         BMK_benchHash(localXXH3_64b, "XXH3_64b unaligned", ((const char*)buffer)+3, bufferSize);
 
     if (specificTest > 6) {
-        DISPLAY("benchmark mode invalid \n");
+        DISPLAY("Benchmark mode invalid.\n");
         return 1;
     }
     return 0;
@@ -384,12 +457,12 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific
 
             /* Checks */
             if (inFile==NULL){
-                DISPLAY("Pb opening %s\n", inFileName);
+                DISPLAY("Error: Could not open '%s': %s.\n", inFileName, strerror(errno));
                 free(buffer);
                 return 11;
             }
             if(!buffer) {
-                DISPLAY("\nError: not enough memory!\n");
+                DISPLAY("\nError: Out of memory.\n");
                 fclose(inFile);
                 return 12;
             }
@@ -399,7 +472,7 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific
             {   size_t const readSize = fread(alignedBuffer, 1, benchedSize, inFile);
                 fclose(inFile);
                 if(readSize != benchedSize) {
-                    DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
+                    DISPLAY("\nError: Could not read '%s': %s.\n", inFileName, strerror(errno));
                     free(buffer);
                     return 13;
             }   }
@@ -419,7 +492,7 @@ static int BMK_benchInternal(size_t keySize, U32 specificTest)
 {
     void* const buffer = calloc(keySize+16+3, 1);
     if (!buffer) {
-        DISPLAY("\nError: not enough memory!\n");
+        DISPLAY("\nError: Out of memory.\n");
         return 12;
     }
 
@@ -451,7 +524,10 @@ static void BMK_checkResult32(XXH32_hash_t r1, XXH32_hash_t r2)
 {
     static int nbTests = 1;
     if (r1!=r2) {
-        DISPLAY("\rERROR : Test%3i : 0x%08X <> 0x%08X   !!!!!   \n", nbTests, r1, r2);
+        DISPLAY("\rError: 32-bit hash test %i: Internal sanity check failed!\n", nbTests);
+        DISPLAY("\rGot 0x%08X, expected 0x%08X.\n", r1, r2);
+        DISPLAY("\rNote: If you modified the hash functions, make sure to either update the values\n"
+                  "or temporarily comment out the tests in BMK_sanityCheck.\n");
         exit(1);
     }
     nbTests++;
@@ -461,8 +537,10 @@ static void BMK_checkResult64(XXH64_hash_t r1, XXH64_hash_t r2)
 {
     static int nbTests = 1;
     if (r1!=r2) {
-        DISPLAY("\rERROR : Test%3i : 64-bit values non equals   !!!!!   \n", nbTests);
-        DISPLAY("\r 0x%08X%08XULL != 0x%08X%08XULL \n", (U32)(r1>>32), (U32)r1, (U32)(r2>>32), (U32)r2);
+        DISPLAY("\rError: 64-bit hash test %i: Internal sanity check failed!\n", nbTests);
+        DISPLAY("\rGot 0x%08X%08XULL, expected 0x%08X%08XULL.\n", (U32)(r1>>32), (U32)r1, (U32)(r2>>32), (U32)r2);
+        DISPLAY("\rNote: If you modified the hash functions, make sure to either update the values\n"
+                  "or temporarily comment out the tests in BMK_sanityCheck.\n");
         exit(1);
     }
     nbTests++;
@@ -472,10 +550,12 @@ static void BMK_checkResult128(XXH128_hash_t r1, XXH128_hash_t r2)
 {
     static int nbTests = 1;
     if ((r1.low64 != r2.low64) || (r1.high64 != r2.high64)) {
-        DISPLAY("\rERROR : Test%3i : 128-bit values non equals   !!!!!   \n", nbTests);
-        DISPLAY("\r { 0x%08X%08XULL, 0x%08X%08XULL } != { 0x%08X%08XULL, %08X%08XULL } \n",
+        DISPLAY("\rError: 128-bit hash test %i: Internal sanity check failed!\n", nbTests);
+        DISPLAY("\rGot { 0x%08X%08XULL, 0x%08X%08XULL }, expected { 0x%08X%08XULL, 0x%08X%08XULL } \n",
                 (U32)(r1.low64>>32), (U32)r1.low64, (U32)(r1.high64>>32), (U32)r1.high64,
                 (U32)(r2.low64>>32), (U32)r2.low64, (U32)(r2.high64>>32), (U32)r2.high64 );
+        DISPLAY("\rNote: If you modified the hash functions, make sure to either update the values\n"
+                  "or temporarily comment out the tests in BMK_sanityCheck.\n");
         exit(1);
     }
     nbTests++;
@@ -783,19 +863,20 @@ static int BMK_hash(const char* fileName,
     /* Check file existence */
     if (fileName == stdinName) {
         inFile = stdin;
+        fileName = "stdin";
         SET_BINARY_MODE(stdin);
     }
     else
         inFile = fopen( fileName, "rb" );
     if (inFile==NULL) {
-        DISPLAY( "Pb opening %s\n", fileName);
+        DISPLAY("Error: Could not open '%s': %s.\n", fileName, strerror(errno));
         return 1;
     }
 
     /* Memory allocation & restrictions */
     buffer = malloc(blockSize);
     if(!buffer) {
-        DISPLAY("\nError: not enough memory!\n");
+        DISPLAY("\nError: Out of memory.\n");
         fclose(inFile);
         return 1;
     }
@@ -1104,7 +1185,7 @@ static void parseFile1(ParseFileArg* parseFileArg)
         if (lineNumber == 0) {
             /* This is unlikely happen, but md5sum.c has this
              * error check. */
-            DISPLAY("%s : too many checksum lines\n", inFileName);
+            DISPLAY("%s: Error: Too many checksum lines\n", inFileName);
             report->quit = 1;
             break;
         }
@@ -1123,15 +1204,15 @@ static void parseFile1(ParseFileArg* parseFileArg)
                 break;
 
             default:
-                DISPLAY("%s : %lu: unknown error\n", inFileName, lineNumber);
+                DISPLAY("%s:%lu: Error: Unknown error.\n", inFileName, lineNumber);
                 break;
 
             case GetLine_exceedMaxLineLength:
-                DISPLAY("%s : %lu: too long line\n", inFileName, lineNumber);
+                DISPLAY("%s:%lu: Error: Line too long.\n", inFileName, lineNumber);
                 break;
 
             case GetLine_outOfMemory:
-                DISPLAY("%s : %lu: out of memory\n", inFileName, lineNumber);
+                DISPLAY("%s:%lu: Error: Out of memory.\n", inFileName, lineNumber);
                 break;
             }
             report->quit = 1;
@@ -1141,7 +1222,7 @@ static void parseFile1(ParseFileArg* parseFileArg)
         if (parseLine(&parsedLine, parseFileArg->lineBuf) != ParseLine_ok) {
             report->nImproperlyFormattedLines++;
             if (parseFileArg->warn) {
-                DISPLAY("%s : %lu: improperly formatted XXHASH checksum line\n"
+                DISPLAY("%s:%lu: Error: Improperly formatted checksum line.\n"
                     , inFileName, lineNumber);
             }
             continue;
@@ -1152,7 +1233,7 @@ static void parseFile1(ParseFileArg* parseFileArg)
             report->nImproperlyFormattedLines++;
             report->nMixedFormatLines++;
             if (parseFileArg->warn) {
-                DISPLAY("%s : %lu: improperly formatted XXHASH checksum line (XXH32/64)\n"
+                DISPLAY("%s:%lu: Error: Multiple hash types in one file.\n"
                     , inFileName, lineNumber);
             }
             continue;
@@ -1195,15 +1276,15 @@ static void parseFile1(ParseFileArg* parseFileArg)
         switch (lineStatus)
         {
         default:
-            DISPLAY("%s : unknown error\n", inFileName);
+            DISPLAY("%s: Error: Unknown error.\n", inFileName);
             report->quit = 1;
             break;
 
         case LineStatus_failedToOpen:
             report->nOpenOrReadFailures++;
             if (!parseFileArg->statusOnly) {
-                DISPLAYRESULT("%s : %lu: FAILED open or read %s\n"
-                    , inFileName, lineNumber, parsedLine.filename);
+                DISPLAYRESULT("%s:%lu: Could not open or read '%s': %s.\n",
+                    inFileName, lineNumber, parsedLine.filename, strerror(errno));
             }
             break;
 
@@ -1266,13 +1347,14 @@ static int checkFile(const char* inFileName,
     if (inFileName == stdinName) {
         /* note : Since we expect text input for xxhash -c mode,
          * Don't set binary mode for stdin */
+        inFileName = "stdin";
         inFile = stdin;
     } else {
         inFile = fopen( inFileName, "rt" );
     }
 
     if (inFile == NULL) {
-        DISPLAY( "Pb opening %s\n", inFileName);
+        DISPLAY("Error: Could not open '%s': %s.\n", inFileName, strerror(errno));
         return 0;
     }
 
@@ -1297,19 +1379,22 @@ static int checkFile(const char* inFileName,
     /* Show error/warning messages.  All messages are copied from md5sum.c
      */
     if (report->nProperlyFormattedLines == 0) {
-        DISPLAY("%s: no properly formatted XXHASH checksum lines found\n", inFileName);
+        DISPLAY("%s: no properly formatted xxHash checksum lines found\n", inFileName);
     } else if (!statusOnly) {
         if (report->nImproperlyFormattedLines) {
-            DISPLAYRESULT("%lu lines are improperly formatted\n"
-                , report->nImproperlyFormattedLines);
+            DISPLAYRESULT("%lu %s improperly formatted\n"
+                , report->nImproperlyFormattedLines
+                , report->nImproperlyFormattedLines == 1 ? "line is" : "lines are");
         }
         if (report->nOpenOrReadFailures) {
-            DISPLAYRESULT("%lu listed files could not be read\n"
-                , report->nOpenOrReadFailures);
+            DISPLAYRESULT("%lu listed %s could not be read\n"
+                , report->nOpenOrReadFailures
+                , report->nOpenOrReadFailures == 1 ? "file" : "files");
         }
         if (report->nMismatchedChecksums) {
-            DISPLAYRESULT("%lu computed checksums did NOT match\n"
-                , report->nMismatchedChecksums);
+            DISPLAYRESULT("%lu computed %s did NOT match\n"
+                , report->nMismatchedChecksums
+                , report->nMismatchedChecksums == 1 ? "checksum" : "checksums");
     }   }
 
     /* Result (exit) code logic is copied from
@@ -1432,7 +1517,7 @@ static int readU32FromCharChecked(const char** stringPtr, unsigned* value)
 static unsigned readU32FromChar(const char** stringPtr) {
     unsigned result;
     if (readU32FromCharChecked(stringPtr, &result)) {
-        static const char errorMsg[] = "error: numeric value too large";
+        static const char errorMsg[] = "Error: numeric value too large";
         errorOut(errorMsg);
     }
     return result;

From b31ca8c40feca1d1c3172c6af99c92edac87ded3 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Fri, 15 Mar 2019 09:10:54 -0700
Subject: [PATCH 72/73] removed XXH_FORCE_NATIVE_FORMAT

---
 README.md |  2 --
 xxhash.c  | 96 ++++++++++++++-----------------------------------------
 2 files changed, 24 insertions(+), 74 deletions(-)

diff --git a/README.md b/README.md
index 6a0ad191..5213b198 100644
--- a/README.md
+++ b/README.md
@@ -81,8 +81,6 @@ they modify xxhash behavior. They are all disabled by default.
 - `XXH_CPU_LITTLE_ENDIAN` : by default, endianess is determined at compile time.
                             It's possible to skip auto-detection and force format to little-endian, by setting this macro to 1.
                             Setting it to 0 forces big-endian.
-- `XXH_FORCE_NATIVE_FORMAT` : on big-endian systems : use native number representation.
-                              Breaks consistency with little-endian results.
 - `XXH_PRIVATE_API` : same impact as `XXH_INLINE_ALL`.
                       Name underlines that symbols will not be published on library public interface.
 - `XXH_NAMESPACE` : prefix all symbols with the value of `XXH_NAMESPACE`.
diff --git a/xxhash.c b/xxhash.c
index 82ee887b..2edd6f8a 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -71,18 +71,6 @@
 #  define XXH_ACCEPT_NULL_INPUT_POINTER 0
 #endif
 
-/*!XXH_FORCE_NATIVE_FORMAT :
- * By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
- * Results are therefore identical for little-endian and big-endian CPU.
- * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
- * Should endian-independence be of no importance for your application, you may set the #define below to 1,
- * to improve speed for Big-endian CPU.
- * This option has no impact on Little_Endian CPU.
- */
-#ifndef XXH_FORCE_NATIVE_FORMAT   /* can be defined externally */
-#  define XXH_FORCE_NATIVE_FORMAT 0
-#endif
-
 /*!XXH_FORCE_ALIGN_CHECK :
  * This is a minor performance trick, only useful with lots of very small keys.
  * It means : check for aligned/unaligned input.
@@ -245,12 +233,12 @@ static U32 XXH_readBE32(const void* ptr)
 }
 
 XXH_FORCE_INLINE U32
-XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+XXH_readLE32_align(const void* ptr, XXH_alignment align)
 {
     if (align==XXH_unaligned) {
         return XXH_readLE32(ptr);
     } else {
-        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+        return XXH_CPU_LITTLE_ENDIAN ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
     }
 }
 
@@ -334,11 +322,10 @@ static U32 XXH32_avalanche(U32 h32)
     return(h32);
 }
 
-#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+#define XXH_get32bits(p) XXH_readLE32_align(p, align)
 
 static U32
-XXH32_finalize(U32 h32, const void* ptr, size_t len,
-                XXH_endianess endian, XXH_alignment align)
+XXH32_finalize(U32 h32, const void* ptr, size_t len, XXH_alignment align)
 
 {
     const BYTE* p = (const BYTE*)ptr;
@@ -397,8 +384,7 @@ XXH32_finalize(U32 h32, const void* ptr, size_t len,
 }
 
 XXH_FORCE_INLINE U32
-XXH32_endian_align(const void* input, size_t len, U32 seed,
-                    XXH_endianess endian, XXH_alignment align)
+XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_alignment align)
 {
     const BYTE* p = (const BYTE*)input;
     const BYTE* bEnd = p + len;
@@ -433,7 +419,7 @@ XXH32_endian_align(const void* input, size_t len, U32 seed,
 
     h32 += (U32)len;
 
-    return XXH32_finalize(h32, p, len&15, endian, align);
+    return XXH32_finalize(h32, p, len&15, align);
 }
 
 
@@ -445,21 +431,15 @@ XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int s
     XXH32_reset(&state, seed);
     XXH32_update(&state, input, len);
     return XXH32_digest(&state);
+
 #else
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
 
     if (XXH_FORCE_ALIGN_CHECK) {
         if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
-            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-                return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
-            else
-                return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+            return XXH32_endian_align(input, len, seed, XXH_aligned);
     }   }
 
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
-    else
-        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+    return XXH32_endian_align(input, len, seed, XXH_unaligned);
 #endif
 }
 
@@ -560,8 +540,7 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len)
 }
 
 
-XXH_FORCE_INLINE U32
-XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
+XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state)
 {
     U32 h32;
 
@@ -576,18 +555,7 @@ XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
 
     h32 += state->total_len_32;
 
-    return XXH32_finalize(h32, state->mem32, state->memsize, endian, XXH_aligned);
-}
-
-
-XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in)
-{
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH32_digest_endian(state_in, XXH_littleEndian);
-    else
-        return XXH32_digest_endian(state_in, XXH_bigEndian);
+    return XXH32_finalize(h32, state->mem32, state->memsize, XXH_aligned);
 }
 
 
@@ -690,12 +658,12 @@ static U64 XXH_readBE64(const void* ptr)
 }
 
 XXH_FORCE_INLINE U64
-XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
 {
     if (align==XXH_unaligned)
         return XXH_readLE64(ptr);
     else
-        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
+        return XXH_CPU_LITTLE_ENDIAN ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
 }
 
 
@@ -734,11 +702,10 @@ static U64 XXH64_avalanche(U64 h64)
 }
 
 
-#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)
 
 static U64
-XXH64_finalize(U64 h64, const void* ptr, size_t len,
-               XXH_endianess endian, XXH_alignment align)
+XXH64_finalize(U64 h64, const void* ptr, size_t len, XXH_alignment align)
 {
     const BYTE* p = (const BYTE*)ptr;
 
@@ -846,8 +813,7 @@ XXH64_finalize(U64 h64, const void* ptr, size_t len,
 }
 
 XXH_FORCE_INLINE U64
-XXH64_endian_align(const void* input, size_t len, U64 seed,
-                XXH_endianess endian, XXH_alignment align)
+XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_alignment align)
 {
     const BYTE* p = (const BYTE*)input;
     const BYTE* bEnd = p + len;
@@ -886,7 +852,7 @@ XXH64_endian_align(const void* input, size_t len, U64 seed,
 
     h64 += (U64) len;
 
-    return XXH64_finalize(h64, p, len, endian, align);
+    return XXH64_finalize(h64, p, len, align);
 }
 
 
@@ -898,21 +864,16 @@ XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned
     XXH64_reset(&state, seed);
     XXH64_update(&state, input, len);
     return XXH64_digest(&state);
+
 #else
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
 
     if (XXH_FORCE_ALIGN_CHECK) {
         if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
-            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-                return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
-            else
-                return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+            return XXH64_endian_align(input, len, seed, XXH_aligned);
     }   }
 
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
-    else
-        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+    return XXH64_endian_align(input, len, seed, XXH_unaligned);
+
 #endif
 }
 
@@ -1006,7 +967,8 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len)
     return XXH_OK;
 }
 
-XXH_FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
+
+XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state)
 {
     U64 h64;
 
@@ -1027,17 +989,7 @@ XXH_FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endian
 
     h64 += (U64) state->total_len;
 
-    return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, endian, XXH_aligned);
-}
-
-XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in)
-{
-    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
-
-    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
-        return XXH64_digest_endian(state_in, XXH_littleEndian);
-    else
-        return XXH64_digest_endian(state_in, XXH_bigEndian);
+    return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, XXH_aligned);
 }
 
 

From 5674c6dcdd449c70ed8ed918f09ae4ef4b538460 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Fri, 15 Mar 2019 09:30:42 -0700
Subject: [PATCH 73/73] update README to present XXH3

---
 README.md | 42 +++++++++++++++++++++++++++++++++---------
 xxhash.c  |  4 ++--
 2 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 5213b198..323bc6f0 100644
--- a/README.md
+++ b/README.md
@@ -98,7 +98,7 @@ they modify xxhash behavior. They are all disabled by default.
 
 Calling xxhash 64-bit variant from a C program :
 
-```c
+```C
 #include "xxhash.h"
 
 unsigned long long calcul_hash(const void* buffer, size_t length)
@@ -110,42 +110,66 @@ unsigned long long calcul_hash(const void* buffer, size_t length)
 ```
 
 Using streaming variant is more involved, but makes it possible to provide data in multiple rounds :
-```c
+```C
 #include "stdlib.h"   /* abort() */
 #include "xxhash.h"
 
 
 unsigned long long calcul_hash_streaming(someCustomType handler)
 {
+    /* create a hash state */
     XXH64_state_t* const state = XXH64_createState();
     if (state==NULL) abort();
 
-    size_t const bufferSize = SOME_VALUE;
+    size_t const bufferSize = SOME_SIZE;
     void* const buffer = malloc(bufferSize);
     if (buffer==NULL) abort();
 
+    /* Initialize state with selected seed */
     unsigned long long const seed = 0;   /* or any other value */
     XXH_errorcode const resetResult = XXH64_reset(state, seed);
     if (resetResult == XXH_ERROR) abort();
 
+    /* Feed the state with input data, any size, any number of times */
     (...)
     while ( /* any condition */ ) {
-        size_t const length = get_more_data(buffer, bufferSize, handler);   /* undescribed */
-        XXH_errorcode const addResult = XXH64_update(state, buffer, length);
-        if (addResult == XXH_ERROR) abort();
+        size_t const length = get_more_data(buffer, bufferSize, handler);   
+        XXH_errorcode const updateResult = XXH64_update(state, buffer, length);
+        if (updateResult == XXH_ERROR) abort();
         (...)
     }
-
     (...)
-    unsigned long long const hash = XXH64_digest(state);
 
+    /* Get the hash */
+    XXH64_hash_t const hash = XXH64_digest(state);
+
+    /* State can then be re-used; in this example, it is simply freed  */
     free(buffer);
     XXH64_freeState(state);
 
-    return hash;
+    return (unsigned long long)hash;
 }
 ```
 
+### New experimental hash algorithm
+
+Starting with `v0.7.0`, the library includes a new algorithm, named `XXH3`,
+able to generate 64 and 128-bit hashes.
+
+The new algorithm is much faster than its predecessors,
+for both long and small inputs,
+as can be observed in the following graphs :
+
+![XXH3, bargraph](https://github.com/Cyan4973/xxHash/releases/download/graphs/H_bandwidth_bargraph.png)
+
+![XXH3, latency, random size](https://github.com/Cyan4973/xxHash/releases/download/graphs/H_latency_randomS.png)
+
+The algorithm is currently labelled experimental, as it may change in a future version.
+To access it, one needs to unlock its declaration using the macro `XXH_STATIC_LINKING_ONLY`.
+It can be used for ephemeral data, and for tests, but avoid storing long-term hash values yet.
+`XXH3` will be stabilized in a future version.
+This period will be used to collect users' feedback.
+
 
 ### Other programming languages
 
diff --git a/xxhash.c b/xxhash.c
index 2edd6f8a..0fd12ce3 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -1009,10 +1009,10 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
 
 
 
-/* *******************************************************************
+/* *********************************************************************
 *  XXH3
 *  New generation hash designed for speed on small keys and vectorization
-********************************************************************** */
+************************************************************************ */
 
 #include "xxh3.h"