[Library] Update zlibng (#1255)

* Update zlibng * Set cmake path more directly in zlibng to hopefully fix an issue with the build on drone * I'm dumb, missing / in path * Mackal helps with a dumb gitignore issue * Adding all the files, not sure what's ignoring them and im tired of looking * Some tweaks to zlibng build to hopefully get it to build properly. works on msvc now
2026-06-24 17:48:20 +00:00 · 2021-02-23 17:00:26 -08:00
parent e6dee96266
commit 2957f5084d
184 changed files with 22029 additions and 11703 deletions
@@ -1,3 +0,0 @@
-fill_window_sse.c	SSE2 optimized fill_window
-deflate_quick.c		SSE4 optimized deflate strategy for use as level 1
-crc_folding.c		SSE4 + PCLMULQDQ optimized CRC folding implementation
@@ -0,0 +1,8 @@
+Contents
+--------
+
+|Name|Description|
+|:-|:-|
+|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
+|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
+|slide_sse2.c|SSE2 optimized slide_hash|
@@ -8,7 +8,9 @@ SFLAGS=
 INCLUDES=
 SUFFIX=

+AVX2FLAG=-mavx2
 SSE2FLAG=-msse2
+SSSE3FLAG=-mssse3
 SSE4FLAG=-msse4
 PCLMULFLAG=-mpclmul

@@ -16,7 +18,18 @@ SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)

-all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo
+all: \
+	x86.o x86.lo \
+	adler32_avx.o adler32.lo \
+	adler32_ssse3.o adler32_ssse3.lo \
+	chunkset_avx.o chunkset_avx.lo \
+	chunkset_sse.o chunkset_sse.lo \
+	compare258_avx.o compare258_avx.lo \
+	compare258_sse.o compare258_sse.lo \
+	insert_string_sse.o insert_string_sse.lo \
+	crc_folding.o crc_folding.lo \
+	slide_avx.o slide_avx.lo \
+	slide_sse.o slide_sse.lo

 x86.o:
 	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
@@ -24,17 +37,29 @@ x86.o:
 x86.lo:
 	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c

-fill_window_sse.o:
-	$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
+chunkset_avx.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c

-fill_window_sse.lo:
-	$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
+chunkset_avx.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c

-deflate_quick.o:
-	$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
+chunkset_sse.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c

-deflate_quick.lo:
-	$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
+chunkset_sse.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
+
+compare258_avx.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
+
+compare258_avx.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
+
+compare258_sse.o:
+	$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
+
+compare258_sse.lo:
+	$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c

 insert_string_sse.o:
 	$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
@@ -48,6 +73,30 @@ crc_folding.o:
 crc_folding.lo:
 	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c

+slide_avx.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+
+slide_avx.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+
+slide_sse.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
+
+slide_sse.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
+
+adler32_avx.o: $(SRCDIR)/adler32_avx.c
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
+
+adler32_avx.lo: $(SRCDIR)/adler32_avx.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
+
+adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
 mostlyclean: clean
 clean:
 	rm -f *.o *.lo *~
@@ -0,0 +1,117 @@
+/* adler32.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "../../adler32_p.h"
+
+#include <immintrin.h>
+
+#ifdef X86_AVX2_ADLER32
+
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
+    uint32_t sum2;
+
+     /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    uint32_t ALIGNED_(32) s1[8], s2[8];
+
+    memset(s1, 0, sizeof(s1)); s1[7] = adler; // TODO: would a masked load be faster?
+    memset(s2, 0, sizeof(s2)); s2[7] = sum2;
+
+    char ALIGNED_(32) dot1[32] = \
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+    __m256i dot1v = _mm256_load_si256((__m256i*)dot1);
+    char ALIGNED_(32) dot2[32] = \
+        {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+         16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+    __m256i dot2v = _mm256_load_si256((__m256i*)dot2);
+    short ALIGNED_(32) dot3[16] = \
+        {1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1};
+    __m256i dot3v = _mm256_load_si256((__m256i*)dot3);
+
+    // We will need to multiply by
+    char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    __m128i shiftv = _mm_load_si128((__m128i*)shift);
+
+    while (len >= 32) {
+       __m256i vs1 = _mm256_load_si256((__m256i*)s1);
+       __m256i vs2 = _mm256_load_si256((__m256i*)s2);
+       __m256i vs1_0 = vs1;
+
+       int k = (len < NMAX ? (int)len : NMAX);
+       k -= k % 32;
+       len -= k;
+
+       while (k >= 32) {
+           /*
+              vs1 = adler + sum(c[i])
+              vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+           */
+           __m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
+           buf += 32;
+           k -= 32;
+
+           __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
+           __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v);   // sum 8 shorts to 4 int32_t;
+           __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v);
+           vs1 = _mm256_add_epi32(vsum1, vs1);
+           __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
+           vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);
+           vsum2 = _mm256_add_epi32(vsum2, vs2);
+           vs2   = _mm256_add_epi32(vsum2, vs1_0);
+           vs1_0 = vs1;
+       }
+
+       // At this point, we have partial sums stored in vs1 and vs2.  There are AVX512 instructions that
+       // would allow us to sum these quickly (VP4DPWSSD).  For now, just unpack and move on.
+       uint32_t ALIGNED_(32) s1_unpack[8];
+       uint32_t ALIGNED_(32) s2_unpack[8];
+
+       _mm256_store_si256((__m256i*)s1_unpack, vs1);
+       _mm256_store_si256((__m256i*)s2_unpack, vs2);
+
+       adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
+               (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+       adler %= BASE;
+       s1[7] = adler;
+
+       sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) +
+              (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
+       sum2 %= BASE;
+       s2[7] = sum2;
+    }
+
+    while (len) {
+        len--;
+        adler += *buf++;
+        sum2 += adler;
+    }
+    adler %= BASE;
+    sum2 %= BASE;
+
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+
+#endif
@@ -0,0 +1,118 @@
+/* adler32.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "../../adler32_p.h"
+
+#ifdef X86_SSSE3_ADLER32
+
+#include <immintrin.h>
+
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
+    uint32_t sum2;
+
+     /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    uint32_t ALIGNED_(16) s1[4], s2[4];
+
+    s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
+    s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
+
+    char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+    __m128i dot1v = _mm_load_si128((__m128i*)dot1);
+    char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+    __m128i dot2v = _mm_load_si128((__m128i*)dot2);
+    short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
+    __m128i dot3v = _mm_load_si128((__m128i*)dot3);
+
+    // We will need to multiply by
+    //char ALIGNED_(16) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
+
+    char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    __m128i shiftv = _mm_load_si128((__m128i*)shift);
+
+    while (len >= 16) {
+       __m128i vs1 = _mm_load_si128((__m128i*)s1);
+       __m128i vs2 = _mm_load_si128((__m128i*)s2);
+       __m128i vs1_0 = vs1;
+
+       int k = (len < NMAX ? (int)len : NMAX);
+       k -= k % 16;
+       len -= k;
+
+       while (k >= 16) {
+           /*
+              vs1 = adler + sum(c[i])
+              vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+
+              NOTE: 256-bit equivalents are:
+                _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
+                _mm256_madd_epi16    <- Sums 16 shorts to 8 int32_t.
+              We could rewrite the below to use 256-bit instructions instead of 128-bit.
+           */
+           __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
+           buf += 16;
+           k -= 16;
+
+           __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
+           __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v);  // sum 8 shorts to 4 int32_t;
+           __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+           vs1 = _mm_add_epi32(vsum1, vs1);
+           __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+           vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
+           vsum2 = _mm_add_epi32(vsum2, vs2);
+           vs2   = _mm_add_epi32(vsum2, vs1_0);
+           vs1_0 = vs1;
+       }
+
+       // At this point, we have partial sums stored in vs1 and vs2.  There are AVX512 instructions that
+       // would allow us to sum these quickly (VP4DPWSSD).  For now, just unpack and move on.
+
+       uint32_t ALIGNED_(16) s1_unpack[4];
+       uint32_t ALIGNED_(16) s2_unpack[4];
+
+       _mm_store_si128((__m128i*)s1_unpack, vs1);
+       _mm_store_si128((__m128i*)s2_unpack, vs2);
+
+       adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
+       adler %= BASE;
+       s1[3] = adler;
+
+       sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
+       sum2 %= BASE;
+       s2[3] = sum2;
+    }
+
+    while (len) {
+        len--;
+        adler += *buf++;
+        sum2 += adler;
+    }
+    adler %= BASE;
+    sum2 %= BASE;
+
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+
+#endif
@@ -0,0 +1,50 @@
+/* chunkset_avx.c -- AVX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "zutil.h"
+
+#ifdef X86_AVX_CHUNKSET
+#include <immintrin.h>
+
+typedef __m256i chunk_t;
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi8(*(int8_t *)from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi16(*(int16_t *)from);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi32(*(int32_t *)from);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi64x(*(int64_t *)from);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+#define CHUNKSIZE        chunksize_avx
+#define CHUNKCOPY        chunkcopy_avx
+#define CHUNKCOPY_SAFE   chunkcopy_safe_avx
+#define CHUNKUNROLL      chunkunroll_avx
+#define CHUNKMEMSET      chunkmemset_avx
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx
+
+#include "chunkset_tpl.h"
+
+#endif
@@ -0,0 +1,51 @@
+/* chunkset_sse.c -- SSE inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+
+#ifdef X86_SSE2
+#include <immintrin.h>
+
+typedef __m128i chunk_t;
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm_set1_epi8(*(int8_t *)from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm_set1_epi16(*(int16_t *)from);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm_set1_epi32(*(int32_t *)from);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm_set1_epi64x(*(int64_t *)from);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+#define CHUNKSIZE        chunksize_sse2
+#define CHUNKCOPY        chunkcopy_sse2
+#define CHUNKCOPY_SAFE   chunkcopy_safe_sse2
+#define CHUNKUNROLL      chunkunroll_sse2
+#define CHUNKMEMSET      chunkmemset_sse2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
+
+#include "chunkset_tpl.h"
+
+#endif
@@ -0,0 +1,67 @@
+/* compare258_avx.c -- AVX2 version of compare258
+ * Copyright Mika T. Lindqvist  <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+#  include <nmmintrin.h>
+#endif
+
+/* UNALIGNED_OK, AVX2 intrinsic comparison */
+static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
+    uint32_t len = 0;
+
+    do {
+        __m256i ymm_src0, ymm_src1, ymm_cmp;
+        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+        unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+
+        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
+        mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+    } while (len < 256);
+
+    return 256;
+}
+
+static inline uint32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
+    if (*(uint16_t *)src0 != *(uint16_t *)src1)
+        return (*src0 == *src1);
+
+    return compare256_unaligned_avx2_static(src0+2, src1+2) + 2;
+}
+
+Z_INTERNAL uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) {
+    return compare258_unaligned_avx2_static(src0, src1);
+}
+
+#define LONGEST_MATCH   longest_match_unaligned_avx2
+#define COMPARE256      compare256_unaligned_avx2_static
+#define COMPARE258      compare258_unaligned_avx2_static
+
+#include "match_tpl.h"
+
+#endif
@@ -0,0 +1,74 @@
+/* compare258_sse.c -- SSE4.2 version of compare258
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Authors:
+ *  Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *  Jim Guilford    <james.guilford@intel.com>
+ *  Vinodh Gopal    <vinodh.gopal@intel.com>
+ *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *  Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * Portions are Copyright (C) 2016 12Sided Technology, LLC.
+ * Author:
+ *  Phil Vachon     <pvachon@12sidedtech.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#ifdef X86_SSE42_CMP_STR
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+#  include <nmmintrin.h>
+#endif
+
+/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
+static inline uint32_t compare256_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
+    uint32_t len = 0;
+
+    do {
+        #define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
+        __m128i xmm_src0, xmm_src1;
+        uint32_t ret;
+
+        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
+        ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
+        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
+            return len + ret;
+        }
+        src0 += 16, src1 += 16, len += 16;
+
+        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
+        ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
+        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
+            return len + ret;
+        }
+        src0 += 16, src1 += 16, len += 16;
+    } while (len < 256);
+
+    return 256;
+}
+
+static inline uint32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
+    if (*(uint16_t *)src0 != *(uint16_t *)src1)
+        return (*src0 == *src1);
+
+    return compare256_unaligned_sse4_static(src0+2, src1+2) + 2;
+}
+
+Z_INTERNAL uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) {
+    return compare258_unaligned_sse4_static(src0, src1);
+}
+
+#define LONGEST_MATCH   longest_match_unaligned_sse4
+#define COMPARE256      compare256_unaligned_sse4_static
+#define COMPARE258      compare258_unaligned_sse4_static
+
+#include "match_tpl.h"
+
+#endif
@@ -1,5 +1,5 @@
 /*
- * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ 
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
@@ -18,14 +18,14 @@

 #ifdef X86_PCLMULQDQ_CRC

-#include "zbuild.h"
+#include "../../zbuild.h"
 #include <inttypes.h>
 #include <immintrin.h>
 #include <wmmintrin.h>

 #include "crc_folding.h"

-ZLIB_INTERNAL void crc_fold_init(deflate_state *const s) {
+Z_INTERNAL void crc_fold_init(deflate_state *const s) {
    /* CRC_SAVE */
    _mm_storeu_si128((__m128i *)s->crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
    _mm_storeu_si128((__m128i *)s->crc0 + 1, _mm_setzero_si128());
@@ -227,9 +227,10 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
    *xmm_crc3 = _mm_castps_si128(ps_res);
 }

-ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
+Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
    unsigned long algn_diff;
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+    char ALIGNED_(16) partial_buf[16] = { 0 };

    /* CRC_LOAD */
    __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
@@ -241,11 +242,14 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
    if (len < 16) {
        if (len == 0)
            return;
-        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
+
+        memcpy(partial_buf, src, len);
+        xmm_crc_part = _mm_loadu_si128((const __m128i *)partial_buf);
+        memcpy(dst, partial_buf, len);
        goto partial;
    }

-    algn_diff = (0 - (uintptr_t)src) & 0xF;
+    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
    if (algn_diff) {
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
@@ -255,6 +259,8 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
        len -= algn_diff;

        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+    } else {
+        xmm_crc_part = _mm_setzero_si128();
    }

    while ((len -= 64) >= 0) {
@@ -305,7 +311,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
            goto done;

        dst += 48;
-        xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
+        memcpy(&xmm_crc_part, (__m128i *)src + 3, len);
    } else if (len + 32 >= 0) {
        len += 32;

@@ -324,7 +330,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
            goto done;

        dst += 32;
-        xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
+        memcpy(&xmm_crc_part, (__m128i *)src + 2, len);
    } else if (len + 48 >= 0) {
        len += 48;

@@ -340,16 +346,18 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
            goto done;

        dst += 16;
-        xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
+        memcpy(&xmm_crc_part, (__m128i *)src + 1, len);
    } else {
        len += 64;
        if (len == 0)
            goto done;
-        xmm_crc_part = _mm_load_si128((__m128i *)src);
+        memcpy(&xmm_crc_part, src, len);
    }

+    _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
+    memcpy(dst, partial_buf, len);
+
 partial:
-    _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
    partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
 done:
    /* CRC_SAVE */
@@ -377,7 +385,7 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
 };

-uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
+uint32_t Z_INTERNAL crc_fold_512to32(deflate_state *const s) {
    const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);

@@ -447,4 +455,3 @@ uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
 }

 #endif
-
@@ -10,10 +10,10 @@
 #ifndef CRC_FOLDING_H_
 #define CRC_FOLDING_H_

-#include "deflate.h"
+#include "../../deflate.h"

-ZLIB_INTERNAL void crc_fold_init(deflate_state *const);
-ZLIB_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
-ZLIB_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
+Z_INTERNAL void crc_fold_init(deflate_state *const);
+Z_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
+Z_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);

 #endif
@@ -1,25 +0,0 @@
-#ifndef X86_CTZL_H
-#define X86_CTZL_H
-
-#include <intrin.h>
-#ifdef X86_CPUID
-# include "x86.h"
-#endif
-
-#if defined(_MSC_VER) && !defined(__clang__)
-/* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0
- * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
- */
-static __forceinline unsigned long __builtin_ctzl(unsigned long value)
-{
-#ifdef X86_CPUID
-	if (x86_cpu_has_tzcnt)
-		return _tzcnt_u32(value);
-#endif
-	unsigned long trailing_zero;
-	_BitScanForward(&trailing_zero, value);
-	return trailing_zero;
-}
-#endif
-
-#endif
@@ -1,175 +0,0 @@
-/*
- * Fill Window with SSE2-optimized hash shifting
- *
- * Copyright (C) 2013 Intel Corporation
- * Authors:
- *  Arjan van de Ven    <arjan@linux.intel.com>
- *  Jim Kukunas         <james.t.kukunas@linux.intel.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-#ifdef X86_SSE2
-
-#include "zbuild.h"
-#include <immintrin.h>
-#include "deflate.h"
-#include "deflate_p.h"
-#include "functable.h"
-
-extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
-
-ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
-    const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
-
-    register unsigned n;
-    register Pos *p;
-    unsigned more;    /* Amount of free space at the end of the window. */
-    unsigned int wsize = s->w_size;
-
-    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
-
-    do {
-        more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
-
-        /* Deal with !@#$% 64K limit: */
-        if (sizeof(int) <= 2) {
-            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
-                more = wsize;
-
-            } else if (more == (unsigned)(-1)) {
-                /* Very unlikely, but possible on 16 bit machine if
-                 * strstart == 0 && lookahead == 1 (input done a byte at time)
-                 */
-                more--;
-            }
-        }
-
-        /* If the window is almost full and there is insufficient lookahead,
-         * move the upper half to the lower one to make room in the upper half.
-         */
-        if (s->strstart >= wsize+MAX_DIST(s)) {
-            memcpy(s->window, s->window+wsize, (unsigned)wsize);
-            s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
-            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
-            s->block_start -= (long) wsize;
-
-            /* Slide the hash table (could be avoided with 32 bit values
-               at the expense of memory usage). We slide even when level == 0
-               to keep the hash table consistent if we switch back to level > 0
-               later. (Using level 0 permanently is not an optimal usage of
-               zlib, so we don't care about this pathological case.)
-             */
-            n = s->hash_size;
-            p = &s->head[n];
-            p -= 8;
-            do {
-                __m128i value, result;
-
-                value = _mm_loadu_si128((__m128i *)p);
-                result = _mm_subs_epu16(value, xmm_wsize);
-                _mm_storeu_si128((__m128i *)p, result);
-
-                p -= 8;
-                n -= 8;
-            } while (n > 0);
-
-            n = wsize;
-            p = &s->prev[n];
-            p -= 8;
-            do {
-                __m128i value, result;
-
-                value = _mm_loadu_si128((__m128i *)p);
-                result = _mm_subs_epu16(value, xmm_wsize);
-                _mm_storeu_si128((__m128i *)p, result);
-
-                p -= 8;
-                n -= 8;
-            } while (n > 0);
-            more += wsize;
-        }
-        if (s->strm->avail_in == 0) break;
-
-        /* If there was no sliding:
-         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
-         *    more == window_size - lookahead - strstart
-         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
-         * => more >= window_size - 2*WSIZE + 2
-         * In the BIG_MEM or MMAP case (not yet supported),
-         *   window_size == input_size + MIN_LOOKAHEAD  &&
-         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
-         * Otherwise, window_size == 2*WSIZE so more >= 2.
-         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
-         */
-        Assert(more >= 2, "more < 2");
-
-        n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
-        s->lookahead += n;
-
-        /* Initialize the hash value now that we have some input: */
-        if (s->lookahead + s->insert >= MIN_MATCH) {
-            unsigned int str = s->strstart - s->insert;
-            s->ins_h = s->window[str];
-            if (str >= 1)
-                functable.insert_string(s, str + 2 - MIN_MATCH, 1);
-#if MIN_MATCH != 3
-#error Call insert_string() MIN_MATCH-3 more times
-            while (s->insert) {
-                functable.insert_string(s, str, 1);
-                str++;
-                s->insert--;
-                if (s->lookahead + s->insert < MIN_MATCH)
-                    break;
-            }
-#else
-            unsigned int count;
-            if (unlikely(s->lookahead == 1)){
-                count = s->insert - 1;
-            }else{
-                count = s->insert;
-            }
-            functable.insert_string(s, str, count);
-            s->insert -= count;
-#endif
-        }
-        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
-         * but this is not important since only literal bytes will be emitted.
-         */
-    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
-
-    /* If the WIN_INIT bytes after the end of the current data have never been
-     * written, then zero those bytes in order to avoid memory check reports of
-     * the use of uninitialized (or uninitialised as Julian writes) bytes by
-     * the longest match routines.  Update the high water mark for the next
-     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
-     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
-     */
-    if (s->high_water < s->window_size) {
-        unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
-        unsigned long init;
-
-        if (s->high_water < curr) {
-            /* Previous high water mark below current data -- zero WIN_INIT
-             * bytes or up to end of window, whichever is less.
-             */
-            init = s->window_size - curr;
-            if (init > WIN_INIT)
-                init = WIN_INIT;
-            memset(s->window + curr, 0, (unsigned)init);
-            s->high_water = curr + init;
-        } else if (s->high_water < (unsigned long)curr + WIN_INIT) {
-            /* High water mark at or above current data, but below current data
-             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
-             * to end of window, whichever is less.
-             */
-            init = (unsigned long)curr + WIN_INIT - s->high_water;
-            if (init > s->window_size - s->high_water)
-                init = s->window_size - s->high_water;
-            memset(s->window + s->high_water, 0, (unsigned)init);
-            s->high_water += init;
-        }
-    }
-
-    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
-}
-#endif
@@ -5,52 +5,42 @@
 *
 */

-#include "zbuild.h"
-#include "deflate.h"
-
-/* ===========================================================================
- * Insert string str in the dictionary and set match_head to the previous head
- * of the hash chain (the most recent string with same hash key). Return
- * the previous length of the hash chain.
- * IN  assertion: all calls to to INSERT_STRING are made with consecutive
- *    input characters and the first MIN_MATCH bytes of str are valid
- *    (except for the last MIN_MATCH-1 bytes of the input file).
- */
-#ifdef X86_SSE4_2_CRC_HASH
-ZLIB_INTERNAL Pos insert_string_sse(deflate_state *const s, const Pos str, unsigned int count) {
-    Pos ret = 0;
-    unsigned int idx;
-    unsigned int *ip, val, h;
-
-    for (idx = 0; idx < count; idx++) {
-        ip = (unsigned *)&s->window[str+idx];
-        memcpy(&val, ip, sizeof(val));
-        h = 0;
-
-        if (s->level >= TRIGGER_LEVEL)
-            val &= 0xFFFFFF;
-
+#include "../../zbuild.h"
+#include <immintrin.h>
 #ifdef _MSC_VER
-        h = _mm_crc32_u32(h, val);
-#elif defined(X86_SSE4_2_CRC_INTRIN)
-        h = __builtin_ia32_crc32si(h, val);
+#  include <nmmintrin.h>
+#endif
+#include "../../deflate.h"
+
+#ifdef X86_SSE42_CRC_INTRIN
+#  ifdef _MSC_VER
+#    define UPDATE_HASH(s, h, val)\
+        h = _mm_crc32_u32(h, val)
+#  else
+#    define UPDATE_HASH(s, h, val)\
+        h = __builtin_ia32_crc32si(h, val)
+#  endif
 #else
-        __asm__ __volatile__ (
-            "crc32 %1,%0\n\t"
-            : "+r" (h)
-            : "r" (val)
-        );
-#endif
-        Pos head = s->head[h & s->hash_mask];
-        if (head != str+idx) {
-            s->prev[(str+idx) & s->w_mask] = head;
-            s->head[h & s->hash_mask] = str+idx;
-            if (idx == count-1)
-              ret = head;
-        } else if (idx == count - 1) {
-          ret = str + idx;
-        }
+#  ifdef _MSC_VER
+#    define UPDATE_HASH(s, h, val) {\
+        __asm mov edx, h\
+        __asm mov eax, val\
+        __asm crc32 eax, edx\
+        __asm mov val, eax\
    }
-    return ret;
-}
+#  else
+#    define UPDATE_HASH(s, h, val) \
+        __asm__ __volatile__ (\
+            "crc32 %1,%0\n\t"\
+            : "+r" (h)\
+            : "r" (val)\
+        );
+#  endif
+#endif
+
+#define INSERT_STRING       insert_string_sse4
+#define QUICK_INSERT_STRING quick_insert_string_sse4
+
+#ifdef X86_SSE42_CRC_HASH
+#  include "../../insert_string_tpl.h"
 #endif
@@ -0,0 +1,47 @@
+/*
+ * AVX2 optimized hash slide, based on Intel's slide_sse implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven   <arjan@linux.intel.com>
+ *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
+ *   Mika T. Lindqvist  <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+    Pos *p;
+    unsigned n;
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
+
+    n = HASH_SIZE;
+    p = &s->head[n] - 16;
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)p);
+        result= _mm256_subs_epu16(value, ymm_wsize);
+        _mm256_storeu_si256((__m256i *)p, result);
+        p -= 16;
+        n -= 16;
+    } while (n > 0);
+
+    n = wsize;
+    p = &s->prev[n] - 16;
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)p);
+        result= _mm256_subs_epu16(value, ymm_wsize);
+        _mm256_storeu_si256((__m256i *)p, result);
+
+        p -= 16;
+        n -= 16;
+    } while (n > 0);
+}
@@ -0,0 +1,46 @@
+/*
+ * SSE optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven   <arjan@linux.intel.com>
+ *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+    Pos *p;
+    unsigned n;
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+
+    n = HASH_SIZE;
+    p = &s->head[n] - 8;
+    do {
+        __m128i value, result;
+
+        value = _mm_loadu_si128((__m128i *)p);
+        result= _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
+        p -= 8;
+        n -= 8;
+    } while (n > 0);
+
+    n = wsize;
+    p = &s->prev[n] - 8;
+    do {
+        __m128i value, result;
+
+        value = _mm_loadu_si128((__m128i *)p);
+        result= _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
+
+        p -= 8;
+        n -= 8;
+    } while (n > 0);
+}
@@ -8,61 +8,73 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#include "zutil.h"
+#include "../../zutil.h"

 #ifdef _MSC_VER
-#include <intrin.h>
+#  include <intrin.h>
 #else
 // Newer versions of GCC and clang come with cpuid.h
-#include <cpuid.h>
+#  include <cpuid.h>
 #endif

-ZLIB_INTERNAL int x86_cpu_has_sse2;
-ZLIB_INTERNAL int x86_cpu_has_sse42;
-ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
-ZLIB_INTERNAL int x86_cpu_has_tzcnt;
+Z_INTERNAL int x86_cpu_has_avx2;
+Z_INTERNAL int x86_cpu_has_sse2;
+Z_INTERNAL int x86_cpu_has_ssse3;
+Z_INTERNAL int x86_cpu_has_sse42;
+Z_INTERNAL int x86_cpu_has_pclmulqdq;
+Z_INTERNAL int x86_cpu_has_tzcnt;

 static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
 #ifdef _MSC_VER
-	unsigned int registers[4];
-	__cpuid(registers, info);
+    unsigned int registers[4];
+    __cpuid((int *)registers, info);

-	*eax = registers[0];
-	*ebx = registers[1];
-	*ecx = registers[2];
-	*edx = registers[3];
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
 #else
-	unsigned int _eax;
-	unsigned int _ebx;
-	unsigned int _ecx;
-	unsigned int _edx;
-	__cpuid(info, _eax, _ebx, _ecx, _edx);
-	*eax = _eax;
-	*ebx = _ebx;
-	*ecx = _ecx;
-	*edx = _edx;
+    __cpuid(info, *eax, *ebx, *ecx, *edx);
 #endif
 }

-void ZLIB_INTERNAL x86_check_features(void) {
-	unsigned eax, ebx, ecx, edx;
-	unsigned maxbasic;
+static void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#ifdef _MSC_VER
+    unsigned int registers[4];
+    __cpuidex((int *)registers, info, subinfo);

-	cpuid(0, &maxbasic, &ebx, &ecx, &edx);
-
-	cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
-
-	x86_cpu_has_sse2 = edx & 0x4000000;
-	x86_cpu_has_sse42 = ecx & 0x100000;
-	x86_cpu_has_pclmulqdq = ecx & 0x2;
-
-	if (maxbasic >= 7) {
-	  cpuid(7, &eax, &ebx, &ecx, &edx);
-
-	  // check BMI1 bit
-	  // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
-	  x86_cpu_has_tzcnt = ebx & 0x8;
-	} else {
-	  x86_cpu_has_tzcnt = 0;
-	}
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#else
+    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
+#endif
+}
+
+void Z_INTERNAL x86_check_features(void) {
+    unsigned eax, ebx, ecx, edx;
+    unsigned maxbasic;
+
+    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
+
+    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
+
+    x86_cpu_has_sse2 = edx & 0x4000000;
+    x86_cpu_has_ssse3 = ecx & 0x200;
+    x86_cpu_has_sse42 = ecx & 0x100000;
+    x86_cpu_has_pclmulqdq = ecx & 0x2;
+
+    if (maxbasic >= 7) {
+        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
+
+        // check BMI1 bit
+        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
+        x86_cpu_has_tzcnt = ebx & 0x8;
+        // check AVX2 bit
+        x86_cpu_has_avx2 = ebx & 0x20;
+    } else {
+        x86_cpu_has_tzcnt = 0;
+        x86_cpu_has_avx2 = 0;
+    }
 }
@@ -1,16 +1,18 @@
- /* cpu.h -- check for CPU features
- * Copyright (C) 2013 Intel Corporation Jim Kukunas
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
+/* cpu.h -- check for CPU features
+* Copyright (C) 2013 Intel Corporation Jim Kukunas
+* For conditions of distribution and use, see copyright notice in zlib.h
+*/

 #ifndef CPU_H_
 #define CPU_H_

+extern int x86_cpu_has_avx2;
 extern int x86_cpu_has_sse2;
+extern int x86_cpu_has_ssse3;
 extern int x86_cpu_has_sse42;
 extern int x86_cpu_has_pclmulqdq;
 extern int x86_cpu_has_tzcnt;

-void ZLIB_INTERNAL x86_check_features(void);
+void Z_INTERNAL x86_check_features(void);

 #endif /* CPU_H_ */