[Library] Update zlibng (#1255)

* Update zlibng * Set cmake path more directly in zlibng to hopefully fix an issue with the build on drone * I'm dumb, missing / in path * Mackal helps with a dumb gitignore issue * Adding all the files, not sure what's ignoring them and im tired of looking * Some tweaks to zlibng build to hopefully get it to build properly. works on msvc now
2026-05-31 00:46:46 +00:00 · 2021-02-23 17:00:26 -08:00
parent e6dee96266
commit 2957f5084d
184 changed files with 22029 additions and 11703 deletions
@@ -6,19 +6,27 @@ CC=
 CFLAGS=
 SFLAGS=
 INCLUDES=
+ACLEFLAG=
+NEONFLAG=
 SUFFIX=

 SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)

-all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
+all: \
+	adler32_neon.o adler32_neon.lo \
+	armfeature.o armfeature.lo \
+	chunkset_neon.o chunkset_neon.lo \
+	crc32_acle.o crc32_acle.lo \
+	slide_neon.o slide_neon.lo \
+	insert_string_acle.o insert_string_acle.lo

 adler32_neon.o:
-	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+	$(CC) $(CFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c

 adler32_neon.lo:
-	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+	$(CC) $(SFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c

 armfeature.o:
 	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/armfeature.c
@@ -26,23 +34,29 @@ armfeature.o:
 armfeature.lo:
 	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/armfeature.c

+chunkset_neon.o:
+	$(CC) $(CFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+chunkset_neon.lo:
+	$(CC) $(SFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
 crc32_acle.o:
-	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
+	$(CC) $(CFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c

 crc32_acle.lo:
-	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
+	$(CC) $(SFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c

-fill_window_arm.o:
-	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
+slide_neon.o:
+	$(CC) $(CFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c

-fill_window_arm.lo:
-	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
+slide_neon.lo:
+	$(CC) $(SFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c

 insert_string_acle.o:
-	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
+	$(CC) $(CFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c

 insert_string_acle.lo:
-	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
+	$(CC) $(SFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c

 mostlyclean: clean
 clean:
@@ -2,24 +2,16 @@
 * Copyright (C) 2017 ARM Holdings Inc.
 * Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
 *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- * 1. The origin of this software must not be misrepresented; you must not
- *  claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
+ * For conditions of distribution and use, see copyright notice in zlib.h
 */
-#include "adler32_neon.h"
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#include <arm_neon.h>
-#include "adler32_p.h"
+#ifdef ARM_NEON_ADLER32
+#ifdef _M_ARM64
+#  include <arm64_neon.h>
+#else
+#  include <arm_neon.h>
+#endif
+#include "../../zutil.h"
+#include "../../adler32_p.h"

 static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
    static const uint8_t taps[32] = {
@@ -109,7 +101,7 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {

    for (i = 0; i < len; i += n) {
        if ((i + n) > len)
-            n = len - i;
+            n = (int)(len - i);

        if (n < 16)
            break;
@@ -1,29 +0,0 @@
-/* Copyright (C) 1995-2011, 2016 Mark Adler
- * Copyright (C) 2017 ARM Holdings Inc.
- * Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- * 1. The origin of this software must not be misrepresented; you must not
- *  claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-#ifndef __ADLER32_NEON__
-#define __ADLER32_NEON__
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-// Depending on the compiler flavor, size_t may be defined in one or the other header. See:
-// http://stackoverflow.com/questions/26410466/gcc-linaro-compiler-throws-error-unknown-type-name-size-t
-#include <stdint.h>
-#include <stddef.h>
-uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#endif
@@ -8,6 +8,6 @@
 extern int arm_cpu_has_neon;
 extern int arm_cpu_has_crc32;

-void ZLIB_INTERNAL arm_check_features(void);
+void Z_INTERNAL arm_check_features(void);

 #endif /* ARM_H_ */
@@ -1,50 +1,69 @@
-#include "zutil.h"
+#include "../../zutil.h"

 #if defined(__linux__)
-# include <sys/auxv.h>
-# include <asm/hwcap.h>
+#  include <sys/auxv.h>
+#  include <asm/hwcap.h>
+#elif defined(__FreeBSD__) && defined(__aarch64__)
+#  include <machine/armreg.h>
+#  ifndef ID_AA64ISAR0_CRC32_VAL
+#    define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
+#  endif
+#elif defined(__APPLE__)
+#  include <sys/sysctl.h>
 #elif defined(_WIN32)
-# include <winapifamily.h>
+#  include <winapifamily.h>
 #endif

 static int arm_has_crc32() {
 #if defined(__linux__) && defined(HWCAP2_CRC32)
-  return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
+    return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
+#elif defined(__FreeBSD__) && defined(__aarch64__)
+    return getenv("QEMU_EMULATING") == NULL
+      && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
+#elif defined(__APPLE__)
+    int hascrc32;
+    size_t size = sizeof(hascrc32);
+    return sysctlbyname("hw.optional.armv8_crc32", &hascrc32, &size, NULL, 0) == 0
+      && hascrc32 == 1;
 #elif defined(ARM_NOCHECK_ACLE)
-  return 1;
+    return 1;
 #else
-  return 0;
+    return 0;
 #endif
 }

 /* AArch64 has neon. */
-#if !defined(__aarch64__)
-static inline int arm_has_neon()
-{
- #if defined(__linux__) && defined(HWCAP_NEON)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+static inline int arm_has_neon() {
+#if defined(__linux__) && defined(HWCAP_NEON)
    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0;
- #elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
-  #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+#elif defined(__APPLE__)
+    int hasneon;
+    size_t size = sizeof(hasneon);
+    return sysctlbyname("hw.optional.neon", &hasneon, &size, NULL, 0) == 0
+      && hasneon == 1;
+#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
+#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
    return 1; /* Always supported */
-  #endif
- #endif
+#  endif
+#endif

- #if defined(ARM_NOCHECK_NEON)
+#if defined(ARM_NOCHECK_NEON)
    return 1;
- #else
-    return 0;
- #endif
-}
-#endif
-
-ZLIB_INTERNAL int arm_cpu_has_neon;
-ZLIB_INTERNAL int arm_cpu_has_crc32;
-
-void ZLIB_INTERNAL arm_check_features(void) {
-#if defined(__aarch64__)
-  arm_cpu_has_neon = 1; /* always available */
 #else
-  arm_cpu_has_neon = arm_has_neon();
+    return 0;
 #endif
-  arm_cpu_has_crc32 = arm_has_crc32();
+}
+#endif
+
+Z_INTERNAL int arm_cpu_has_neon;
+Z_INTERNAL int arm_cpu_has_crc32;
+
+void Z_INTERNAL arm_check_features(void) {
+#if defined(__aarch64__) || defined(_M_ARM64)
+    arm_cpu_has_neon = 1; /* always available */
+#else
+    arm_cpu_has_neon = arm_has_neon();
+#endif
+    arm_cpu_has_crc32 = arm_has_crc32();
 }
@@ -0,0 +1,54 @@
+/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON_CHUNKSET
+#include <string.h>
+#ifdef _M_ARM64
+#  include <arm64_neon.h>
+#else
+#  include <arm_neon.h>
+#endif
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+typedef uint8x16_t chunk_t;
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+/* Fill every byte lane of *chunk with the single byte found at `from`. */
+static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
+    const uint8_t fill = *from;
+    *chunk = vdupq_n_u8(fill);
+}
+
+/* Fill *chunk with repeated copies of the 2-byte pattern found at `from`.
+ * `from` is a byte pointer with no alignment guarantee, so the pattern is
+ * fetched with memcpy rather than by dereferencing a casted pointer, which
+ * would be undefined behaviour for a misaligned address and breaks the
+ * strict-aliasing rules. */
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    int16_t pattern;
+    memcpy(&pattern, from, sizeof(pattern));
+    *chunk = vreinterpretq_u8_s16(vdupq_n_s16(pattern));
+}
+
+/* Fill *chunk with repeated copies of the 4-byte pattern found at `from`.
+ * `from` is a byte pointer with no alignment guarantee, so the pattern is
+ * fetched with memcpy rather than by dereferencing a casted pointer, which
+ * would be undefined behaviour for a misaligned address and breaks the
+ * strict-aliasing rules. */
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    int32_t pattern;
+    memcpy(&pattern, from, sizeof(pattern));
+    *chunk = vreinterpretq_u8_s32(vdupq_n_s32(pattern));
+}
+
+/* Fill *chunk with two copies of the 8-byte pattern found at `from`. */
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    const uint8x8_t half = vld1_u8(from);
+    *chunk = vcombine_u8(half, half);
+}
+
+#define CHUNKSIZE        chunksize_neon
+#define CHUNKCOPY        chunkcopy_neon
+#define CHUNKCOPY_SAFE   chunkcopy_safe_neon
+#define CHUNKUNROLL      chunkunroll_neon
+#define CHUNKMEMSET      chunkmemset_neon
+#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
+
+/* Read one chunk_t (a uint8x16_t) worth of bytes starting at `s` into *chunk. */
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = vld1q_u8(s);
+}
+
+/* Write the contents of *chunk (a uint8x16_t) to memory starting at `out`. */
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    vst1q_u8(out, *chunk);
+}
+
+#include "chunkset_tpl.h"
+
+#endif
@@ -5,21 +5,16 @@
 *
 */

-#ifdef __ARM_FEATURE_CRC32
-# include <arm_acle.h>
-# ifdef ZLIB_COMPAT
-#  include <zconf.h>
-# else
-#  include <zconf-ng.h>
-# endif
-# ifdef __linux__
-#  include <stddef.h>
-# endif
+#ifdef ARM_ACLE_CRC_HASH
+#ifndef _MSC_VER
+#  include <arm_acle.h>
+#endif
+#include "../../zutil.h"

 uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
-    register uint32_t c;
-    register const uint16_t *buf2;
-    register const uint32_t *buf4;
+    Z_REGISTER uint32_t c;
+    Z_REGISTER const uint16_t *buf2;
+    Z_REGISTER const uint32_t *buf4;

    c = ~crc;
    if (len && ((ptrdiff_t)buf & 1)) {
@@ -36,7 +31,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
        buf4 = (const uint32_t *) buf;
    }

-# if defined(__aarch64__)
+#if defined(__aarch64__)
    if ((len > sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
        c = __crc32w(c, *buf4++);
        len -= sizeof(uint32_t);
@@ -44,7 +39,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {

    const uint64_t *buf8 = (const uint64_t *) buf4;

-#  ifdef UNROLL_MORE
+#ifdef UNROLL_MORE
    while (len >= 4 * sizeof(uint64_t)) {
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
@@ -52,7 +47,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
        c = __crc32d(c, *buf8++);
        len -= 4 * sizeof(uint64_t);
    }
-#  endif
+#endif

    while (len >= sizeof(uint64_t)) {
        c = __crc32d(c, *buf8++);
@@ -74,7 +69,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
    }

    buf = (const unsigned char *) buf2;
-# else /* __aarch64__ */
+#else /* __aarch64__ */

 #  ifdef UNROLL_MORE
    while (len >= 8 * sizeof(uint32_t)) {
@@ -103,7 +98,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
    } else {
        buf = (const unsigned char *) buf4;
    }
-# endif /* __aarch64__ */
+#endif /* __aarch64__ */

    if (len) {
        c = __crc32b(c, *buf);
@@ -112,4 +107,4 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
    c = ~c;
    return c;
 }
-#endif /* __ARM_FEATURE_CRC32 */
+#endif
@@ -5,7 +5,7 @@

 #if defined(_MSC_VER) && !defined(__clang__)
 static __forceinline unsigned long __builtin_ctzl(unsigned long value) {
-	return _arm_clz(_arm_rbit(value));
+    return _arm_clz(_arm_rbit(value));
 }
 #endif

@@ -1,169 +0,0 @@
-/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions
- * Copyright (C) 2017 Mika T. Lindqvist
- *
- * Authors:
- * Mika T. Lindqvist <postmaster@raasu.org>
- * Jun He <jun.he@arm.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-/* @(#) $Id$ */
-
-#include "zbuild.h"
-#include "deflate.h"
-#include "deflate_p.h"
-#include "functable.h"
-
-extern ZLIB_INTERNAL int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#include <arm_neon.h>
-
-/* SIMD version of hash_chain rebase */
-static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
-    register uint16x8_t v, *p;
-    register size_t n;
-
-    size_t size = entries*sizeof(table[0]);
-    Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
-
-    Assert(sizeof(Pos) == 2, "Wrong Pos size");
-    v = vdupq_n_u16(window_size);
-
-    p = (uint16x8_t *)table;
-    n = size / (sizeof(uint16x8_t) * 8);
-    do {
-        p[0] = vqsubq_u16(p[0], v);
-        p[1] = vqsubq_u16(p[1], v);
-        p[2] = vqsubq_u16(p[2], v);
-        p[3] = vqsubq_u16(p[3], v);
-        p[4] = vqsubq_u16(p[4], v);
-        p[5] = vqsubq_u16(p[5], v);
-        p[6] = vqsubq_u16(p[6], v);
-        p[7] = vqsubq_u16(p[7], v);
-        p += 8;
-    } while (--n);
-}
-#else
-/* generic version for hash rebase */
-static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
-    unsigned int i;
-    for (i = 0; i < entries; i++) {
-        table[i] = (table[i] >= window_size) ? (table[i] - window_size) : NIL;
-    }
-}
-#endif
-
-void fill_window_arm(deflate_state *s) {
-    register unsigned n;
-    unsigned long more;  /* Amount of free space at the end of the window. */
-    unsigned int wsize = s->w_size;
-
-    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
-
-    do {
-        more = s->window_size - s->lookahead - s->strstart;
-
-        /* If the window is almost full and there is insufficient lookahead,
-         * move the upper half to the lower one to make room in the upper half.
-         */
-        if (s->strstart >= wsize+MAX_DIST(s)) {
-            memcpy(s->window, s->window+wsize, wsize);
-            s->match_start -= wsize;
-            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
-            s->block_start -= wsize;
-
-            /* Slide the hash table (could be avoided with 32 bit values
-               at the expense of memory usage). We slide even when level == 0
-               to keep the hash table consistent if we switch back to level > 0
-               later. (Using level 0 permanently is not an optimal usage of
-               zlib, so we don't care about this pathological case.)
-             */
-
-            slide_hash_chain(s->head, s->hash_size, wsize);
-            slide_hash_chain(s->prev, wsize, wsize);
-            more += wsize;
-        }
-        if (s->strm->avail_in == 0)
-            break;
-
-        /* If there was no sliding:
-         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
-         *    more == window_size - lookahead - strstart
-         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
-         * => more >= window_size - 2*WSIZE + 2
-         * In the BIG_MEM or MMAP case (not yet supported),
-         *   window_size == input_size + MIN_LOOKAHEAD  &&
-         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
-         * Otherwise, window_size == 2*WSIZE so more >= 2.
-         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
-         */
-        Assert(more >= 2, "more < 2");
-
-        n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
-        s->lookahead += n;
-
-        /* Initialize the hash value now that we have some input: */
-        if (s->lookahead + s->insert >= MIN_MATCH) {
-            unsigned int str = s->strstart - s->insert;
-            unsigned int insert_cnt = s->insert;
-            unsigned int slen;
-
-            s->ins_h = s->window[str];
-
-            if (unlikely(s->lookahead < MIN_MATCH))
-                insert_cnt += s->lookahead - MIN_MATCH;
-            slen = insert_cnt;
-            if (str >= (MIN_MATCH - 2))
-            {
-                str += 2 - MIN_MATCH;
-                insert_cnt += MIN_MATCH - 2;
-            }
-            if (insert_cnt > 0)
-            {
-                functable.insert_string(s, str, insert_cnt);
-                s->insert -= slen;
-            }
-        }
-        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
-         * but this is not important since only literal bytes will be emitted.
-         */
-    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
-
-    /* If the WIN_INIT bytes after the end of the current data have never been
-     * written, then zero those bytes in order to avoid memory check reports of
-     * the use of uninitialized (or uninitialised as Julian writes) bytes by
-     * the longest match routines.  Update the high water mark for the next
-     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
-     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
-     */
-    if (s->high_water < s->window_size) {
-        unsigned long curr = s->strstart + (unsigned long)s->lookahead;
-        unsigned long init;
-
-        if (s->high_water < curr) {
-            /* Previous high water mark below current data -- zero WIN_INIT
-             * bytes or up to end of window, whichever is less.
-             */
-            init = s->window_size - curr;
-            if (init > WIN_INIT)
-                init = WIN_INIT;
-            memset(s->window + curr, 0, init);
-            s->high_water = curr + init;
-        } else if (s->high_water < curr + WIN_INIT) {
-            /* High water mark at or above current data, but below current data
-             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
-             * to end of window, whichever is less.
-             */
-            init = curr + WIN_INIT;
-            if (init > s->window_size)
-                init = s->window_size;
-            init -= s->high_water;
-            memset(s->window + s->high_water, 0, init);
-            s->high_water += init;
-        }
-    }
-
-    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
-}
@@ -5,49 +5,18 @@
 *
 */

-#if defined(__ARM_FEATURE_CRC32) && defined(ARM_ACLE_CRC_HASH)
-#include <arm_acle.h>
-#include "zbuild.h"
-#include "deflate.h"
-
-/* ===========================================================================
- * Insert string str in the dictionary and set match_head to the previous head
- * of the hash chain (the most recent string with same hash key). Return
- * the previous length of the hash chain.
- * IN  assertion: all calls to to INSERT_STRING are made with consecutive
- *    input characters and the first MIN_MATCH bytes of str are valid
- *    (except for the last MIN_MATCH-1 bytes of the input file).
- */
-Pos insert_string_acle(deflate_state *const s, const Pos str, unsigned int count) {
-    Pos p, lp, ret;
-
-    if (unlikely(count == 0)) {
-        return s->prev[str & s->w_mask];
-    }
-
-    ret = 0;
-    lp = str + count - 1; /* last position */
-
-    for (p = str; p <= lp; p++) {
-        uint32_t val, h, hm;
-        memcpy(&val, &s->window[p], sizeof(val));
-
-        if (s->level >= TRIGGER_LEVEL)
-            val &= 0xFFFFFF;
-
-        h = __crc32w(0, val);
-        hm = h & s->hash_mask;
-
-        Pos head = s->head[hm];
-        if (head != p) {
-            s->prev[p & s->w_mask] = head;
-            s->head[hm] = p;
-            if (p == lp)
-              ret = head;
-        } else if (p == lp) {
-          ret = p;
-        }
-    }
-    return ret;
-}
+#ifdef ARM_ACLE_CRC_HASH
+#ifndef _MSC_VER
+#  include <arm_acle.h>
+#endif
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#define UPDATE_HASH(s, h, val) \
+    h = __crc32w(0, val)
+
+#define INSERT_STRING       insert_string_acle
+#define QUICK_INSERT_STRING quick_insert_string_acle
+
+#include "../../insert_string_tpl.h"
 #endif
@@ -0,0 +1,52 @@
+/* slide_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
+ * Copyright (C) 2017-2020 Mika T. Lindqvist
+ *
+ * Authors:
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ * Jun He <jun.he@arm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(ARM_NEON_SLIDEHASH)
+#ifdef _M_ARM64
+#  include <arm64_neon.h>
+#else
+#  include <arm_neon.h>
+#endif
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+/* SIMD version of hash_chain rebase.
+ *
+ * Subtracts window_size from each of the `entries` positions in `table`
+ * using unsigned saturating subtraction (vqsubq_u16), so positions smaller
+ * than window_size become 0.
+ *
+ * The loop is unrolled to process eight uint16x8_t vectors per iteration and
+ * is a do/while, so the table size in bytes must be a non-zero multiple of
+ * sizeof(uint16x8_t) * 8.
+ */
+static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
+    Z_REGISTER uint16x8_t v, *p;
+    Z_REGISTER size_t n;
+
+    size_t size = entries*sizeof(table[0]);
+    /* The divisor must be parenthesised: '%' and '*' have equal precedence
+     * and associate left-to-right, so without the parentheses only
+     * size % sizeof(uint16x8_t) was being checked. */
+    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+    v = vdupq_n_u16(window_size);
+
+    p = (uint16x8_t *)table;
+    n = size / (sizeof(uint16x8_t) * 8);
+    do {
+        p[0] = vqsubq_u16(p[0], v);
+        p[1] = vqsubq_u16(p[1], v);
+        p[2] = vqsubq_u16(p[2], v);
+        p[3] = vqsubq_u16(p[3], v);
+        p[4] = vqsubq_u16(p[4], v);
+        p[5] = vqsubq_u16(p[5], v);
+        p[6] = vqsubq_u16(p[6], v);
+        p[7] = vqsubq_u16(p[7], v);
+        p += 8;
+    } while (--n);
+}
+
+/* Rebase both hash tables of `s` by the window size: first head (HASH_SIZE
+ * entries), then prev (w_size entries). */
+Z_INTERNAL void slide_hash_neon(deflate_state *s) {
+    const unsigned int window = s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, window);
+    slide_hash_chain(s->prev, window, window);
+}
+#endif
@@ -0,0 +1,49 @@
+# Makefile for POWER-specific files
+# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+# Toolchain settings are intentionally left empty in this file.
+# NOTE(review): presumably filled in by the configure step - confirm.
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+# Extra compiler flags applied only to the *_power8 sources.
+P8FLAGS=-mcpu=power8
+
+# These targets are commands, not files; declaring them phony keeps a stray
+# file named e.g. "clean" from silently disabling them.
+.PHONY: all mostlyclean clean distclean
+
+all: power.o \
+     power.lo \
+     adler32_power8.o \
+     adler32_power8.lo \
+     slide_hash_power8.o \
+     slide_hash_power8.lo
+
+# Every source is compiled twice: into a .o with CFLAGS and into a .lo with
+# SFLAGS. The source file is listed as a prerequisite so that editing it
+# causes the object to be rebuilt instead of being treated as up to date.
+power.o: $(SRCDIR)/power.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
+
+power.lo: $(SRCDIR)/power.c
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
+
+adler32_power8.o: $(SRCDIR)/adler32_power8.c
+	$(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_power8.lo: $(SRCDIR)/adler32_power8.c
+	$(CC) $(SFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+slide_hash_power8.o: $(SRCDIR)/slide_hash_power8.c
+	$(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_power8.lo: $(SRCDIR)/slide_hash_power8.c
+	$(CC) $(SFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean:
+	rm -f Makefile
@@ -0,0 +1,154 @@
+/* Adler32 for POWER8 using VSX instructions.
+ * Copyright (C) 2020 IBM Corporation
+ * Author: Rogerio Alves <rcardoso@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
+ * instructions.
+ *
+ * If adler32 processes 1 byte at a time, then s1_0 (_n means iteration n)
+ * is the initial value of s1 - at start s1_0 is 1 unless the initial adler
+ * value is different from 1. So s1_1 = s1_0 + c[1] after the first
+ * calculation, s1_2 = s1_1 + c[2] for the next iteration, and so on.
+ * Hence, s1_N = s1_(N-1) + c[N] is the value of s1 after
+ * iteration N.
+ *
+ * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_0 + N*c[1] +
+ * (N-1)*c[2] + ... + c[N]
+ *
+ * In a more general way:
+ *
+ * s1_N = s1_0 + sum(i=1 to N)c[i]
+ * s2_N = s2_0 + N*s1_0 + sum(i=1 to N)(N-i+1)*c[i]
+ *
+ * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
+ * can process N bytes at a time we can do this at once.
+ *
+ * Since VSX vector registers are 128 bits wide, we can process
+ * 16 bytes at a time. Using N = 16 we have:
+ *
+ * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
+ * s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i]
+ *
+ * After the first iteration we calculate the adler32 checksum for 16 bytes.
+ *
+ * For more background about adler32 please check the RFC:
+ * https://www.ietf.org/rfc/rfc1950.txt
+ */
+
+#ifdef POWER8_VSX_ADLER32
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "zutil.h"
+#include "adler32_p.h"
+
+/* Horizontal sum of the four unsigned int lanes of `a`.
+ *
+ * Rotating by 8 bytes and adding, then rotating by 4 bytes and adding, leaves
+ * the total of all four lanes in every lane of the result. The additions use
+ * vec_add, i.e. they wrap modulo 2^32 rather than saturate.
+ *
+ * The second parameter is only used as scratch space; its incoming value is
+ * overwritten before it is read.
+ *
+ * Declared static: a plain `inline` definition provides no external
+ * definition in C99, which leads to an unresolved symbol whenever the
+ * compiler chooses not to inline the call (for example at -O0).
+ */
+static inline vector unsigned int vec_sumsu(vector unsigned int a, vector unsigned int b) {
+    b = vec_sld(a, a, 8);
+    b = vec_add(b, a);
+    a = vec_sld(b, b, 4);
+    a = vec_add(a, b);
+
+    return a;
+}
+
+uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len) {
+    uint32_t s1 = adler & 0xffff;
+    uint32_t s2 = (adler >> 16) & 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(s1, buf, s2);
+
+    /* If no buffer is supplied (buf == NULL), return the initial adler value (1).  */
+    if (UNLIKELY(buf == NULL))
+        return 1;
+
+    /* This is faster than VSX code for len < 64.  */
+    if (len < 64)
+        return adler32_len_64(s1, buf, len, s2);
+
+    /* Use POWER VSX instructions for len >= 64. */
+    const vector unsigned int v_zeros = { 0 };
+    const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
+         6, 5, 4, 3, 2, 1};
+    const vector unsigned char vsh = vec_splat_u8(4);
+    const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
+    vector unsigned int vs1 = { 0 };
+    vector unsigned int vs2 = { 0 };
+    vector unsigned int vs1_save = { 0 };
+    vector unsigned int vsum1, vsum2;
+    vector unsigned char vbuf;
+    int n;
+
+    vs1[0] = s1;
+    vs2[0] = s2;
+
+    /* Do length bigger than NMAX in blocks of NMAX size.  */
+    while (len >= NMAX) {
+        len -= NMAX;
+        n = NMAX / 16;
+        do {
+            vbuf = vec_xl(0, (unsigned char *) buf);
+            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i].  */
+            /* sum(i=1 to 16) buf[i]*(16-i+1).  */
+            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+            /* Save vs1.  */
+            vs1_save = vec_add(vs1_save, vs1);
+            /* Accumulate the sums.  */
+            vs1 = vec_add(vsum1, vs1);
+            vs2 = vec_add(vsum2, vs2);
+
+            buf += 16;
+        } while (--n);
+        /* Once each block of NMAX size.  */
+        vs1 = vec_sumsu(vs1, vsum1);
+        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save.  */
+        vs2 = vec_add(vs1_save, vs2);
+        vs2 = vec_sumsu(vs2, vsum2);
+
+        /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521.  */
+        vs1[0] = vs1[0] % BASE;
+        /* vs2[0] = s2_i + 16*s1_save +
+           sum(i=1 to 16)(16-i+1)*buf[i] mod 65521.  */
+        vs2[0] = vs2[0] % BASE;
+
+        vs1 = vec_and(vs1, vmask);
+        vs2 = vec_and(vs2, vmask);
+        vs1_save = v_zeros;
+    }
+
+    /* len is less than NMAX one modulo is needed.  */
+    if (len >= 16) {
+        while (len >= 16) {
+            len -= 16;
+
+            vbuf = vec_xl(0, (unsigned char *) buf);
+
+            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i].  */
+            /* sum(i=1 to 16) buf[i]*(16-i+1).  */
+            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+            /* Save vs1.  */
+            vs1_save = vec_add(vs1_save, vs1);
+            /* Accumulate the sums.  */
+            vs1 = vec_add(vsum1, vs1);
+            vs2 = vec_add(vsum2, vs2);
+
+            buf += 16;
+        }
+        /* Since the size will be always less than NMAX we do this once.  */
+        vs1 = vec_sumsu(vs1, vsum1);
+        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save.  */
+        vs2 = vec_add(vs1_save, vs2);
+        vs2 = vec_sumsu(vs2, vsum2);
+    }
+    /* Copy result back to s1, s2 (mod 65521).  */
+    s1 = vs1[0] % BASE;
+    s2 = vs2[0] % BASE;
+
+    /* Process tail (len < 16) and return.  */
+    return adler32_len_16(s1, buf, len, s2);
+}
+
+#endif /* POWER8_VSX_ADLER32 */
@@ -0,0 +1,19 @@
+/* POWER feature check
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <sys/auxv.h>
+#include "../../zutil.h"
+
+Z_INTERNAL int power_cpu_has_arch_2_07;
+
+/* Detect POWER CPU features at run time and record them in the
+ * power_cpu_has_* flags. Flags are only ever set here, never cleared.
+ * When POWER8 is not defined at build time this function does nothing. */
+void Z_INTERNAL power_check_features(void) {
+#ifdef POWER8
+    /* The auxiliary vector is only queried when a feature bit is actually
+     * tested, so builds without POWER8 carry no set-but-unused variable. */
+    unsigned long hwcap2 = getauxval(AT_HWCAP2);
+
+    if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+        power_cpu_has_arch_2_07 = 1;
+#endif
+}
@@ -0,0 +1,13 @@
+/* power.h -- check for POWER CPU features
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_H_
+#define POWER_H_
+
+extern int power_cpu_has_arch_2_07;
+
+void Z_INTERNAL power_check_features(void);
+
+#endif /* POWER_H_ */
@@ -0,0 +1,60 @@
+/* Optimized slide_hash for POWER processors
+ * Copyright (C) 2019-2020 IBM Corporation
+ * Author: Matheus Castanho <msc@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX_SLIDEHASH
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+/* Slide the n_elems hash-table entries that end at table_end by s->w_size.
+ * Entries are processed eight at a time, walking backwards from table_end,
+ * with an unsigned saturating subtraction. */
+static inline void slide_hash_power8_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
+    vector unsigned short vw, vm, *vp;
+    unsigned chunks;
+
+    /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
+     * so instead of processing each of the n_elems in the hash table
+     * individually, we can do it in chunks of 8 with vector instructions.
+     *
+     * This function is only called from slide_hash_power8(), and both calls
+     * pass n_elems as a power of 2 higher than 2^7, as defined by
+     * deflateInit2_(), so n_elems will always be a multiple of 8. */
+    chunks = n_elems >> 3;
+    Assert(n_elems % 8 == 0, "Weird hash table size!");
+
+    /* This type casting is safe since s->w_size is always <= 64KB
+     * as defined by deflateInit2_() and Posf == unsigned short.
+     * vec_splats() broadcasts the scalar into every lane, so no lane of vw
+     * is read before it has been initialised. */
+    vw = vec_splats((unsigned short) s->w_size);
+
+    vp = (vector unsigned short *) table_end;
+
+    do {
+        /* Processing 8 elements at a time */
+        vp--;
+        vm = *vp;
+
+        /* This is equivalent to: m >= w_size ? m - w_size : 0
+         * Since we are using a saturated unsigned subtraction, any
+         * values that are < w_size will be set to 0, while the others
+         * will be subtracted by w_size. */
+        *vp = vec_subs(vm,vw);
+    } while (--chunks);
+}
+
+/* Slide both hash tables of `s`: head (HASH_SIZE entries) first, then prev
+ * (s->w_size entries). Each call is handed the one-past-the-end pointer of
+ * the table it should process. */
+void Z_INTERNAL slide_hash_power8(deflate_state *s) {
+    const unsigned int head_entries = HASH_SIZE;
+    slide_hash_power8_loop(s, head_entries, &s->head[head_entries]);
+
+    const unsigned int prev_entries = s->w_size;
+    slide_hash_power8_loop(s, prev_entries, &s->prev[prev_entries]);
+}
+
+#endif /* POWER8_VSX_SLIDEHASH */
@@ -1,6 +1,7 @@
-This directory contains IBM Z DEFLATE CONVERSION CALL support for
-zlib-ng. In order to enable it, the following build commands should be
-used:
+# Introduction
+
+This directory contains SystemZ deflate hardware acceleration support.
+It can be enabled using the following build commands:

    $ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
    $ make
@@ -10,60 +11,206 @@ or
    $ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
    $ make

-When built like this, zlib-ng would compress in hardware on level 1,
-and in software on all other levels. Decompression will always happen
-in hardware. In order to enable DFLTCC compression for levels 1-6 (i.e.
-to make it used by default) one could add -DDFLTCC_LEVEL_MASK=0x7e to
-CFLAGS when building zlib-ng.
+When built like this, zlib-ng compresses in hardware on level 1 and in
+software on all other levels. Decompression always happens in hardware.
+In order to enable hardware compression for levels 1-6 (i.e. so that it
+is used by default), add `-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when
+building zlib-ng.

-Two DFLTCC compression calls produce the same results only when they
-both are made on machines of the same generation, and when the
-respective buffers have the same offset relative to the start of the
-page. Therefore care should be taken when using hardware compression
-when reproducible results are desired.
+SystemZ deflate hardware acceleration is available on [IBM z15](
+https://www.ibm.com/products/z15) and newer machines under the name [
+"Integrated Accelerator for zEnterprise Data Compression"](
+https://www.ibm.com/support/z-content-solutions/compression/). The
+programming interface to it is a machine instruction called DEFLATE
+CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
+of Operation](http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
+the code and the rest of this document refer to this feature simply as
+"DFLTCC".
+
+# Performance
+
+Performance figures are published [here](
+https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
+). The compression speed-up can be as high as 110x and the decompression
+speed-up can be as high as 15x.
+
+# Limitations
+
+Two DFLTCC compression calls with identical inputs are not guaranteed to
+produce identical outputs. Therefore care should be taken when using
+hardware compression when reproducible results are desired. In
+particular, the zlib-ng-specific `zng_deflateSetParams` call allows
+setting the `Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC
+support for a particular stream.

 DFLTCC does not support every single zlib-ng feature, in particular:

-* inflate(Z_BLOCK) and inflate(Z_TREES)
-* inflateMark()
-* inflatePrime()
-* deflateParams() after the first deflate() call
+* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
+* `inflateMark()`
+* `inflatePrime()`
+* `inflateSyncPoint()`

 When used, these functions will either switch to software, or, in case
 this is not possible, gracefully fail.

-All SystemZ-specific code lives in a separate file and is integrated
-with the rest of zlib-ng using hook macros, which are explained below.
+# Code structure
+
+All SystemZ-specific code lives in the `arch/s390` directory and is
+integrated with the rest of zlib-ng using hook macros.
+
+## Hook macros

 DFLTCC takes as arguments a parameter block, an input buffer, an output
-buffer and a window. ZALLOC_STATE, ZFREE_STATE, ZCOPY_STATE,
-ZALLOC_WINDOW and TRY_FREE_WINDOW macros encapsulate allocation details
-for the parameter block (which is allocated alongside zlib-ng state)
-and the window (which must be page-aligned).
+buffer and a window. `ZALLOC_STATE()`, `ZFREE_STATE()`, `ZCOPY_STATE()`,
+`ZALLOC_WINDOW()` and `TRY_FREE_WINDOW()` macros encapsulate allocation
+details for the parameter block (which is allocated alongside zlib-ng
+state) and the window (which must be page-aligned).

-While for inflate software and hardware window formats match, this is
-not the case for deflate. Therefore, deflateSetDictionary and
-deflateGetDictionary need special handling, which is triggered using
-the DEFLATE_SET_DICTIONARY_HOOK and DEFLATE_GET_DICTIONARY_HOOK macros.
+While inflate software and hardware window formats match, this is not
+the case for deflate. Therefore, `deflateSetDictionary()` and
+`deflateGetDictionary()` need special handling, which is triggered using
+`DEFLATE_SET_DICTIONARY_HOOK()` and `DEFLATE_GET_DICTIONARY_HOOK()`
+macros.

-deflateResetKeep() and inflateResetKeep() update the DFLTCC parameter
-block using DEFLATE_RESET_KEEP_HOOK and INFLATE_RESET_KEEP_HOOK macros.
+`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
+parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
+`INFLATE_RESET_KEEP_HOOK()` macros.

-DEFLATE_PARAMS_HOOK, INFLATE_PRIME_HOOK and INFLATE_MARK_HOOK macros
-make the unsupported deflateParams(), inflatePrime() and inflateMark()
-calls fail gracefully.
+`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
+`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
+calls gracefully fail.
+
+`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
+software compression mid-stream using `deflateParams()`. Switching
+normally entails flushing the current block, which might not be possible
+in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
+in order to detect and gracefully handle such situations.

 The algorithm implemented in hardware has different compression ratio
-than the one implemented in software. DEFLATE_BOUND_ADJUST_COMPLEN and
-DEFLATE_NEED_CONSERVATIVE_BOUND macros make deflateBound() return the
-correct results for the hardware implementation.
+than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()`
+and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
+return the correct results for the hardware implementation.

-Actual compression and decompression are handled by DEFLATE_HOOK and
-INFLATE_TYPEDO_HOOK macros. Since inflation with DFLTCC manages the
-window on its own, calling updatewindow() is suppressed using
-INFLATE_NEED_UPDATEWINDOW() macro.
+Actual compression and decompression are handled by `DEFLATE_HOOK()` and
+`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
+window on its own, calling `updatewindow()` is suppressed using
+`INFLATE_NEED_UPDATEWINDOW()` macro.

 In addition to compression, DFLTCC computes CRC-32 and Adler-32
 checksums, therefore, whenever it's used, software checksumming is
-suppressed using DEFLATE_NEED_CHECKSUM and INFLATE_NEED_CHECKSUM
+suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
 macros.
+
+While software always produces reproducible compression results, this
+is not the case for DFLTCC. Therefore, zlib-ng users are given the
+ability to specify whether or not reproducible compression results
+are required. While it is always possible to specify this setting
+before the compression begins, it is not always possible to do so in
+the middle of a deflate stream - the exact conditions for that are
+determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
+
+## SystemZ-specific code
+
+When zlib-ng is built with DFLTCC, the hooks described above are
+converted to calls to functions, which are implemented in
+`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
+categories:
+
+* Base DFLTCC support, e.g. wrapping the machine instruction -
+  `dfltcc()` and allocating aligned memory - `dfltcc_alloc_state()`.
+* Translating between software and hardware data formats, e.g.
+  `dfltcc_deflate_set_dictionary()`.
+* Translating between software and hardware state machines, e.g.
+  `dfltcc_deflate()` and `dfltcc_inflate()`.
+
+The functions from the first two categories are fairly simple, however,
+various quirks in both software and hardware state machines make the
+functions from the third category quite complicated.
+
+### `dfltcc_deflate()` function
+
+This function is called by `deflate()` and has the following
+responsibilities:
+
+* Checking whether DFLTCC can be used with the current stream. If this
+  is not the case, then it returns `0`, making `deflate()` use some
+  other function in order to compress in software. Otherwise it returns
+  `1`.
+* Block management and Huffman table generation. DFLTCC ends blocks only
+  when explicitly instructed to do so by the software. Furthermore,
+  whether to use fixed or dynamic Huffman tables must also be determined
+  by the software. Since looking at data in order to gather statistics
+  would negate performance benefits, the following approach is used: the
+  first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
+  block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
+  dynamic blocks.
+* Writing EOBS. Block Closing Control bit in the parameter block
+  instructs DFLTCC to write EOBS, however, certain conditions need to be
+  met: input data length must be non-zero or Continuation Flag must be
+  set. To put this in simpler terms, DFLTCC will silently refuse to
+  write EOBS if this is the only thing that it is asked to do. Since the
+  code has to be able to emit EOBS in software anyway, in order to avoid
+  tricky corner cases Block Closing Control is never used. Whether to
+  write EOBS is instead controlled by `soft_bcc` variable.
+* Triggering block post-processing. Depending on flush mode, `deflate()`
+  must perform various additional actions when a block or a stream ends.
+  `dfltcc_deflate()` informs `deflate()` about this using
+  `block_state *result` parameter.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
+  and Sub-Byte Boundary. Certain fields cannot be translated and must
+  persist untouched in the parameter block between calls, for example,
+  Continuation Flag or Continuation State Buffer.
+* Handling flush modes and low-memory situations. These aspects are
+  quite intertwined and pervasive. The general idea here is that the
+  code must not do anything in software - whether explicitly by e.g.
+  calling `send_eobs()`, or implicitly - by returning to `deflate()`
+  with certain return and `*result` values, when Continuation Flag is
+  set.
+* Ending streams. When a new block is started and flush mode is
+  `Z_FINISH`, Block Header Final parameter block bit is used to mark
+  this block as final. However, sometimes an empty final block is
+  needed, and, unfortunately, just like with EOBS, DFLTCC will silently
+  refuse to do this. The general idea of DFLTCC implementation is to
+  rely as much as possible on the existing code. Here in order to do
+  this, the code pretends that it does not support DFLTCC, which makes
+  `deflate()` call a software compression function, which writes an
+  empty final block. Whether this is required is controlled by
+  `need_empty_block` variable.
+* Error handling. This is simply converting
+  Operation-Ending-Supplemental Code to string. Errors can only happen
+  due to things like memory corruption, and therefore they don't affect
+  the `deflate()` return code.
+
+### `dfltcc_inflate()` function
+
+This function is called by `inflate()` from the `TYPEDO` state (that is,
+when all the metadata is parsed and the stream is positioned at the type
+bits of deflate block header) and it's responsible for the following:
+
+* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
+  Unfortunately, there is no way to ask DFLTCC to stop decompressing on
+  block or tree boundary.
+* `inflate()` decompression loop management. This is controlled using
+  the return value, which can be either `DFLTCC_INFLATE_BREAK` or
+  `DFLTCC_INFLATE_CONTINUE`.
+* Converting software state fields into hardware parameter block fields,
+  and vice versa. For example, `whave` and History Length or `wnext` and
+  History Offset.
+* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
+  and is controlled by `last` state field.
+* Error handling. Like deflate, error handling comprises
+  Operation-Ending-Supplemental Code to string conversion. Unlike
+  deflate, errors may happen due to bad inputs, therefore they are
+  propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
+
+# Testing
+
+Given the complexity of the DFLTCC machine instruction, it is not clear
+whether QEMU TCG will ever support it. At the time of writing, one has
+to have access to an IBM z15+ VM or LPAR in order to test DFLTCC
+support. Since DFLTCC is a non-privileged instruction, neither a special
+VM/LPAR configuration nor root access is required.
+
+Still, zlib-ng CI has a few QEMU TCG-based configurations that check
+whether fallback to software is working.
@@ -1,6 +1,6 @@
 /* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL general support. */

-#include "zbuild.h"
+#include "../../zbuild.h"
 #include "dfltcc_common.h"
 #include "dfltcc_detail.h"

@@ -12,20 +12,31 @@
   `posix_memalign' is not an option. Thus, we overallocate and take the
   aligned portion of the buffer.
 */
-static inline int is_dfltcc_enabled(void)
-{
+static inline int is_dfltcc_enabled(void) {
    uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
-    register uint8_t r0 __asm__("r0");
+    Z_REGISTER uint8_t r0 __asm__("r0");

    memset(facilities, 0, sizeof(facilities));
    r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
-    __asm__ volatile("stfle %[facilities]\n" : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
+    /* STFLE is supported since z9-109 and only in z/Architecture mode. When
+     * compiling with -m31, gcc defaults to ESA mode, however, since the kernel
+     * is 64-bit, it's always z/Architecture mode at runtime.
+     */
+    __asm__ volatile(
+#ifndef __clang__
+                     ".machinemode push\n"
+                     ".machinemode zarch\n"
+#endif
+                     "stfle %[facilities]\n"
+#ifndef __clang__
+                     ".machinemode pop\n"
+#endif
+                     : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
    return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
 }

-void ZLIB_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size)
-{
-    struct dfltcc_state *dfltcc_state = (struct dfltcc_state *)((char *)strm->state + size);
+void Z_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size) {
+    struct dfltcc_state *dfltcc_state = (struct dfltcc_state *)((char *)strm->state + ALIGN_UP(size, 8));
    struct dfltcc_qaf_param *param = (struct dfltcc_qaf_param *)&dfltcc_state->param;

    /* Initialize available functions */
@@ -47,24 +58,17 @@ void ZLIB_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size)
    dfltcc_state->param.ribm = DFLTCC_RIBM;
 }

-void ZLIB_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size)
-{
-    Assert((items * size) % 8 == 0,
-           "The size of zlib-ng state must be a multiple of 8");
-    return ZALLOC(strm, items * size + sizeof(struct dfltcc_state), sizeof(unsigned char));
+void Z_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size) {
+    return ZALLOC(strm, ALIGN_UP(items * size, 8) + sizeof(struct dfltcc_state), sizeof(unsigned char));
 }

-void ZLIB_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size)
-{
-    memcpy(dst, src, size + sizeof(struct dfltcc_state));
+void Z_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size) {
+    memcpy(dst, src, ALIGN_UP(size, 8) + sizeof(struct dfltcc_state));
 }

 static const int PAGE_ALIGN = 0x1000;

-#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
-
-void ZLIB_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size)
-{
+void Z_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size) {
    void *p;
    void *w;

@@ -79,8 +83,7 @@ void ZLIB_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt
    return w;
 }

-void ZLIB_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w)
-{
+void Z_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w) {
    if (w)
        ZFREE(strm, *(void **)((unsigned char *)w - sizeof(void *)));
 }
@@ -2,17 +2,17 @@
 #define DFLTCC_COMMON_H

 #ifdef ZLIB_COMPAT
-#include "zlib.h"
+#include "../../zlib.h"
 #else
-#include "zlib-ng.h"
+#include "../../zlib-ng.h"
 #endif
-#include "zutil.h"
+#include "../../zutil.h"

-void ZLIB_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size);
-void ZLIB_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size);
-void ZLIB_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size);
-void ZLIB_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size);
-void ZLIB_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w);
+void Z_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size);
+void Z_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size);
+void Z_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size);
+void Z_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size);
+void Z_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w);

 #define ZALLOC_STATE dfltcc_alloc_state

@@ -13,27 +13,26 @@
        $ make
 */

-#include "zbuild.h"
-#include "zutil.h"
-#include "deflate.h"
+#include "../../zbuild.h"
+#include "../../zutil.h"
+#include "../../deflate.h"
+#include "../../trees_emit.h"
 #include "dfltcc_deflate.h"
 #include "dfltcc_detail.h"

-static inline int dfltcc_are_params_ok(int level, uInt window_bits, int strategy, uint16_t level_mask)
-{
-    return (level_mask & ((uint16_t)1 << level)) != 0 &&
-        (window_bits == HB_BITS) &&
-        (strategy == Z_FIXED || strategy == Z_DEFAULT_STRATEGY);
-}
-
-
-int ZLIB_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm)
-{
+static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
+                                       int reproducible) {
    deflate_state *state = (deflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);

    /* Unsupported compression settings */
-    if (!dfltcc_are_params_ok(state->level, state->w_bits, state->strategy, dfltcc_state->level_mask))
+    if ((dfltcc_state->level_mask & (1 << level)) == 0)
+        return 0;
+    if (window_bits != HB_BITS)
+        return 0;
+    if (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY)
+        return 0;
+    if (reproducible)
        return 0;

    /* Unsupported hardware */
@@ -45,8 +44,13 @@ int ZLIB_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm)
    return 1;
 }

-static inline void dfltcc_gdht(PREFIX3(streamp) strm)
-{
+/* Report whether DFLTCC can be used for this stream as currently configured.
+ * Forwards the level, window bits, strategy and reproducible flag stored in
+ * the deflate state to dfltcc_can_deflate_with_params() and returns its
+ * result unchanged. */
+int Z_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm) {
+    deflate_state *ds = (deflate_state *)strm->state;
+    int usable;
+
+    usable = dfltcc_can_deflate_with_params(strm,
+                                            ds->level,
+                                            ds->w_bits,
+                                            ds->strategy,
+                                            ds->reproducible);
+    return usable;
+}
+
+static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
    deflate_state *state = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
    size_t avail_in = strm->avail_in;
@@ -54,8 +58,7 @@ static inline void dfltcc_gdht(PREFIX3(streamp) strm)
    dfltcc(DFLTCC_GDHT, param, NULL, NULL, &strm->next_in, &avail_in, NULL);
 }

-static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm)
-{
+static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
    deflate_state *state = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
    size_t avail_in = strm->avail_in;
@@ -72,11 +75,10 @@ static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm)
    return cc;
 }

-static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param)
-{
+static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
    deflate_state *state = (deflate_state *)strm->state;

-    send_bits(state, bi_reverse(param->eobs >> (15 - param->eobl), param->eobl), param->eobl);
+    send_bits(state, bi_reverse(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid);
    flush_pending(strm);
    if (state->pending != 0) {
        /* The remaining data is located in pending_out[0:pending]. If someone
@@ -93,8 +95,7 @@ static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0
 #endif
 }

-int ZLIB_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result)
-{
+int Z_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result) {
    deflate_state *state = (deflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
    struct dfltcc_param_v0 *param = &dfltcc_state->param;
@@ -104,31 +105,38 @@ int ZLIB_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *
    int soft_bcc;
    int no_flush;

-    if (!dfltcc_can_deflate(strm))
+    if (!dfltcc_can_deflate(strm)) {
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
        return 0;
+    }

 again:
    masked_avail_in = 0;
    soft_bcc = 0;
    no_flush = flush == Z_NO_FLUSH;

-    /* Trailing empty block. Switch to software, except when Continuation Flag
-     * is set, which means that DFLTCC has buffered some output in the
-     * parameter block and needs to be called again in order to flush it.
+    /* No input data. Return, except when Continuation Flag is set, which means
+     * that DFLTCC has buffered some output in the parameter block and needs to
+     * be called again in order to flush it.
     */
-    if (flush == Z_FINISH && strm->avail_in == 0 && !param->cf) {
-        if (param->bcf) {
-            /* A block is still open, and the hardware does not support closing
-             * blocks without adding data. Thus, close it manually.
-             */
+    if (strm->avail_in == 0 && !param->cf) {
+        /* A block is still open, and the hardware does not support closing
+         * blocks without adding data. Thus, close it manually.
+         */
+        if (!no_flush && param->bcf) {
            send_eobs(strm, param);
            param->bcf = 0;
        }
-        return 0;
-    }
-
-    if (strm->avail_in == 0 && !param->cf) {
-        *result = need_more;
+        /* Let one of deflate_* functions write a trailing empty block. */
+        if (flush == Z_FINISH)
+            return 0;
+        /* Clear history. */
+        if (flush == Z_FULL_FLUSH)
+            param->hl = 0;
+        /* Trigger block post-processing if necessary. */
+        *result = no_flush ? need_more : block_done;
        return 1;
    }

@@ -154,13 +162,18 @@ again:
            send_eobs(strm, param);
            param->bcf = 0;
            dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
-            if (strm->avail_out == 0) {
-                *result = need_more;
-                return 1;
-            }
        }
    }

+    /* No space for compressed data. If we proceed, dfltcc_cmpr() will return
+     * DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
+     * set BCF=1, which is wrong. Avoid complications and return early.
+     */
+    if (strm->avail_out == 0) {
+        *result = need_more;
+        return 1;
+    }
+
    /* The caller gave us too much data. Pass only one block worth of
     * uncompressed data to DFLTCC and mask the rest, so that on the next
     * iteration we start a new block.
@@ -180,7 +193,7 @@ again:
    param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
    if (!no_flush)
        /* We need to close a block. Always do this in software - when there is
-         * no input data, the hardware will not nohor BCC. */
+         * no input data, the hardware will not honor BCC. */
        soft_bcc = 1;
    if (flush == Z_FINISH && !param->bcf)
        /* We are about to open a BFINAL block, set Block Header Final bit
@@ -195,8 +208,8 @@ again:
    param->sbb = (unsigned int)state->bi_valid;
    if (param->sbb > 0)
        *strm->next_out = (unsigned char)state->bi_buf;
-    if (param->hl)
-        param->nt = 0; /* Honor history */
+    /* Honor history and check value */
+    param->nt = 0;
    param->cv = state->wrap == 2 ? ZSWAP32(strm->adler) : strm->adler;

    /* When opening a block, choose a Huffman-Table Type */
@@ -277,31 +290,60 @@ again:
   fly with deflateParams, we need to convert between hardware and software
   window formats.
 */
-int ZLIB_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy)
-{
+/* Tell whether the stream already shows traces of compression activity.
+ * Returns 1 if any input has been consumed (total_in > 0), if the parameter
+ * block's nt field is 0, or if its history length hl is non-zero; returns 0
+ * otherwise. */
+static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
+    deflate_state *ds = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(ds)->param;
+
+    if (strm->total_in > 0)
+        return 1;
+    if (pb->nt == 0)
+        return 1;
+    return pb->hl > 0;
+}
+
+int Z_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
    deflate_state *state = (deflate_state *)strm->state;
-    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
-    struct dfltcc_param_v0 *param = &dfltcc_state->param;
    int could_deflate = dfltcc_can_deflate(strm);
-    int can_deflate = dfltcc_are_params_ok(level, state->w_bits, strategy, dfltcc_state->level_mask);
+    int can_deflate = dfltcc_can_deflate_with_params(strm, level, state->w_bits, strategy, state->reproducible);

    if (can_deflate == could_deflate)
        /* We continue to work in the same mode - no changes needed */
        return Z_OK;

-    if (strm->total_in == 0 && param->nt == 1 && param->hl == 0)
+    if (!dfltcc_was_deflate_used(strm))
        /* DFLTCC was not used yet - no changes needed */
        return Z_OK;

-    /* Switching between hardware and software is not implemented */
-    return Z_STREAM_ERROR;
+    /* For now, do not convert between window formats - simply get rid of the old data instead */
+    *flush = Z_FULL_FLUSH;
+    return Z_OK;
+}
+
+/* Decide whether a deflate() call made with the given flush mode has fully
+ * completed. Returns 0 when more work is pending and 1 when deflation is
+ * done. */
+int Z_INTERNAL dfltcc_deflate_done(PREFIX3(streamp) strm, int flush) {
+    deflate_state *ds = (deflate_state *)strm->state;
+    struct dfltcc_param_v0 *pb = &GET_DFLTCC_STATE(ds)->param;
+
+    /* A Z_FULL_FLUSH that ran out of output space may have closed the block
+     * without resetting the compression state, so it cannot be treated as
+     * finished.
+     */
+    if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
+        return 0;
+
+    /* When DFLTCC is not in use there is nothing of its own left pending. */
+    if (!dfltcc_can_deflate(strm))
+        return 1;
+
+    /* With DFLTCC in use, deflation is done only if no data is buffered
+     * (Continuation Flag clear) and EOBS has been written
+     * (Block-Continuation Flag clear).
+     */
+    return !pb->cf && !pb->bcf;
+}
+
+/* Returns non-zero only when the requested reproducible value differs from
+ * the one stored in the deflate state and dfltcc_was_deflate_used() reports
+ * that the stream has not been used yet; returns 0 otherwise. */
+int Z_INTERNAL dfltcc_can_set_reproducible(PREFIX3(streamp) strm, int reproducible) {
+    deflate_state *ds = (deflate_state *)strm->state;
+
+    /* Same value as already stored: report 0 without inspecting the stream. */
+    if (reproducible == ds->reproducible)
+        return 0;
+    return !dfltcc_was_deflate_used(strm);
 }

 /*
   Preloading history.
 */
-static void append_history(struct dfltcc_param_v0 *param, unsigned char *history, const unsigned char *buf, uInt count)
-{
+static void append_history(struct dfltcc_param_v0 *param, unsigned char *history, const unsigned char *buf, uInt count) {
    size_t offset;
    size_t n;

@@ -331,20 +373,19 @@ static void append_history(struct dfltcc_param_v0 *param, unsigned char *history
    }
 }

-int ZLIB_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
-                                                const unsigned char *dictionary, uInt dict_length)
-{
+int Z_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
+                                                const unsigned char *dictionary, uInt dict_length) {
    deflate_state *state = (deflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
    struct dfltcc_param_v0 *param = &dfltcc_state->param;

    append_history(param, state->window, dictionary, dict_length);
    state->strstart = 1; /* Add FDICT to zlib header */
+    state->block_start = state->strstart; /* Make deflate_stored happy */
    return Z_OK;
 }

-int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length)
-{
+int Z_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
    deflate_state *state = (deflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
    struct dfltcc_param_v0 *param = &dfltcc_state->param;
@@ -3,12 +3,14 @@

 #include "dfltcc_common.h"

-int ZLIB_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm);
-int ZLIB_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result);
-int ZLIB_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy);
-int ZLIB_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
+int Z_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm);
+int Z_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result);
+int Z_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy, int *flush);
+int Z_INTERNAL dfltcc_deflate_done(PREFIX3(streamp) strm, int flush);
+int Z_INTERNAL dfltcc_can_set_reproducible(PREFIX3(streamp) strm, int reproducible);
+int Z_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
                                                const unsigned char *dictionary, uInt dict_length);
-int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
+int Z_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);

 #define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
    do { \
@@ -25,15 +27,17 @@ int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned
 #define DEFLATE_RESET_KEEP_HOOK(strm) \
    dfltcc_reset((strm), sizeof(deflate_state))

-#define DEFLATE_PARAMS_HOOK(strm, level, strategy) \
+#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
    do { \
        int err; \
 \
-        err = dfltcc_deflate_params((strm), (level), (strategy)); \
+        err = dfltcc_deflate_params((strm), (level), (strategy), (hook_flush)); \
        if (err == Z_STREAM_ERROR) \
            return err; \
    } while (0)

+#define DEFLATE_DONE dfltcc_deflate_done
+
 #define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
    do { \
        if (dfltcc_can_deflate((strm))) \
@@ -47,4 +51,6 @@ int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned

 #define DEFLATE_NEED_CHECKSUM(strm) (!dfltcc_can_deflate((strm)))

+#define DEFLATE_CAN_SET_REPRODUCIBLE dfltcc_can_set_reproducible
+
 #endif
@@ -46,18 +46,17 @@ typedef enum {
 #define DFLTCC_FACILITY 151

 static inline dfltcc_cc dfltcc(int fn, void *param,
-                               unsigned char **op1, size_t *len1, const unsigned char **op2, size_t *len2, void *hist)
-{
+                               unsigned char **op1, size_t *len1, z_const unsigned char **op2, size_t *len2, void *hist) {
    unsigned char *t2 = op1 ? *op1 : NULL;
    size_t t3 = len1 ? *len1 : 0;
-    const unsigned char *t4 = op2 ? *op2 : NULL;
+    z_const unsigned char *t4 = op2 ? *op2 : NULL;
    size_t t5 = len2 ? *len2 : 0;
-    register int r0 __asm__("r0") = fn;
-    register void *r1 __asm__("r1") = param;
-    register unsigned char *r2 __asm__("r2") = t2;
-    register size_t r3 __asm__("r3") = t3;
-    register const unsigned char *r4 __asm__("r4") = t4;
-    register size_t r5 __asm__("r5") = t5;
+    Z_REGISTER int r0 __asm__("r0") = fn;
+    Z_REGISTER void *r1 __asm__("r1") = param;
+    Z_REGISTER unsigned char *r2 __asm__("r2") = t2;
+    Z_REGISTER size_t r3 __asm__("r3") = t3;
+    Z_REGISTER z_const unsigned char *r4 __asm__("r4") = t4;
+    Z_REGISTER size_t r5 __asm__("r5") = t5;
    int cc;

    __asm__ volatile(
@@ -108,13 +107,11 @@ struct dfltcc_qaf_param {

 static_assert(sizeof(struct dfltcc_qaf_param) == 32, sizeof_struct_dfltcc_qaf_param_is_32);

-static inline int is_bit_set(const char *bits, int n)
-{
+static inline int is_bit_set(const char *bits, int n) {
    return bits[n / 8] & (1 << (7 - (n % 8)));
 }

-static inline void clear_bit(char *bits, int n)
-{
+static inline void clear_bit(char *bits, int n) {
    bits[n / 8] &= ~(1 << (7 - (n % 8)));
 }

@@ -175,8 +172,7 @@ struct dfltcc_param_v0 {

 static_assert(sizeof(struct dfltcc_param_v0) == 1536, sizeof_struct_dfltcc_param_v0_is_1536);

-static inline const char *oesc_msg(char *buf, int oesc)
-{
+static inline z_const char *oesc_msg(char *buf, int oesc) {
    if (oesc == 0x00)
        return NULL; /* Successful completion */
    else {
@@ -198,4 +194,6 @@ struct dfltcc_state {
    char msg[64];                      /* Buffer for strm->msg */
 };

-#define GET_DFLTCC_STATE(state) ((struct dfltcc_state *)((state) + 1))
+#define ALIGN_UP(p, size) ((__typeof__(p))(((uintptr_t)(p) + ((uintptr_t)(size) - 1)) & ~((uintptr_t)(size) - 1)))
+/* Address of the dfltcc_state that follows *state, with sizeof(*state) rounded up to a multiple of 8. */
+#define GET_DFLTCC_STATE(state) ((struct dfltcc_state *)((char *)(state) + ALIGN_UP(sizeof(*(state)), 8)))
@@ -13,15 +13,14 @@
        $ make
 */

-#include "zbuild.h"
-#include "zutil.h"
-#include "inftrees.h"
-#include "inflate.h"
+#include "../../zbuild.h"
+#include "../../zutil.h"
+#include "../../inftrees.h"
+#include "../../inflate.h"
 #include "dfltcc_inflate.h"
 #include "dfltcc_detail.h"

-int ZLIB_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm)
-{
+int Z_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);

@@ -33,8 +32,7 @@ int ZLIB_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm)
    return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0);
 }

-static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm)
-{
+static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
    size_t avail_in = strm->avail_in;
@@ -49,8 +47,7 @@ static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm)
    return cc;
 }

-dfltcc_inflate_action ZLIB_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret)
-{
+dfltcc_inflate_action Z_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
    struct dfltcc_param_v0 *param = &dfltcc_state->param;
@@ -115,16 +112,14 @@ dfltcc_inflate_action ZLIB_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int fl
        DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
 }

-int ZLIB_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm)
-{
+int Z_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;

    return !param->nt;
 }

-int ZLIB_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm)
-{
+int Z_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm) {
    struct inflate_state *state = (struct inflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);

@@ -3,15 +3,15 @@

 #include "dfltcc_common.h"

-int ZLIB_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm);
+int Z_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm);
 typedef enum {
    DFLTCC_INFLATE_CONTINUE,
    DFLTCC_INFLATE_BREAK,
    DFLTCC_INFLATE_SOFTWARE,
 } dfltcc_inflate_action;
-dfltcc_inflate_action ZLIB_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret);
-int ZLIB_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm);
-int ZLIB_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm);
+dfltcc_inflate_action Z_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret);
+int Z_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm);
+int Z_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm);

 #define INFLATE_RESET_KEEP_HOOK(strm) \
    dfltcc_reset((strm), sizeof(struct inflate_state))
@@ -41,4 +41,9 @@ int ZLIB_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm);
        if (dfltcc_was_inflate_used((strm))) return -(1L << 16); \
    } while (0)

+#define INFLATE_SYNC_POINT_HOOK(strm) \
+    do { \
+        if (dfltcc_was_inflate_used((strm))) return Z_STREAM_ERROR; \
+    } while (0)
+
 #endif
@@ -1,3 +0,0 @@
-fill_window_sse.c	SSE2 optimized fill_window
-deflate_quick.c		SSE4 optimized deflate strategy for use as level 1
-crc_folding.c		SSE4 + PCLMULQDQ optimized CRC folding implementation
@@ -0,0 +1,8 @@
+Contents
+--------
+
+|Name|Description|
+|:-|:-|
+|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
+|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
+|slide_sse.c|SSE2 optimized slide_hash|
@@ -8,7 +8,9 @@ SFLAGS=
 INCLUDES=
 SUFFIX=

+AVX2FLAG=-mavx2
 SSE2FLAG=-msse2
+SSSE3FLAG=-mssse3
 SSE4FLAG=-msse4
 PCLMULFLAG=-mpclmul

@@ -16,7 +18,18 @@ SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)

-all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo
+all: \
+	x86.o x86.lo \
+	adler32_avx.o adler32_avx.lo \
+	adler32_ssse3.o adler32_ssse3.lo \
+	chunkset_avx.o chunkset_avx.lo \
+	chunkset_sse.o chunkset_sse.lo \
+	compare258_avx.o compare258_avx.lo \
+	compare258_sse.o compare258_sse.lo \
+	insert_string_sse.o insert_string_sse.lo \
+	crc_folding.o crc_folding.lo \
+	slide_avx.o slide_avx.lo \
+	slide_sse.o slide_sse.lo

 x86.o:
 	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
@@ -24,17 +37,29 @@ x86.o:
 x86.lo:
 	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c

-fill_window_sse.o:
-	$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
+chunkset_avx.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c

-fill_window_sse.lo:
-	$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
+chunkset_avx.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c

-deflate_quick.o:
-	$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
+chunkset_sse.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c

-deflate_quick.lo:
-	$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
+chunkset_sse.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
+
+compare258_avx.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
+
+compare258_avx.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
+
+compare258_sse.o:
+	$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
+
+compare258_sse.lo:
+	$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c

 insert_string_sse.o:
 	$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
@@ -48,6 +73,30 @@ crc_folding.o:
 crc_folding.lo:
 	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c

+slide_avx.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+
+slide_avx.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+
+slide_sse.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
+
+slide_sse.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
+
+adler32_avx.o: $(SRCDIR)/adler32_avx.c
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
+
+adler32_avx.lo: $(SRCDIR)/adler32_avx.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
+
+adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
 mostlyclean: clean
 clean:
 	rm -f *.o *.lo *~
@@ -0,0 +1,117 @@
+/* adler32_avx.c -- compute the Adler-32 checksum of a data stream using AVX2
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "../../adler32_p.h"
+
+#include <immintrin.h>
+
+#ifdef X86_AVX2_ADLER32
+
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
+    uint32_t sum2;
+
+     /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    uint32_t ALIGNED_(32) s1[8], s2[8];
+
+    memset(s1, 0, sizeof(s1)); s1[7] = adler; // TODO: would a masked load be faster?
+    memset(s2, 0, sizeof(s2)); s2[7] = sum2;
+
+    char ALIGNED_(32) dot1[32] = \
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+    __m256i dot1v = _mm256_load_si256((__m256i*)dot1);
+    char ALIGNED_(32) dot2[32] = \
+        {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+         16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+    __m256i dot2v = _mm256_load_si256((__m256i*)dot2);
+    short ALIGNED_(32) dot3[16] = \
+        {1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1};
+    __m256i dot3v = _mm256_load_si256((__m256i*)dot3);
+
+    // Shift count 5: vs1_0 is multiplied by 32 (the bytes consumed per step) via _mm256_sll_epi32
+    char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    __m128i shiftv = _mm_load_si128((__m128i*)shift);
+
+    while (len >= 32) {
+       __m256i vs1 = _mm256_load_si256((__m256i*)s1);
+       __m256i vs2 = _mm256_load_si256((__m256i*)s2);
+       __m256i vs1_0 = vs1;
+
+       int k = (len < NMAX ? (int)len : NMAX);
+       k -= k % 32;
+       len -= k;
+
+       while (k >= 32) {
+           /*
+              vs1 = adler + sum(c[i])
+              vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+           */
+           __m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
+           buf += 32;
+           k -= 32;
+
+           __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 16 shorts.
+           __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v);   // sum 16 shorts to 8 int32_t;
+           __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v);
+           vs1 = _mm256_add_epi32(vsum1, vs1);
+           __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
+           vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);
+           vsum2 = _mm256_add_epi32(vsum2, vs2);
+           vs2   = _mm256_add_epi32(vsum2, vs1_0);
+           vs1_0 = vs1;
+       }
+
+       // At this point, we have partial sums stored in vs1 and vs2.  There are AVX512 instructions that
+       // would allow us to sum these quickly (VP4DPWSSD).  For now, just unpack and move on.
+       uint32_t ALIGNED_(32) s1_unpack[8];
+       uint32_t ALIGNED_(32) s2_unpack[8];
+
+       _mm256_store_si256((__m256i*)s1_unpack, vs1);
+       _mm256_store_si256((__m256i*)s2_unpack, vs2);
+
+       adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
+               (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+       adler %= BASE;
+       s1[7] = adler;
+
+       sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) +
+              (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
+       sum2 %= BASE;
+       s2[7] = sum2;
+    }
+
+    while (len) {
+        len--;
+        adler += *buf++;
+        sum2 += adler;
+    }
+    adler %= BASE;
+    sum2 %= BASE;
+
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+
+#endif
@@ -0,0 +1,118 @@
+/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream using SSSE3
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "../../adler32_p.h"
+
+#ifdef X86_SSSE3_ADLER32
+
+#include <immintrin.h>
+
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
+    uint32_t sum2;
+
+     /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    uint32_t ALIGNED_(16) s1[4], s2[4];
+
+    s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
+    s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
+
+    char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+    __m128i dot1v = _mm_load_si128((__m128i*)dot1);
+    char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+    __m128i dot2v = _mm_load_si128((__m128i*)dot2);
+    short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
+    __m128i dot3v = _mm_load_si128((__m128i*)dot3);
+
+    // Shift count 4: vs1_0 is multiplied by 16 (the bytes consumed per step) via _mm_sll_epi32,
+    // which takes its count from the low 64 bits of shiftv, hence the 4 in element 0 below.
+
+    char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    __m128i shiftv = _mm_load_si128((__m128i*)shift);
+
+    while (len >= 16) {
+       __m128i vs1 = _mm_load_si128((__m128i*)s1);
+       __m128i vs2 = _mm_load_si128((__m128i*)s2);
+       __m128i vs1_0 = vs1;
+
+       int k = (len < NMAX ? (int)len : NMAX);
+       k -= k % 16;
+       len -= k;
+
+       while (k >= 16) {
+           /*
+              vs1 = adler + sum(c[i])
+              vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+
+              NOTE: 256-bit equivalents are:
+                _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
+                _mm256_madd_epi16    <- Sums 16 shorts to 8 int32_t.
+              We could rewrite the below to use 256-bit instructions instead of 128-bit.
+           */
+           __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
+           buf += 16;
+           k -= 16;
+
+           __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
+           __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v);  // sum 8 shorts to 4 int32_t;
+           __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+           vs1 = _mm_add_epi32(vsum1, vs1);
+           __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+           vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
+           vsum2 = _mm_add_epi32(vsum2, vs2);
+           vs2   = _mm_add_epi32(vsum2, vs1_0);
+           vs1_0 = vs1;
+       }
+
+       // At this point, we have partial sums stored in vs1 and vs2.  There are AVX512 instructions that
+       // would allow us to sum these quickly (VP4DPWSSD).  For now, just unpack and move on.
+
+       uint32_t ALIGNED_(16) s1_unpack[4];
+       uint32_t ALIGNED_(16) s2_unpack[4];
+
+       _mm_store_si128((__m128i*)s1_unpack, vs1);
+       _mm_store_si128((__m128i*)s2_unpack, vs2);
+
+       adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
+       adler %= BASE;
+       s1[3] = adler;
+
+       sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
+       sum2 %= BASE;
+       s2[3] = sum2;
+    }
+
+    while (len) {
+        len--;
+        adler += *buf++;
+        sum2 += adler;
+    }
+    adler %= BASE;
+    sum2 %= BASE;
+
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+
+#endif
@@ -0,0 +1,50 @@
+/* chunkset_avx.c -- AVX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "zutil.h"
+
+#ifdef X86_AVX_CHUNKSET
+#include <immintrin.h>
+
+typedef __m256i chunk_t;
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi8(*(int8_t *)from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi16(*(int16_t *)from);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi32(*(int32_t *)from);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi64x(*(int64_t *)from);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+#define CHUNKSIZE        chunksize_avx
+#define CHUNKCOPY        chunkcopy_avx
+#define CHUNKCOPY_SAFE   chunkcopy_safe_avx
+#define CHUNKUNROLL      chunkunroll_avx
+#define CHUNKMEMSET      chunkmemset_avx
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx
+
+#include "chunkset_tpl.h"
+
+#endif
@@ -0,0 +1,51 @@
+/* chunkset_sse.c -- SSE inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+
+#ifdef X86_SSE2
+#include <immintrin.h>
+
+typedef __m128i chunk_t;
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm_set1_epi8(*(int8_t *)from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm_set1_epi16(*(int16_t *)from);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm_set1_epi32(*(int32_t *)from);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm_set1_epi64x(*(int64_t *)from);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+#define CHUNKSIZE        chunksize_sse2
+#define CHUNKCOPY        chunkcopy_sse2
+#define CHUNKCOPY_SAFE   chunkcopy_safe_sse2
+#define CHUNKUNROLL      chunkunroll_sse2
+#define CHUNKMEMSET      chunkmemset_sse2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
+
+#include "chunkset_tpl.h"
+
+#endif
@@ -0,0 +1,67 @@
+/* compare258_avx.c -- AVX2 version of compare258
+ * Copyright Mika T. Lindqvist  <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+#  include <nmmintrin.h>
+#endif
+
+/* UNALIGNED_OK, AVX2 intrinsic comparison */
+static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
+    uint32_t len = 0;
+
+    do {
+        __m256i ymm_src0, ymm_src1, ymm_cmp;
+        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+        unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+
+        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
+        mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+    } while (len < 256);
+
+    return 256;
+}
+
+static inline uint32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
+    if (*(uint16_t *)src0 != *(uint16_t *)src1)
+        return (*src0 == *src1);
+
+    return compare256_unaligned_avx2_static(src0+2, src1+2) + 2;
+}
+
+Z_INTERNAL uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) {
+    return compare258_unaligned_avx2_static(src0, src1);
+}
+
+#define LONGEST_MATCH   longest_match_unaligned_avx2
+#define COMPARE256      compare256_unaligned_avx2_static
+#define COMPARE258      compare258_unaligned_avx2_static
+
+#include "match_tpl.h"
+
+#endif
@@ -0,0 +1,74 @@
+/* compare258_sse.c -- SSE4.2 version of compare258
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Authors:
+ *  Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *  Jim Guilford    <james.guilford@intel.com>
+ *  Vinodh Gopal    <vinodh.gopal@intel.com>
+ *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *  Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * Portions are Copyright (C) 2016 12Sided Technology, LLC.
+ * Author:
+ *  Phil Vachon     <pvachon@12sidedtech.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#ifdef X86_SSE42_CMP_STR
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+#  include <nmmintrin.h>
+#endif
+
+/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
+static inline uint32_t compare256_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
+    uint32_t len = 0;
+
+    do {
+        #define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
+        __m128i xmm_src0, xmm_src1;
+        uint32_t ret;
+
+        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
+        ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
+        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
+            return len + ret;
+        }
+        src0 += 16, src1 += 16, len += 16;
+
+        xmm_src0 = _mm_loadu_si128((__m128i *)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i *)src1);
+        ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
+        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
+            return len + ret;
+        }
+        src0 += 16, src1 += 16, len += 16;
+    } while (len < 256);
+
+    return 256;
+}
+
+static inline uint32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
+    if (*(uint16_t *)src0 != *(uint16_t *)src1)
+        return (*src0 == *src1);
+
+    return compare256_unaligned_sse4_static(src0+2, src1+2) + 2;
+}
+
+Z_INTERNAL uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) {
+    return compare258_unaligned_sse4_static(src0, src1);
+}
+
+#define LONGEST_MATCH   longest_match_unaligned_sse4
+#define COMPARE256      compare256_unaligned_sse4_static
+#define COMPARE258      compare258_unaligned_sse4_static
+
+#include "match_tpl.h"
+
+#endif
@@ -1,5 +1,5 @@
 /*
- * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ 
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
@@ -18,14 +18,14 @@

 #ifdef X86_PCLMULQDQ_CRC

-#include "zbuild.h"
+#include "../../zbuild.h"
 #include <inttypes.h>
 #include <immintrin.h>
 #include <wmmintrin.h>

 #include "crc_folding.h"

-ZLIB_INTERNAL void crc_fold_init(deflate_state *const s) {
+Z_INTERNAL void crc_fold_init(deflate_state *const s) {
    /* CRC_SAVE */
    _mm_storeu_si128((__m128i *)s->crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
    _mm_storeu_si128((__m128i *)s->crc0 + 1, _mm_setzero_si128());
@@ -227,9 +227,10 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
    *xmm_crc3 = _mm_castps_si128(ps_res);
 }

-ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
+Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
    unsigned long algn_diff;
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+    char ALIGNED_(16) partial_buf[16] = { 0 };

    /* CRC_LOAD */
    __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
@@ -241,11 +242,14 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
    if (len < 16) {
        if (len == 0)
            return;
-        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
+
+        memcpy(partial_buf, src, len);
+        xmm_crc_part = _mm_loadu_si128((const __m128i *)partial_buf);
+        memcpy(dst, partial_buf, len);
        goto partial;
    }

-    algn_diff = (0 - (uintptr_t)src) & 0xF;
+    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
    if (algn_diff) {
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
@@ -255,6 +259,8 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
        len -= algn_diff;

        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+    } else {
+        xmm_crc_part = _mm_setzero_si128();
    }

    while ((len -= 64) >= 0) {
@@ -305,7 +311,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
            goto done;

        dst += 48;
-        xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
+        memcpy(&xmm_crc_part, (__m128i *)src + 3, len);
    } else if (len + 32 >= 0) {
        len += 32;

@@ -324,7 +330,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
            goto done;

        dst += 32;
-        xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
+        memcpy(&xmm_crc_part, (__m128i *)src + 2, len);
    } else if (len + 48 >= 0) {
        len += 48;

@@ -340,16 +346,18 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
            goto done;

        dst += 16;
-        xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
+        memcpy(&xmm_crc_part, (__m128i *)src + 1, len);
    } else {
        len += 64;
        if (len == 0)
            goto done;
-        xmm_crc_part = _mm_load_si128((__m128i *)src);
+        memcpy(&xmm_crc_part, src, len);
    }

+    _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
+    memcpy(dst, partial_buf, len);
+
 partial:
-    _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
    partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
 done:
    /* CRC_SAVE */
@@ -377,7 +385,7 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
 };

-uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
+uint32_t Z_INTERNAL crc_fold_512to32(deflate_state *const s) {
    const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);

@@ -447,4 +455,3 @@ uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
 }

 #endif
-
@@ -10,10 +10,10 @@
 #ifndef CRC_FOLDING_H_
 #define CRC_FOLDING_H_

-#include "deflate.h"
+#include "../../deflate.h"

-ZLIB_INTERNAL void crc_fold_init(deflate_state *const);
-ZLIB_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
-ZLIB_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
+Z_INTERNAL void crc_fold_init(deflate_state *const);
+Z_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
+Z_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);

 #endif
@@ -1,25 +0,0 @@
-#ifndef X86_CTZL_H
-#define X86_CTZL_H
-
-#include <intrin.h>
-#ifdef X86_CPUID
-# include "x86.h"
-#endif
-
-#if defined(_MSC_VER) && !defined(__clang__)
-/* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0
- * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
- */
-static __forceinline unsigned long __builtin_ctzl(unsigned long value)
-{
-#ifdef X86_CPUID
-	if (x86_cpu_has_tzcnt)
-		return _tzcnt_u32(value);
-#endif
-	unsigned long trailing_zero;
-	_BitScanForward(&trailing_zero, value);
-	return trailing_zero;
-}
-#endif
-
-#endif
@@ -1,175 +0,0 @@
-/*
- * Fill Window with SSE2-optimized hash shifting
- *
- * Copyright (C) 2013 Intel Corporation
- * Authors:
- *  Arjan van de Ven    <arjan@linux.intel.com>
- *  Jim Kukunas         <james.t.kukunas@linux.intel.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-#ifdef X86_SSE2
-
-#include "zbuild.h"
-#include <immintrin.h>
-#include "deflate.h"
-#include "deflate_p.h"
-#include "functable.h"
-
-extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
-
-ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
-    const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
-
-    register unsigned n;
-    register Pos *p;
-    unsigned more;    /* Amount of free space at the end of the window. */
-    unsigned int wsize = s->w_size;
-
-    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
-
-    do {
-        more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
-
-        /* Deal with !@#$% 64K limit: */
-        if (sizeof(int) <= 2) {
-            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
-                more = wsize;
-
-            } else if (more == (unsigned)(-1)) {
-                /* Very unlikely, but possible on 16 bit machine if
-                 * strstart == 0 && lookahead == 1 (input done a byte at time)
-                 */
-                more--;
-            }
-        }
-
-        /* If the window is almost full and there is insufficient lookahead,
-         * move the upper half to the lower one to make room in the upper half.
-         */
-        if (s->strstart >= wsize+MAX_DIST(s)) {
-            memcpy(s->window, s->window+wsize, (unsigned)wsize);
-            s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
-            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
-            s->block_start -= (long) wsize;
-
-            /* Slide the hash table (could be avoided with 32 bit values
-               at the expense of memory usage). We slide even when level == 0
-               to keep the hash table consistent if we switch back to level > 0
-               later. (Using level 0 permanently is not an optimal usage of
-               zlib, so we don't care about this pathological case.)
-             */
-            n = s->hash_size;
-            p = &s->head[n];
-            p -= 8;
-            do {
-                __m128i value, result;
-
-                value = _mm_loadu_si128((__m128i *)p);
-                result = _mm_subs_epu16(value, xmm_wsize);
-                _mm_storeu_si128((__m128i *)p, result);
-
-                p -= 8;
-                n -= 8;
-            } while (n > 0);
-
-            n = wsize;
-            p = &s->prev[n];
-            p -= 8;
-            do {
-                __m128i value, result;
-
-                value = _mm_loadu_si128((__m128i *)p);
-                result = _mm_subs_epu16(value, xmm_wsize);
-                _mm_storeu_si128((__m128i *)p, result);
-
-                p -= 8;
-                n -= 8;
-            } while (n > 0);
-            more += wsize;
-        }
-        if (s->strm->avail_in == 0) break;
-
-        /* If there was no sliding:
-         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
-         *    more == window_size - lookahead - strstart
-         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
-         * => more >= window_size - 2*WSIZE + 2
-         * In the BIG_MEM or MMAP case (not yet supported),
-         *   window_size == input_size + MIN_LOOKAHEAD  &&
-         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
-         * Otherwise, window_size == 2*WSIZE so more >= 2.
-         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
-         */
-        Assert(more >= 2, "more < 2");
-
-        n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
-        s->lookahead += n;
-
-        /* Initialize the hash value now that we have some input: */
-        if (s->lookahead + s->insert >= MIN_MATCH) {
-            unsigned int str = s->strstart - s->insert;
-            s->ins_h = s->window[str];
-            if (str >= 1)
-                functable.insert_string(s, str + 2 - MIN_MATCH, 1);
-#if MIN_MATCH != 3
-#error Call insert_string() MIN_MATCH-3 more times
-            while (s->insert) {
-                functable.insert_string(s, str, 1);
-                str++;
-                s->insert--;
-                if (s->lookahead + s->insert < MIN_MATCH)
-                    break;
-            }
-#else
-            unsigned int count;
-            if (unlikely(s->lookahead == 1)){
-                count = s->insert - 1;
-            }else{
-                count = s->insert;
-            }
-            functable.insert_string(s, str, count);
-            s->insert -= count;
-#endif
-        }
-        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
-         * but this is not important since only literal bytes will be emitted.
-         */
-    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
-
-    /* If the WIN_INIT bytes after the end of the current data have never been
-     * written, then zero those bytes in order to avoid memory check reports of
-     * the use of uninitialized (or uninitialised as Julian writes) bytes by
-     * the longest match routines.  Update the high water mark for the next
-     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
-     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
-     */
-    if (s->high_water < s->window_size) {
-        unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
-        unsigned long init;
-
-        if (s->high_water < curr) {
-            /* Previous high water mark below current data -- zero WIN_INIT
-             * bytes or up to end of window, whichever is less.
-             */
-            init = s->window_size - curr;
-            if (init > WIN_INIT)
-                init = WIN_INIT;
-            memset(s->window + curr, 0, (unsigned)init);
-            s->high_water = curr + init;
-        } else if (s->high_water < (unsigned long)curr + WIN_INIT) {
-            /* High water mark at or above current data, but below current data
-             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
-             * to end of window, whichever is less.
-             */
-            init = (unsigned long)curr + WIN_INIT - s->high_water;
-            if (init > s->window_size - s->high_water)
-                init = s->window_size - s->high_water;
-            memset(s->window + s->high_water, 0, (unsigned)init);
-            s->high_water += init;
-        }
-    }
-
-    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
-}
-#endif
@@ -5,52 +5,42 @@
 *
 */

-#include "zbuild.h"
-#include "deflate.h"
-
-/* ===========================================================================
- * Insert string str in the dictionary and set match_head to the previous head
- * of the hash chain (the most recent string with same hash key). Return
- * the previous length of the hash chain.
- * IN  assertion: all calls to to INSERT_STRING are made with consecutive
- *    input characters and the first MIN_MATCH bytes of str are valid
- *    (except for the last MIN_MATCH-1 bytes of the input file).
- */
-#ifdef X86_SSE4_2_CRC_HASH
-ZLIB_INTERNAL Pos insert_string_sse(deflate_state *const s, const Pos str, unsigned int count) {
-    Pos ret = 0;
-    unsigned int idx;
-    unsigned int *ip, val, h;
-
-    for (idx = 0; idx < count; idx++) {
-        ip = (unsigned *)&s->window[str+idx];
-        memcpy(&val, ip, sizeof(val));
-        h = 0;
-
-        if (s->level >= TRIGGER_LEVEL)
-            val &= 0xFFFFFF;
-
+#include "../../zbuild.h"
+#include <immintrin.h>
 #ifdef _MSC_VER
-        h = _mm_crc32_u32(h, val);
-#elif defined(X86_SSE4_2_CRC_INTRIN)
-        h = __builtin_ia32_crc32si(h, val);
+#  include <nmmintrin.h>
+#endif
+#include "../../deflate.h"
+
+#ifdef X86_SSE42_CRC_INTRIN
+#  ifdef _MSC_VER
+#    define UPDATE_HASH(s, h, val)\
+        h = _mm_crc32_u32(h, val)
+#  else
+#    define UPDATE_HASH(s, h, val)\
+        h = __builtin_ia32_crc32si(h, val)
+#  endif
 #else
-        __asm__ __volatile__ (
-            "crc32 %1,%0\n\t"
-            : "+r" (h)
-            : "r" (val)
-        );
-#endif
-        Pos head = s->head[h & s->hash_mask];
-        if (head != str+idx) {
-            s->prev[(str+idx) & s->w_mask] = head;
-            s->head[h & s->hash_mask] = str+idx;
-            if (idx == count-1)
-              ret = head;
-        } else if (idx == count - 1) {
-          ret = str + idx;
-        }
+#  ifdef _MSC_VER
+/* MSVC inline-assembly fallback used when the CRC32 intrinsic is unavailable.
+ * Same contract as the other UPDATE_HASH variants: h = crc32(h, val).
+ * The accumulator lives in eax, the data word in edx, and the result is
+ * written back to h (not val) so the caller sees the updated hash. */
+#    define UPDATE_HASH(s, h, val) {\
+        __asm mov eax, h\
+        __asm mov edx, val\
+        __asm crc32 eax, edx\
+        __asm mov h, eax\
    }
-    return ret;
-}
+#  else
+#    define UPDATE_HASH(s, h, val) \
+        __asm__ __volatile__ (\
+            "crc32 %1,%0\n\t"\
+            : "+r" (h)\
+            : "r" (val)\
+        );
+#  endif
+#endif
+
+#define INSERT_STRING       insert_string_sse4
+#define QUICK_INSERT_STRING quick_insert_string_sse4
+
+#ifdef X86_SSE42_CRC_HASH
+#  include "../../insert_string_tpl.h"
 #endif
@@ -0,0 +1,47 @@
+/*
+ * AVX2 optimized hash slide, based on Intel's slide_sse implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven   <arjan@linux.intel.com>
+ *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
+ *   Mika T. Lindqvist  <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+
+/* Rebase one table of positions after the window has moved by wsize.
+ * Each 256-bit group (16 slots, handled as unsigned 16-bit lanes) has the
+ * replicated wsize subtracted with unsigned saturation.  Groups are visited
+ * from the end of the table down to its start.  The loop body runs before
+ * the count is tested, so entries has to be a non-zero multiple of 16. */
+static void slide_hash_chain_avx2(Pos *table, unsigned entries, const __m256i vec_wsize) {
+    Pos *cursor = table + entries;
+    unsigned remaining = entries;
+
+    do {
+        __m256i slots;
+
+        cursor -= 16;
+        slots = _mm256_loadu_si256((__m256i *)cursor);
+        slots = _mm256_subs_epu16(slots, vec_wsize);
+        _mm256_storeu_si256((__m256i *)cursor, slots);
+        remaining -= 16;
+    } while (remaining != 0);
+}
+
+/* AVX2 hash slide: applies the saturating "minus w_size" adjustment first to
+ * the HASH_SIZE entries of s->head and then to the w_size entries of s->prev. */
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+    const uint16_t window = (uint16_t)s->w_size;
+    const __m256i vec_wsize = _mm256_set1_epi16((short)window);
+
+    slide_hash_chain_avx2(s->head, HASH_SIZE, vec_wsize);
+    slide_hash_chain_avx2(s->prev, window, vec_wsize);
+}
@@ -0,0 +1,46 @@
+/*
+ * SSE optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven   <arjan@linux.intel.com>
+ *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+
+/* Rebase one table of positions after the window has moved by wsize.
+ * Each 128-bit group (8 slots, handled as unsigned 16-bit lanes) has the
+ * replicated wsize subtracted with unsigned saturation.  Groups are visited
+ * from the end of the table down to its start.  The loop body runs before
+ * the count is tested, so entries has to be a non-zero multiple of 8. */
+static void slide_hash_chain_sse2(Pos *table, unsigned entries, const __m128i vec_wsize) {
+    Pos *cursor = table + entries;
+    unsigned remaining = entries;
+
+    do {
+        __m128i slots;
+
+        cursor -= 8;
+        slots = _mm_loadu_si128((__m128i *)cursor);
+        slots = _mm_subs_epu16(slots, vec_wsize);
+        _mm_storeu_si128((__m128i *)cursor, slots);
+        remaining -= 8;
+    } while (remaining != 0);
+}
+
+/* SSE2 hash slide: applies the saturating "minus w_size" adjustment first to
+ * the HASH_SIZE entries of s->head and then to the w_size entries of s->prev. */
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+    const uint16_t window = (uint16_t)s->w_size;
+    const __m128i vec_wsize = _mm_set1_epi16((short)window);
+
+    slide_hash_chain_sse2(s->head, HASH_SIZE, vec_wsize);
+    slide_hash_chain_sse2(s->prev, window, vec_wsize);
+}
@@ -8,61 +8,73 @@
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

-#include "zutil.h"
+#include "../../zutil.h"

 #ifdef _MSC_VER
-#include <intrin.h>
+#  include <intrin.h>
 #else
 // Newer versions of GCC and clang come with cpuid.h
-#include <cpuid.h>
+#  include <cpuid.h>
 #endif

-ZLIB_INTERNAL int x86_cpu_has_sse2;
-ZLIB_INTERNAL int x86_cpu_has_sse42;
-ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
-ZLIB_INTERNAL int x86_cpu_has_tzcnt;
+Z_INTERNAL int x86_cpu_has_avx2;
+Z_INTERNAL int x86_cpu_has_sse2;
+Z_INTERNAL int x86_cpu_has_ssse3;
+Z_INTERNAL int x86_cpu_has_sse42;
+Z_INTERNAL int x86_cpu_has_pclmulqdq;
+Z_INTERNAL int x86_cpu_has_tzcnt;

+/* Run CPUID for leaf `info` and hand the resulting EAX, EBX, ECX and EDX values
+ * back through the four out-pointers.  MSVC goes through the __cpuid intrinsic
+ * and a scratch array; other compilers use __cpuid from <cpuid.h>, which
+ * receives the pointed-to objects directly. */
 static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
 #ifdef _MSC_VER
-	unsigned int registers[4];
-	__cpuid(registers, info);
+    unsigned int registers[4];
+    __cpuid((int *)registers, info);

-	*eax = registers[0];
-	*ebx = registers[1];
-	*ecx = registers[2];
-	*edx = registers[3];
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
 #else
-	unsigned int _eax;
-	unsigned int _ebx;
-	unsigned int _ecx;
-	unsigned int _edx;
-	__cpuid(info, _eax, _ebx, _ecx, _edx);
-	*eax = _eax;
-	*ebx = _ebx;
-	*ecx = _ecx;
-	*edx = _edx;
+    __cpuid(info, *eax, *ebx, *ecx, *edx);
 #endif
 }

-void ZLIB_INTERNAL x86_check_features(void) {
-	unsigned eax, ebx, ecx, edx;
-	unsigned maxbasic;
+/* Like cpuid(), but also selects a sub-leaf: runs CPUID for leaf `info` with
+ * `subinfo` as the sub-leaf index and returns EAX, EBX, ECX and EDX through the
+ * out-pointers.  MSVC uses __cpuidex with a scratch array; other compilers use
+ * __cpuid_count. */
+static void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#ifdef _MSC_VER
+    unsigned int registers[4];
+    __cpuidex((int *)registers, info, subinfo);

-	cpuid(0, &maxbasic, &ebx, &ecx, &edx);
-
-	cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
-
-	x86_cpu_has_sse2 = edx & 0x4000000;
-	x86_cpu_has_sse42 = ecx & 0x100000;
-	x86_cpu_has_pclmulqdq = ecx & 0x2;
-
-	if (maxbasic >= 7) {
-	  cpuid(7, &eax, &ebx, &ecx, &edx);
-
-	  // check BMI1 bit
-	  // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
-	  x86_cpu_has_tzcnt = ebx & 0x8;
-	} else {
-	  x86_cpu_has_tzcnt = 0;
-	}
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#else
+    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
+#endif
+}
+
+/* Return the low 32 bits of extended control register `xcr` (XGETBV).
+ * Must only be called once CPUID.1:ECX.OSXSAVE has been observed set;
+ * the instruction is undefined otherwise. */
+static unsigned xgetbv_low(unsigned xcr) {
+#ifdef _MSC_VER
+    return (unsigned)_xgetbv(xcr);
+#else
+    unsigned lo, hi;
+
+    /* Emitted as raw opcode bytes so assemblers without the xgetbv mnemonic still build. */
+    __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (lo), "=d" (hi) : "c" (xcr));
+    (void)hi;
+    return lo;
+#endif
+}
+
+/* Probe the CPU once and fill in the x86_cpu_has_* flags.
+ *
+ * Leaf 1 supplies SSE2, SSSE3, SSE4.2 and PCLMULQDQ.  Leaf 7 (queried only when
+ * the CPU reports it) supplies BMI1/TZCNT and AVX2.  AVX2 is reported only if
+ * the operating system has also enabled saving of the XMM and YMM register
+ * state; the CPUID bit alone does not guarantee AVX2 code can run. */
+void Z_INTERNAL x86_check_features(void) {
+    unsigned eax, ebx, ecx, edx;
+    unsigned maxbasic;
+    int os_saves_ymm = 0;
+
+    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
+
+    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
+
+    x86_cpu_has_sse2 = edx & 0x4000000;
+    x86_cpu_has_ssse3 = ecx & 0x200;
+    x86_cpu_has_sse42 = ecx & 0x100000;
+    x86_cpu_has_pclmulqdq = ecx & 0x2;
+
+    // OSXSAVE (bit 27) and AVX (bit 28) must both be set before XCR0 may be read;
+    // XCR0 bits 1 and 2 then tell whether the OS preserves XMM and YMM state.
+    if ((ecx & 0x18000000) == 0x18000000)
+        os_saves_ymm = (xgetbv_low(0) & 0x6) == 0x6;
+
+    if (maxbasic >= 7) {
+        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
+
+        // check BMI1 bit
+        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
+        x86_cpu_has_tzcnt = ebx & 0x8;
+        // check AVX2 bit, usable only when the OS saves YMM state
+        x86_cpu_has_avx2 = os_saves_ymm ? (int)(ebx & 0x20) : 0;
+    } else {
+        x86_cpu_has_tzcnt = 0;
+        x86_cpu_has_avx2 = 0;
+    }
+}
@@ -1,16 +1,18 @@
- /* cpu.h -- check for CPU features
- * Copyright (C) 2013 Intel Corporation Jim Kukunas
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
+/* cpu.h -- check for CPU features
+* Copyright (C) 2013 Intel Corporation Jim Kukunas
+* For conditions of distribution and use, see copyright notice in zlib.h
+*/

 #ifndef CPU_H_
 #define CPU_H_

+extern int x86_cpu_has_avx2;
 extern int x86_cpu_has_sse2;
+extern int x86_cpu_has_ssse3;
 extern int x86_cpu_has_sse42;
 extern int x86_cpu_has_pclmulqdq;
 extern int x86_cpu_has_tzcnt;

-void ZLIB_INTERNAL x86_check_features(void);
+void Z_INTERNAL x86_check_features(void);

 #endif /* CPU_H_ */