Fix for x86 zlib issue

KimLS 2019-06-30 18:28:11 -07:00
parent e086f53b97
commit f50d502f3d
11 changed files with 3275 additions and 2 deletions

.gitignore

@@ -24,8 +24,6 @@ Makefile
 cmake_install.cmake
 install_manifest.txt
 [Bb]uild*/
-x64/
-x86/
 log/
 logs/
 vcpkg/


@@ -0,0 +1,3 @@
fill_window_sse.c SSE2 optimized fill_window
deflate_quick.c SSE4 optimized deflate strategy for use as level 1
crc_folding.c SSE4 + PCLMULQDQ optimized CRC folding implementation


@@ -0,0 +1,58 @@
# Makefile for zlib
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=
SSE2FLAG=-msse2
SSE4FLAG=-msse4
PCLMULFLAG=-mpclmul
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo

x86.o:
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c

x86.lo:
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c

fill_window_sse.o:
	$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c

fill_window_sse.lo:
	$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c

deflate_quick.o:
	$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c

deflate_quick.lo:
	$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c

insert_string_sse.o:
	$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c

insert_string_sse.lo:
	$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c

crc_folding.o:
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c

crc_folding.lo:
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c

mostlyclean: clean

clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean:
	rm -f Makefile


@@ -0,0 +1,450 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_PCLMULQDQ_CRC
#include "zbuild.h"
#include <inttypes.h>
#include <immintrin.h>
#include <wmmintrin.h>
#include "crc_folding.h"
ZLIB_INTERNAL void crc_fold_init(deflate_state *const s) {
/* CRC_SAVE */
_mm_storeu_si128((__m128i *)s->crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
_mm_storeu_si128((__m128i *)s->crc0 + 1, _mm_setzero_si128());
_mm_storeu_si128((__m128i *)s->crc0 + 2, _mm_setzero_si128());
_mm_storeu_si128((__m128i *)s->crc0 + 3, _mm_setzero_si128());
s->strm->adler = 0;
}
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc3, ps_res;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
*xmm_crc0 = *xmm_crc1;
*xmm_crc1 = *xmm_crc2;
*xmm_crc2 = x_tmp3;
*xmm_crc3 = _mm_castps_si128(ps_res);
}
static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3, x_tmp2;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
x_tmp3 = *xmm_crc3;
x_tmp2 = *xmm_crc2;
*xmm_crc3 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
*xmm_crc2 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
*xmm_crc0 = x_tmp2;
*xmm_crc1 = x_tmp3;
*xmm_crc2 = _mm_castps_si128(ps_res20);
*xmm_crc3 = _mm_castps_si128(ps_res31);
}
static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc2;
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
*xmm_crc2 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
*xmm_crc1 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
*xmm_crc0 = x_tmp3;
*xmm_crc1 = _mm_castps_si128(ps_res10);
*xmm_crc2 = _mm_castps_si128(ps_res21);
*xmm_crc3 = _mm_castps_si128(ps_res32);
}
static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
__m128 ps_t0, ps_t1, ps_t2, ps_t3;
__m128 ps_res0, ps_res1, ps_res2, ps_res3;
x_tmp0 = *xmm_crc0;
x_tmp1 = *xmm_crc1;
x_tmp2 = *xmm_crc2;
x_tmp3 = *xmm_crc3;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_t0 = _mm_castsi128_ps(x_tmp0);
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_t1 = _mm_castsi128_ps(x_tmp1);
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_t2 = _mm_castsi128_ps(x_tmp2);
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_t3 = _mm_castsi128_ps(x_tmp3);
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
*xmm_crc0 = _mm_castps_si128(ps_res0);
*xmm_crc1 = _mm_castps_si128(ps_res1);
*xmm_crc2 = _mm_castps_si128(ps_res2);
*xmm_crc3 = _mm_castps_si128(ps_res3);
}
static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 2)/shr2 */
0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 3)/shr3 */
0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */
0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */
0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */
0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/
0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/
0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/
0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
};
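/*
 * Note on the table above: partial_fold() indexes it with (len - 1), so each
 * 16-byte row is the PSHUFB mask for a partial block of len bytes (1..15).
 * Mask bytes with the high bit set make PSHUFB write zero, so the row itself
 * shifts a register left by (16 - len) bytes, while the same row XORed with
 * xmm_mask3 (0x80808080) shifts it right by len bytes.
 */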
static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
__m128i *xmm_crc3, __m128i *xmm_crc_part) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
__m128i xmm_a0_0, xmm_a0_1;
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
xmm_shr = xmm_shl;
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
psa0_0 = _mm_castsi128_ps(xmm_a0_0);
psa0_1 = _mm_castsi128_ps(xmm_a0_1);
ps_res = _mm_xor_ps(ps_crc3, psa0_0);
ps_res = _mm_xor_ps(ps_res, psa0_1);
*xmm_crc3 = _mm_castps_si128(ps_res);
}
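/*
 * crc_fold_copy() below copies len bytes from src to dst while folding them
 * into the four 128-bit accumulators kept in s->crc0: a partial_fold() pass
 * brings src up to 16-byte alignment, the main loop consumes 64 bytes per
 * iteration with fold_4(), fold_3()/fold_2()/fold_1() handle the remaining
 * whole 16-byte blocks, and a final partial_fold() absorbs any tail shorter
 * than 16 bytes.
 */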
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
/* CRC_LOAD */
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);
__m128i xmm_crc_part;
if (len < 16) {
if (len == 0)
return;
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
goto partial;
}
algn_diff = (0 - (uintptr_t)src) & 0xF;
if (algn_diff) {
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
dst += algn_diff;
src += algn_diff;
len -= algn_diff;
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
}
while ((len -= 64) >= 0) {
/* CRC_LOAD */
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
/* CRC_SAVE */
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
src += 64;
dst += 64;
}
/*
* len = num bytes left - 64
*/
if (len + 16 >= 0) {
len += 16;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
if (len == 0)
goto done;
dst += 48;
xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
} else if (len + 32 >= 0) {
len += 32;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
if (len == 0)
goto done;
dst += 32;
xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
} else if (len + 48 >= 0) {
len += 48;
xmm_t0 = _mm_load_si128((__m128i *)src);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
if (len == 0)
goto done;
dst += 16;
xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
} else {
len += 64;
if (len == 0)
goto done;
xmm_crc_part = _mm_load_si128((__m128i *)src);
}
partial:
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
done:
/* CRC_SAVE */
_mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);
_mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);
_mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);
_mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);
_mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);
}
static const unsigned ALIGNED_(16) crc_k[] = {
0xccaa009e, 0x00000000, /* rk1 */
0x751997d0, 0x00000001, /* rk2 */
0xccaa009e, 0x00000000, /* rk5 */
0x63cd6124, 0x00000001, /* rk6 */
0xf7011640, 0x00000001, /* rk7 */
0xdb710640, 0x00000001 /* rk8 */
};
static const unsigned ALIGNED_(16) crc_mask[4] = {
0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};
static const unsigned ALIGNED_(16) crc_mask2[4] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
uint32_t crc;
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
/* CRC_LOAD */
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);
/*
* k1
*/
crc_fold = _mm_load_si128((__m128i *)crc_k);
x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
/*
* k5
*/
crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
/*
* k7
*/
xmm_crc1 = xmm_crc3;
xmm_crc2 = xmm_crc3;
crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
xmm_crc2 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
crc = _mm_extract_epi32(xmm_crc3, 2);
return ~crc;
}
#endif
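For orientation, this is how the three entry points fit together. The caller below is a sketch based only on the declarations in this file (its name and one-shot shape are assumptions, not part of this diff); crc_fold_copy() may be called any number of times between the init and the final reduction, since the folding state persists in s->crc0.

/* Illustrative caller -- not part of this commit. */
#include "deflate.h"
#include "crc_folding.h"

static uint32_t crc32_while_copying(deflate_state *s, unsigned char *dst,
                                    const unsigned char *src, long len) {
    crc_fold_init(s);                 /* seed the four 128-bit accumulators */
    crc_fold_copy(s, dst, src, len);  /* copy src -> dst, folding the CRC   */
    return crc_fold_512to32(s);       /* reduce the 512-bit state to CRC-32 */
}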


@@ -0,0 +1,19 @@
/* crc_folding.h
*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CRC_FOLDING_H_
#define CRC_FOLDING_H_
#include "deflate.h"
ZLIB_INTERNAL void crc_fold_init(deflate_state *const);
ZLIB_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
#endif


@@ -0,0 +1,25 @@
#ifndef X86_CTZL_H
#define X86_CTZL_H
#include <intrin.h>
#ifdef X86_CPUID
# include "x86.h"
#endif
#if defined(_MSC_VER) && !defined(__clang__)
/* This is not a general-purpose replacement for __builtin_ctzl. The function expects that value != 0.
 * Because of that assumption, trailing_zero is not initialized and the return value of _BitScanForward is not checked.
*/
static __forceinline unsigned long __builtin_ctzl(unsigned long value)
{
#ifdef X86_CPUID
if (x86_cpu_has_tzcnt)
return _tzcnt_u32(value);
#endif
unsigned long trailing_zero;
_BitScanForward(&trailing_zero, value);
return trailing_zero;
}
#endif
#endif
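A quick illustration of the contract described in the comment above: the argument must be non-zero, and small powers of two make the trailing-zero count obvious (illustrative only, not part of this commit).

/* Illustrative checks -- not part of this commit. */
#include <assert.h>

static void ctzl_examples(void) {
    assert(__builtin_ctzl(0x1) == 0);  /* lowest set bit at position 0 */
    assert(__builtin_ctzl(0x8) == 3);  /* 0b1000 -> three trailing zeros */
    /* __builtin_ctzl(0) is undefined here: _BitScanForward's return value is
     * deliberately ignored, so the result would be garbage. */
}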

File diff suppressed because it is too large


@@ -0,0 +1,175 @@
/*
* Fill Window with SSE2-optimized hash shifting
*
* Copyright (C) 2013 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_SSE2
#include "zbuild.h"
#include <immintrin.h>
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
register unsigned n;
register Pos *p;
unsigned more; /* Amount of free space at the end of the window. */
unsigned int wsize = s->w_size;
Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
do {
more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
/* Deal with !@#$% 64K limit: */
if (sizeof(int) <= 2) {
if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
more = wsize;
} else if (more == (unsigned)(-1)) {
/* Very unlikely, but possible on 16 bit machine if
* strstart == 0 && lookahead == 1 (input done a byte at time)
*/
more--;
}
}
/* If the window is almost full and there is insufficient lookahead,
* move the upper half to the lower one to make room in the upper half.
*/
if (s->strstart >= wsize+MAX_DIST(s)) {
memcpy(s->window, s->window+wsize, (unsigned)wsize);
s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
s->block_start -= (long) wsize;
/* Slide the hash table (could be avoided with 32 bit values
at the expense of memory usage). We slide even when level == 0
to keep the hash table consistent if we switch back to level > 0
later. (Using level 0 permanently is not an optimal usage of
zlib, so we don't care about this pathological case.)
*/
n = s->hash_size;
p = &s->head[n];
p -= 8;
do {
__m128i value, result;
value = _mm_loadu_si128((__m128i *)p);
result = _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
} while (n > 0);
n = wsize;
p = &s->prev[n];
p -= 8;
do {
__m128i value, result;
value = _mm_loadu_si128((__m128i *)p);
result = _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
} while (n > 0);
more += wsize;
}
if (s->strm->avail_in == 0) break;
/* If there was no sliding:
* strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
* more == window_size - lookahead - strstart
* => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
* => more >= window_size - 2*WSIZE + 2
* In the BIG_MEM or MMAP case (not yet supported),
* window_size == input_size + MIN_LOOKAHEAD &&
* strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
* Otherwise, window_size == 2*WSIZE so more >= 2.
* If there was sliding, more >= WSIZE. So in all cases, more >= 2.
*/
Assert(more >= 2, "more < 2");
n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
s->lookahead += n;
/* Initialize the hash value now that we have some input: */
if (s->lookahead + s->insert >= MIN_MATCH) {
unsigned int str = s->strstart - s->insert;
s->ins_h = s->window[str];
if (str >= 1)
functable.insert_string(s, str + 2 - MIN_MATCH, 1);
#if MIN_MATCH != 3
#error Call insert_string() MIN_MATCH-3 more times
while (s->insert) {
functable.insert_string(s, str, 1);
str++;
s->insert--;
if (s->lookahead + s->insert < MIN_MATCH)
break;
}
#else
unsigned int count;
if (unlikely(s->lookahead == 1)){
count = s->insert - 1;
}else{
count = s->insert;
}
functable.insert_string(s, str, count);
s->insert -= count;
#endif
}
/* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
* but this is not important since only literal bytes will be emitted.
*/
} while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
/* If the WIN_INIT bytes after the end of the current data have never been
* written, then zero those bytes in order to avoid memory check reports of
* the use of uninitialized (or uninitialised as Julian writes) bytes by
* the longest match routines. Update the high water mark for the next
* time through here. WIN_INIT is set to MAX_MATCH since the longest match
* routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
*/
if (s->high_water < s->window_size) {
unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
unsigned long init;
if (s->high_water < curr) {
/* Previous high water mark below current data -- zero WIN_INIT
* bytes or up to end of window, whichever is less.
*/
init = s->window_size - curr;
if (init > WIN_INIT)
init = WIN_INIT;
memset(s->window + curr, 0, (unsigned)init);
s->high_water = curr + init;
} else if (s->high_water < (unsigned long)curr + WIN_INIT) {
/* High water mark at or above current data, but below current data
* plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
* to end of window, whichever is less.
*/
init = (unsigned long)curr + WIN_INIT - s->high_water;
if (init > s->window_size - s->high_water)
init = s->window_size - s->high_water;
memset(s->window + s->high_water, 0, (unsigned)init);
s->high_water += init;
}
}
Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
}
#endif
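The two _mm_subs_epu16 loops above vectorize the usual zlib hash-table slide, eight 16-bit entries at a time. A scalar sketch of what one pass computes (assuming Pos is a 16-bit type, which the epu16 saturating subtract relies on):

/* Scalar equivalent of the SSE2 slide -- illustrative only, not part of this commit. */
static void slide_table_scalar(Pos *table, unsigned n, unsigned wsize) {
    while (n-- > 0) {
        unsigned m = table[n];
        /* unsigned saturating subtract: entries pointing below the slid
         * window drop to 0 (the NIL hash head) instead of wrapping */
        table[n] = (Pos)(m >= wsize ? m - wsize : 0);
    }
}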


@@ -0,0 +1,56 @@
/* insert_string_sse -- insert_string variant using SSE4.2's CRC instructions
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
*/
#include "zbuild.h"
#include "deflate.h"
/* ===========================================================================
* Insert string str in the dictionary and set match_head to the previous head
* of the hash chain (the most recent string with same hash key). Return
* the previous length of the hash chain.
* IN assertion: all calls to INSERT_STRING are made with consecutive
* input characters and the first MIN_MATCH bytes of str are valid
* (except for the last MIN_MATCH-1 bytes of the input file).
*/
#ifdef X86_SSE4_2_CRC_HASH
ZLIB_INTERNAL Pos insert_string_sse(deflate_state *const s, const Pos str, unsigned int count) {
Pos ret = 0;
unsigned int idx;
unsigned int *ip, val, h;
for (idx = 0; idx < count; idx++) {
ip = (unsigned *)&s->window[str+idx];
memcpy(&val, ip, sizeof(val));
h = 0;
if (s->level >= TRIGGER_LEVEL)
val &= 0xFFFFFF;
#ifdef _MSC_VER
h = _mm_crc32_u32(h, val);
#elif defined(X86_SSE4_2_CRC_INTRIN)
h = __builtin_ia32_crc32si(h, val);
#else
__asm__ __volatile__ (
"crc32 %1,%0\n\t"
: "+r" (h)
: "r" (val)
);
#endif
Pos head = s->head[h & s->hash_mask];
if (head != str+idx) {
s->prev[(str+idx) & s->w_mask] = head;
s->head[h & s->hash_mask] = str+idx;
if (idx == count-1)
ret = head;
} else if (idx == count - 1) {
ret = str + idx;
}
}
return ret;
}
#endif
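The return value is the previous head of the hash chain. A sketch of how a deflate loop typically consumes it, following the stock zlib pattern (longest_match, MAX_DIST and the functable hook are assumptions here, not part of this diff):

/* Hypothetical call site -- illustrative only. */
Pos hash_head = functable.insert_string(s, s->strstart, 1);
if (hash_head != 0 && s->strstart - hash_head <= MAX_DIST(s)) {
    /* a recent string with the same hash lies inside the window;
     * try to extend it into a match starting at strstart */
    s->match_length = longest_match(s, hash_head);
}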


@@ -0,0 +1,68 @@
/*
* x86 feature check
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Author:
* Jim Kukunas
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zutil.h"
#ifdef _MSC_VER
#include <intrin.h>
#else
// Newer versions of GCC and clang come with cpuid.h
#include <cpuid.h>
#endif
ZLIB_INTERNAL int x86_cpu_has_sse2;
ZLIB_INTERNAL int x86_cpu_has_sse42;
ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
ZLIB_INTERNAL int x86_cpu_has_tzcnt;
static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
unsigned int registers[4];
__cpuid(registers, info);
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
#else
unsigned int _eax;
unsigned int _ebx;
unsigned int _ecx;
unsigned int _edx;
/* Request sub-leaf 0 explicitly: the plain __cpuid() macro leaves ECX
 * undefined, which matters for the leaf-7 query below (MSVC's __cpuid
 * intrinsic already zeroes ECX). */
__cpuid_count(info, 0, _eax, _ebx, _ecx, _edx);
*eax = _eax;
*ebx = _ebx;
*ecx = _ecx;
*edx = _edx;
#endif
}
void ZLIB_INTERNAL x86_check_features(void) {
unsigned eax, ebx, ecx, edx;
unsigned maxbasic;
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
x86_cpu_has_sse2 = edx & 0x4000000;       /* CPUID.1:EDX bit 26 */
x86_cpu_has_sse42 = ecx & 0x100000;       /* CPUID.1:ECX bit 20 */
x86_cpu_has_pclmulqdq = ecx & 0x2;        /* CPUID.1:ECX bit 1 */
if (maxbasic >= 7) {
cpuid(7, &eax, &ebx, &ecx, &edx);
// check the BMI1 bit (TZCNT is part of BMI1): CPUID.(EAX=7,ECX=0):EBX bit 3
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
x86_cpu_has_tzcnt = ebx & 0x8;
} else {
x86_cpu_has_tzcnt = 0;
}
}


@@ -0,0 +1,16 @@
/* cpu.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CPU_H_
#define CPU_H_
extern int x86_cpu_has_sse2;
extern int x86_cpu_has_sse42;
extern int x86_cpu_has_pclmulqdq;
extern int x86_cpu_has_tzcnt;
void ZLIB_INTERNAL x86_check_features(void);
#endif /* CPU_H_ */
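Taken together, x86_check_features() plus these flags are the selection points the rest of the tree can key off. A minimal dispatch sketch follows; the dispatcher itself is an assumption for illustration, only the flag names and the arch functions come from this commit.

/* Illustrative runtime dispatch -- assumed wiring, not part of this diff. */
#include "x86.h"

void init_arch_dispatch(void) {
    x86_check_features();

    if (x86_cpu_has_sse2) {
        /* safe to use fill_window_sse() */
    }
    if (x86_cpu_has_sse42) {
        /* safe to use insert_string_sse() and the SSE4 deflate_quick strategy */
    }
    if (x86_cpu_has_sse42 && x86_cpu_has_pclmulqdq) {
        /* safe to use the crc_fold_* PCLMULQDQ path */
    }
}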