mirror of
https://github.com/EQEmu/Server.git
synced 2026-06-24 17:48:20 +00:00
[Library] Update zlibng (#1255)
* Update zlibng * Set cmake path more directly in zlibng to hopefully fix an issue with the build on drone * I'm dumb, missing / in path * Mackal helps with a dumb gitignore issue * Adding all the files, not sure what's ignoring them and im tired of looking * Some tweaks to zlibng build to hopefully get it to build properly. works on msvc now
This commit is contained in:
@@ -1,3 +0,0 @@
|
||||
fill_window_sse.c SSE2 optimized fill_window
|
||||
deflate_quick.c SSE4 optimized deflate strategy for use as level 1
|
||||
crc_folding.c SSE4 + PCLMULQDQ optimized CRC folding implementation
|
||||
@@ -0,0 +1,8 @@
|
||||
Contents
|
||||
--------
|
||||
|
||||
|Name|Description|
|
||||
|:-|:-|
|
||||
|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
|
||||
|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
|
||||
|slide_sse.c|SSE2 optimized slide_hash|
|
||||
@@ -8,7 +8,9 @@ SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
AVX2FLAG=-mavx2
|
||||
SSE2FLAG=-msse2
|
||||
SSSE3FLAG=-mssse3
|
||||
SSE4FLAG=-msse4
|
||||
PCLMULFLAG=-mpclmul
|
||||
|
||||
@@ -16,7 +18,18 @@ SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo
|
||||
# Default target: the static (.o) and PIC (.lo) object for each x86-specific source.
# The AVX Adler-32 PIC object is adler32_avx.lo (built from adler32_avx.c), not adler32.lo.
all: \
	x86.o x86.lo \
	adler32_avx.o adler32_avx.lo \
	adler32_ssse3.o adler32_ssse3.lo \
	chunkset_avx.o chunkset_avx.lo \
	chunkset_sse.o chunkset_sse.lo \
	compare258_avx.o compare258_avx.lo \
	compare258_sse.o compare258_sse.lo \
	insert_string_sse.o insert_string_sse.lo \
	crc_folding.o crc_folding.lo \
	slide_avx.o slide_avx.lo \
	slide_sse.o slide_sse.lo
|
||||
|
||||
x86.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
|
||||
@@ -24,17 +37,29 @@ x86.o:
|
||||
x86.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
|
||||
|
||||
fill_window_sse.o:
|
||||
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
|
||||
chunkset_avx.o:
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
|
||||
|
||||
fill_window_sse.lo:
|
||||
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
|
||||
chunkset_avx.lo:
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
|
||||
|
||||
deflate_quick.o:
|
||||
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
|
||||
chunkset_sse.o:
|
||||
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
|
||||
|
||||
deflate_quick.lo:
|
||||
$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
|
||||
chunkset_sse.lo:
|
||||
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
|
||||
|
||||
compare258_avx.o:
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
|
||||
|
||||
compare258_avx.lo:
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
|
||||
|
||||
compare258_sse.o:
|
||||
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
|
||||
|
||||
compare258_sse.lo:
|
||||
$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
|
||||
|
||||
insert_string_sse.o:
|
||||
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
|
||||
@@ -48,6 +73,30 @@ crc_folding.o:
|
||||
crc_folding.lo:
|
||||
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
|
||||
|
||||
slide_avx.o:
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
|
||||
|
||||
slide_avx.lo:
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
|
||||
|
||||
slide_sse.o:
|
||||
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
|
||||
|
||||
slide_sse.lo:
|
||||
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
|
||||
|
||||
adler32_avx.o: $(SRCDIR)/adler32_avx.c
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
|
||||
|
||||
# PIC build of the AVX2 Adler-32 object; -DPIC added to match the other SIMD .lo rules here.
adler32_avx.lo: $(SRCDIR)/adler32_avx.c
	$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
|
||||
|
||||
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
|
||||
$(CC) $(CFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
|
||||
|
||||
# PIC build of the SSSE3 Adler-32 object; -DPIC added to match the other SIMD .lo rules here.
adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
	$(CC) $(SFLAGS) $(SSSE3FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
/* adler32.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
#include "../../adler32_p.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#ifdef X86_AVX2_ADLER32
|
||||
|
||||
/* Update a running Adler-32 checksum with len bytes from buf, 32 bytes per step using AVX2.
 *
 * adler  running checksum: low 16 bits hold the byte sum, high 16 bits the sum of running sums
 * buf    input data
 * len    number of bytes at buf
 *
 * Returns the updated checksum. A single byte (len == 1) and short inputs (len < 16) are handed
 * to adler32_len_1() / adler32_len_16(); a NULL buf with len != 1 yields 1.
 */
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
    /* Constant operands, kept static so they are not rebuilt on the stack for every call. */
    static const char ALIGNED_(32) dot1[32] =
        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    static const char ALIGNED_(32) dot2[32] =
        {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
         16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    static const short ALIGNED_(32) dot3[16] =
        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    /* Shift count 5: the previous vs1 is multiplied by 32, the number of bytes consumed per step. */
    static const char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    uint32_t sum2;

    /* split Adler-32 into component sums */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    /* The running sums are carried in the last lane; the other seven lanes start at zero. */
    uint32_t ALIGNED_(32) s1[8] = {0};
    uint32_t ALIGNED_(32) s2[8] = {0};
    s1[7] = adler;
    s2[7] = sum2;

    const __m256i dot1v = _mm256_load_si256((const __m256i *)dot1);
    const __m256i dot2v = _mm256_load_si256((const __m256i *)dot2);
    const __m256i dot3v = _mm256_load_si256((const __m256i *)dot3);
    const __m128i shiftv = _mm_load_si128((const __m128i *)shift);

    while (len >= 32) {
        __m256i vs1 = _mm256_load_si256((const __m256i *)s1);
        __m256i vs2 = _mm256_load_si256((const __m256i *)s2);
        __m256i vs1_0 = vs1;

        /* Process at most NMAX bytes, rounded down to whole 32-byte steps, before reducing. */
        int k = (len < NMAX ? (int)len : NMAX);
        k -= k % 32;
        len -= k;

        while (k >= 32) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
            */
            __m256i vbuf = _mm256_loadu_si256((const __m256i *)buf);
            buf += 32;
            k -= 32;

            __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 16 shorts.
            __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v);   // sum 16 shorts to 8 int32_t
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // bytes weighted 32..1, as 16 shorts
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);

            vs1 = _mm256_add_epi32(vsum1, vs1);
            vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);                  // 32 * (vs1 before this step)
            vsum2 = _mm256_add_epi32(vsum2, vs2);
            vs2 = _mm256_add_epi32(vsum2, vs1_0);
            vs1_0 = vs1;
        }

        // At this point, we have partial sums stored in vs1 and vs2.  There are AVX512 instructions that
        // would allow us to sum these quickly (VP4DPWSSD).  For now, just unpack and move on.
        uint32_t ALIGNED_(32) s1_unpack[8];
        uint32_t ALIGNED_(32) s2_unpack[8];

        _mm256_store_si256((__m256i *)s1_unpack, vs1);
        _mm256_store_si256((__m256i *)s2_unpack, vs2);

        /* Fold the eight lanes back into one value each, reducing modulo BASE. */
        adler = 0;
        sum2 = 0;
        for (int lane = 0; lane < 8; lane++) {
            adler += s1_unpack[lane] % BASE;
            sum2 += s2_unpack[lane] % BASE;
        }
        adler %= BASE;
        sum2 %= BASE;
        s1[7] = adler;
        s2[7] = sum2;
    }

    /* Fewer than 32 bytes remain: finish them one byte at a time. */
    while (len) {
        len--;
        adler += *buf++;
        sum2 += adler;
    }
    adler %= BASE;
    sum2 %= BASE;

    /* return recombined sums */
    return adler | (sum2 << 16);
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,118 @@
|
||||
/* adler32.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
#include "../../adler32_p.h"
|
||||
|
||||
#ifdef X86_SSSE3_ADLER32
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
/* Update a running Adler-32 checksum with len bytes from buf, 16 bytes per step using SSSE3.
 *
 * adler  running checksum: low 16 bits hold the byte sum, high 16 bits the sum of running sums
 * buf    input data
 * len    number of bytes at buf
 *
 * Returns the updated checksum. A single byte (len == 1) and short inputs (len < 16) are handed
 * to adler32_len_1() / adler32_len_16(); a NULL buf with len != 1 yields 1.
 */
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
    /* Constant operands, kept static so they are not rebuilt on the stack for every call. */
    static const char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    static const char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    static const short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
    /* Shift count 4: the previous vs1 is multiplied by 16, the number of bytes consumed per step. */
    static const char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    uint32_t sum2;

    /* split Adler-32 into component sums */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    /* The running sums are carried in the last lane; the other three lanes start at zero. */
    uint32_t ALIGNED_(16) s1[4] = {0};
    uint32_t ALIGNED_(16) s2[4] = {0};
    s1[3] = adler;
    s2[3] = sum2;

    const __m128i dot1v = _mm_load_si128((const __m128i *)dot1);
    const __m128i dot2v = _mm_load_si128((const __m128i *)dot2);
    const __m128i dot3v = _mm_load_si128((const __m128i *)dot3);
    const __m128i shiftv = _mm_load_si128((const __m128i *)shift);

    while (len >= 16) {
        __m128i vs1 = _mm_load_si128((const __m128i *)s1);
        __m128i vs2 = _mm_load_si128((const __m128i *)s2);
        __m128i vs1_0 = vs1;

        /* Process at most NMAX bytes, rounded down to whole 16-byte steps, before reducing. */
        int k = (len < NMAX ? (int)len : NMAX);
        k -= k % 16;
        len -= k;

        while (k >= 16) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )

               NOTE: 256-bit equivalents are:
                 _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
                 _mm256_madd_epi16    <- Sums 16 shorts to 8 int32_t.
               We could rewrite the below to use 256-bit instructions instead of 128-bit.
            */
            __m128i vbuf = _mm_loadu_si128((const __m128i *)buf);
            buf += 16;
            k -= 16;

            __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
            __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v);   // sum 8 shorts to 4 int32_t
            __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); // bytes weighted 16..1, as 8 shorts
            __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);

            vs1 = _mm_add_epi32(vsum1, vs1);
            vs1_0 = _mm_sll_epi32(vs1_0, shiftv);                  // 16 * (vs1 before this step)
            vsum2 = _mm_add_epi32(vsum2, vs2);
            vs2 = _mm_add_epi32(vsum2, vs1_0);
            vs1_0 = vs1;
        }

        // At this point, we have partial sums stored in vs1 and vs2.  There are AVX512 instructions that
        // would allow us to sum these quickly (VP4DPWSSD).  For now, just unpack and move on.
        uint32_t ALIGNED_(16) s1_unpack[4];
        uint32_t ALIGNED_(16) s2_unpack[4];

        _mm_store_si128((__m128i *)s1_unpack, vs1);
        _mm_store_si128((__m128i *)s2_unpack, vs2);

        /* Fold the four lanes back into one value each, reducing modulo BASE. */
        adler = 0;
        sum2 = 0;
        for (int lane = 0; lane < 4; lane++) {
            adler += s1_unpack[lane] % BASE;
            sum2 += s2_unpack[lane] % BASE;
        }
        adler %= BASE;
        sum2 %= BASE;
        s1[3] = adler;
        s2[3] = sum2;
    }

    /* Fewer than 16 bytes remain: finish them one byte at a time. */
    while (len) {
        len--;
        adler += *buf++;
        sum2 += adler;
    }
    adler %= BASE;
    sum2 %= BASE;

    /* return recombined sums */
    return adler | (sum2 << 16);
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,50 @@
|
||||
/* chunkset_avx.c -- AVX inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
|
||||
#ifdef X86_AVX_CHUNKSET
|
||||
#include <immintrin.h>
|
||||
|
||||
typedef __m256i chunk_t;
|
||||
|
||||
#define HAVE_CHUNKMEMSET_1
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi8(*(int8_t *)from);
|
||||
}
|
||||
|
||||
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi16(*(int16_t *)from);
|
||||
}
|
||||
|
||||
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi32(*(int32_t *)from);
|
||||
}
|
||||
|
||||
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi64x(*(int64_t *)from);
|
||||
}
|
||||
|
||||
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
|
||||
*chunk = _mm256_loadu_si256((__m256i *)s);
|
||||
}
|
||||
|
||||
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
|
||||
_mm256_storeu_si256((__m256i *)out, *chunk);
|
||||
}
|
||||
|
||||
#define CHUNKSIZE chunksize_avx
|
||||
#define CHUNKCOPY chunkcopy_avx
|
||||
#define CHUNKCOPY_SAFE chunkcopy_safe_avx
|
||||
#define CHUNKUNROLL chunkunroll_avx
|
||||
#define CHUNKMEMSET chunkmemset_avx
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,51 @@
|
||||
/* chunkset_sse.c -- SSE inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
|
||||
#ifdef X86_SSE2
|
||||
#include <immintrin.h>
|
||||
|
||||
/* One chunk is a single 128-bit vector (16 bytes). */
typedef __m128i chunk_t;

/* Announce which chunkmemset_N helpers are defined below (presumably consumed by chunkset_tpl.h). */
#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

/* Fill *chunk with repetitions of the single byte at from. */
static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
    *chunk = _mm_set1_epi8(*(int8_t *)from);
}

/* Fill *chunk with repetitions of the 2-byte pattern at from.
 * from may be unaligned, so the pattern is read with memcpy rather than through a cast pointer. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t pattern;
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi16(pattern);
}

/* Fill *chunk with repetitions of the 4-byte pattern at from (unaligned-safe read). */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t pattern;
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi32(pattern);
}

/* Fill *chunk with repetitions of the 8-byte pattern at from (unaligned-safe read). */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t pattern;
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi64x(pattern);
}

/* Load one chunk (16 bytes) from s; no alignment is required. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm_loadu_si128((const __m128i *)s);
}

/* Store one chunk (16 bytes) to out; no alignment is required. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, *chunk);
}
|
||||
|
||||
#define CHUNKSIZE chunksize_sse2
|
||||
#define CHUNKCOPY chunkcopy_sse2
|
||||
#define CHUNKCOPY_SAFE chunkcopy_safe_sse2
|
||||
#define CHUNKUNROLL chunkunroll_sse2
|
||||
#define CHUNKMEMSET chunkmemset_sse2
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,67 @@
|
||||
/* compare258_avx.c -- AVX2 version of compare258
|
||||
* Copyright Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
|
||||
|
||||
#include <immintrin.h>
|
||||
#ifdef _MSC_VER
|
||||
# include <nmmintrin.h>
|
||||
#endif
|
||||
|
||||
/* UNALIGNED_OK, AVX2 intrinsic comparison */
|
||||
static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
__m256i ymm_src0, ymm_src1, ymm_cmp;
|
||||
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
|
||||
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
|
||||
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
|
||||
unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
|
||||
if (mask != 0xFFFFFFFF) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
|
||||
return len + match_byte;
|
||||
}
|
||||
|
||||
src0 += 32, src1 += 32, len += 32;
|
||||
|
||||
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
|
||||
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
|
||||
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
|
||||
mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
|
||||
if (mask != 0xFFFFFFFF) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
|
||||
return len + match_byte;
|
||||
}
|
||||
|
||||
src0 += 32, src1 += 32, len += 32;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
/* Return the length of the common prefix of src0 and src1, capped at 258 bytes.
 * The first two bytes are compared with memcmp (no misaligned or type-punned 16-bit load);
 * the remaining 256 are delegated to compare256_unaligned_avx2_static(). */
static inline uint32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
    /* First two bytes differ: the match length is 1 if byte 0 agrees, otherwise 0. */
    if (memcmp(src0, src1, 2) != 0)
        return (*src0 == *src1);

    return compare256_unaligned_avx2_static(src0+2, src1+2) + 2;
}
|
||||
|
||||
/* Non-inline entry point: forwards both pointers unchanged to compare258_unaligned_avx2_static()
 * and returns its result. */
Z_INTERNAL uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) {
|
||||
return compare258_unaligned_avx2_static(src0, src1);
|
||||
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_unaligned_avx2
|
||||
#define COMPARE256 compare256_unaligned_avx2_static
|
||||
#define COMPARE258 compare258_unaligned_avx2_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,74 @@
|
||||
/* compare258_sse.c -- SSE4.2 version of compare258
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* Jim Guilford <james.guilford@intel.com>
|
||||
* Vinodh Gopal <vinodh.gopal@intel.com>
|
||||
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* Portions are Copyright (C) 2016 12Sided Technology, LLC.
|
||||
* Author:
|
||||
* Phil Vachon <pvachon@12sidedtech.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
#ifdef X86_SSE42_CMP_STR
|
||||
|
||||
#include <immintrin.h>
|
||||
#ifdef _MSC_VER
|
||||
# include <nmmintrin.h>
|
||||
#endif
|
||||
|
||||
/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
|
||||
static inline uint32_t compare256_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
#define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
|
||||
__m128i xmm_src0, xmm_src1;
|
||||
uint32_t ret;
|
||||
|
||||
xmm_src0 = _mm_loadu_si128((__m128i *)src0);
|
||||
xmm_src1 = _mm_loadu_si128((__m128i *)src1);
|
||||
ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
|
||||
if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
|
||||
return len + ret;
|
||||
}
|
||||
src0 += 16, src1 += 16, len += 16;
|
||||
|
||||
xmm_src0 = _mm_loadu_si128((__m128i *)src0);
|
||||
xmm_src1 = _mm_loadu_si128((__m128i *)src1);
|
||||
ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
|
||||
if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
|
||||
return len + ret;
|
||||
}
|
||||
src0 += 16, src1 += 16, len += 16;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
/* Return the length of the common prefix of src0 and src1, capped at 258 bytes.
 * The first two bytes are compared with memcmp (no misaligned or type-punned 16-bit load);
 * the remaining 256 are delegated to compare256_unaligned_sse4_static(). */
static inline uint32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
    /* First two bytes differ: the match length is 1 if byte 0 agrees, otherwise 0. */
    if (memcmp(src0, src1, 2) != 0)
        return (*src0 == *src1);

    return compare256_unaligned_sse4_static(src0+2, src1+2) + 2;
}
|
||||
|
||||
/* Non-inline entry point: forwards both pointers unchanged to compare258_unaligned_sse4_static()
 * and returns its result. */
Z_INTERNAL uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) {
|
||||
return compare258_unaligned_sse4_static(src0, src1);
|
||||
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_unaligned_sse4
|
||||
#define COMPARE256 compare256_unaligned_sse4_static
|
||||
#define COMPARE258 compare258_unaligned_sse4_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* instruction.
|
||||
*
|
||||
* A white paper describing this algorithm can be found at:
|
||||
@@ -18,14 +18,14 @@
|
||||
|
||||
#ifdef X86_PCLMULQDQ_CRC
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "../../zbuild.h"
|
||||
#include <inttypes.h>
|
||||
#include <immintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
|
||||
#include "crc_folding.h"
|
||||
|
||||
ZLIB_INTERNAL void crc_fold_init(deflate_state *const s) {
|
||||
Z_INTERNAL void crc_fold_init(deflate_state *const s) {
|
||||
/* CRC_SAVE */
|
||||
_mm_storeu_si128((__m128i *)s->crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
|
||||
_mm_storeu_si128((__m128i *)s->crc0 + 1, _mm_setzero_si128());
|
||||
@@ -227,9 +227,10 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res);
|
||||
}
|
||||
|
||||
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
|
||||
Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
|
||||
unsigned long algn_diff;
|
||||
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
|
||||
char ALIGNED_(16) partial_buf[16] = { 0 };
|
||||
|
||||
/* CRC_LOAD */
|
||||
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
|
||||
@@ -241,11 +242,14 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
|
||||
if (len < 16) {
|
||||
if (len == 0)
|
||||
return;
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
|
||||
|
||||
memcpy(partial_buf, src, len);
|
||||
xmm_crc_part = _mm_loadu_si128((const __m128i *)partial_buf);
|
||||
memcpy(dst, partial_buf, len);
|
||||
goto partial;
|
||||
}
|
||||
|
||||
algn_diff = (0 - (uintptr_t)src) & 0xF;
|
||||
algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
|
||||
if (algn_diff) {
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
|
||||
@@ -255,6 +259,8 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
|
||||
len -= algn_diff;
|
||||
|
||||
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
|
||||
} else {
|
||||
xmm_crc_part = _mm_setzero_si128();
|
||||
}
|
||||
|
||||
while ((len -= 64) >= 0) {
|
||||
@@ -305,7 +311,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
|
||||
goto done;
|
||||
|
||||
dst += 48;
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
|
||||
memcpy(&xmm_crc_part, (__m128i *)src + 3, len);
|
||||
} else if (len + 32 >= 0) {
|
||||
len += 32;
|
||||
|
||||
@@ -324,7 +330,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
|
||||
goto done;
|
||||
|
||||
dst += 32;
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
|
||||
memcpy(&xmm_crc_part, (__m128i *)src + 2, len);
|
||||
} else if (len + 48 >= 0) {
|
||||
len += 48;
|
||||
|
||||
@@ -340,16 +346,18 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
|
||||
goto done;
|
||||
|
||||
dst += 16;
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
|
||||
memcpy(&xmm_crc_part, (__m128i *)src + 1, len);
|
||||
} else {
|
||||
len += 64;
|
||||
if (len == 0)
|
||||
goto done;
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)src);
|
||||
memcpy(&xmm_crc_part, src, len);
|
||||
}
|
||||
|
||||
_mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
|
||||
memcpy(dst, partial_buf, len);
|
||||
|
||||
partial:
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
|
||||
partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
|
||||
done:
|
||||
/* CRC_SAVE */
|
||||
@@ -377,7 +385,7 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
|
||||
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
|
||||
};
|
||||
|
||||
uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
|
||||
uint32_t Z_INTERNAL crc_fold_512to32(deflate_state *const s) {
|
||||
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
|
||||
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
|
||||
|
||||
@@ -447,4 +455,3 @@ uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -10,10 +10,10 @@
|
||||
#ifndef CRC_FOLDING_H_
|
||||
#define CRC_FOLDING_H_
|
||||
|
||||
#include "deflate.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
ZLIB_INTERNAL void crc_fold_init(deflate_state *const);
|
||||
ZLIB_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
|
||||
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
|
||||
Z_INTERNAL void crc_fold_init(deflate_state *const);
|
||||
Z_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
|
||||
Z_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
#ifndef X86_CTZL_H
|
||||
#define X86_CTZL_H
|
||||
|
||||
#include <intrin.h>
|
||||
#ifdef X86_CPUID
|
||||
# include "x86.h"
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
/* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0
|
||||
* Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
|
||||
*/
|
||||
/* Stand-in for __builtin_ctzl. When X86_CPUID is defined and x86_cpu_has_tzcnt is set, the
 * result comes from _tzcnt_u32(); otherwise it is whatever _BitScanForward() writes into
 * trailing_zero. The return value of _BitScanForward() is not checked and trailing_zero has
 * no initial value. */
static __forceinline unsigned long __builtin_ctzl(unsigned long value)
|
||||
{
|
||||
#ifdef X86_CPUID
|
||||
if (x86_cpu_has_tzcnt)
|
||||
return _tzcnt_u32(value);
|
||||
#endif
|
||||
unsigned long trailing_zero;
|
||||
_BitScanForward(&trailing_zero, value);
|
||||
return trailing_zero;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,175 +0,0 @@
|
||||
/*
|
||||
* Fill Window with SSE2-optimized hash shifting
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#ifdef X86_SSE2
|
||||
|
||||
#include "zbuild.h"
|
||||
#include <immintrin.h>
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
|
||||
extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
|
||||
|
||||
ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
|
||||
const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
|
||||
|
||||
register unsigned n;
|
||||
register Pos *p;
|
||||
unsigned more; /* Amount of free space at the end of the window. */
|
||||
unsigned int wsize = s->w_size;
|
||||
|
||||
Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
|
||||
|
||||
do {
|
||||
more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
|
||||
|
||||
/* Deal with !@#$% 64K limit: */
|
||||
if (sizeof(int) <= 2) {
|
||||
if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
|
||||
more = wsize;
|
||||
|
||||
} else if (more == (unsigned)(-1)) {
|
||||
/* Very unlikely, but possible on 16 bit machine if
|
||||
* strstart == 0 && lookahead == 1 (input done a byte at time)
|
||||
*/
|
||||
more--;
|
||||
}
|
||||
}
|
||||
|
||||
/* If the window is almost full and there is insufficient lookahead,
|
||||
* move the upper half to the lower one to make room in the upper half.
|
||||
*/
|
||||
if (s->strstart >= wsize+MAX_DIST(s)) {
|
||||
memcpy(s->window, s->window+wsize, (unsigned)wsize);
|
||||
s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
|
||||
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
|
||||
s->block_start -= (long) wsize;
|
||||
|
||||
/* Slide the hash table (could be avoided with 32 bit values
|
||||
at the expense of memory usage). We slide even when level == 0
|
||||
to keep the hash table consistent if we switch back to level > 0
|
||||
later. (Using level 0 permanently is not an optimal usage of
|
||||
zlib, so we don't care about this pathological case.)
|
||||
*/
|
||||
n = s->hash_size;
|
||||
p = &s->head[n];
|
||||
p -= 8;
|
||||
do {
|
||||
__m128i value, result;
|
||||
|
||||
value = _mm_loadu_si128((__m128i *)p);
|
||||
result = _mm_subs_epu16(value, xmm_wsize);
|
||||
_mm_storeu_si128((__m128i *)p, result);
|
||||
|
||||
p -= 8;
|
||||
n -= 8;
|
||||
} while (n > 0);
|
||||
|
||||
n = wsize;
|
||||
p = &s->prev[n];
|
||||
p -= 8;
|
||||
do {
|
||||
__m128i value, result;
|
||||
|
||||
value = _mm_loadu_si128((__m128i *)p);
|
||||
result = _mm_subs_epu16(value, xmm_wsize);
|
||||
_mm_storeu_si128((__m128i *)p, result);
|
||||
|
||||
p -= 8;
|
||||
n -= 8;
|
||||
} while (n > 0);
|
||||
more += wsize;
|
||||
}
|
||||
if (s->strm->avail_in == 0) break;
|
||||
|
||||
/* If there was no sliding:
|
||||
* strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
|
||||
* more == window_size - lookahead - strstart
|
||||
* => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
|
||||
* => more >= window_size - 2*WSIZE + 2
|
||||
* In the BIG_MEM or MMAP case (not yet supported),
|
||||
* window_size == input_size + MIN_LOOKAHEAD &&
|
||||
* strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
|
||||
* Otherwise, window_size == 2*WSIZE so more >= 2.
|
||||
* If there was sliding, more >= WSIZE. So in all cases, more >= 2.
|
||||
*/
|
||||
Assert(more >= 2, "more < 2");
|
||||
|
||||
n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
|
||||
s->lookahead += n;
|
||||
|
||||
/* Initialize the hash value now that we have some input: */
|
||||
if (s->lookahead + s->insert >= MIN_MATCH) {
|
||||
unsigned int str = s->strstart - s->insert;
|
||||
s->ins_h = s->window[str];
|
||||
if (str >= 1)
|
||||
functable.insert_string(s, str + 2 - MIN_MATCH, 1);
|
||||
#if MIN_MATCH != 3
|
||||
#error Call insert_string() MIN_MATCH-3 more times
|
||||
while (s->insert) {
|
||||
functable.insert_string(s, str, 1);
|
||||
str++;
|
||||
s->insert--;
|
||||
if (s->lookahead + s->insert < MIN_MATCH)
|
||||
break;
|
||||
}
|
||||
#else
|
||||
unsigned int count;
|
||||
if (unlikely(s->lookahead == 1)){
|
||||
count = s->insert - 1;
|
||||
}else{
|
||||
count = s->insert;
|
||||
}
|
||||
functable.insert_string(s, str, count);
|
||||
s->insert -= count;
|
||||
#endif
|
||||
}
|
||||
/* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
|
||||
* but this is not important since only literal bytes will be emitted.
|
||||
*/
|
||||
} while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
|
||||
|
||||
/* If the WIN_INIT bytes after the end of the current data have never been
|
||||
* written, then zero those bytes in order to avoid memory check reports of
|
||||
* the use of uninitialized (or uninitialised as Julian writes) bytes by
|
||||
* the longest match routines. Update the high water mark for the next
|
||||
* time through here. WIN_INIT is set to MAX_MATCH since the longest match
|
||||
* routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
|
||||
*/
|
||||
if (s->high_water < s->window_size) {
|
||||
unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
|
||||
unsigned long init;
|
||||
|
||||
if (s->high_water < curr) {
|
||||
/* Previous high water mark below current data -- zero WIN_INIT
|
||||
* bytes or up to end of window, whichever is less.
|
||||
*/
|
||||
init = s->window_size - curr;
|
||||
if (init > WIN_INIT)
|
||||
init = WIN_INIT;
|
||||
memset(s->window + curr, 0, (unsigned)init);
|
||||
s->high_water = curr + init;
|
||||
} else if (s->high_water < (unsigned long)curr + WIN_INIT) {
|
||||
/* High water mark at or above current data, but below current data
|
||||
* plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
|
||||
* to end of window, whichever is less.
|
||||
*/
|
||||
init = (unsigned long)curr + WIN_INIT - s->high_water;
|
||||
if (init > s->window_size - s->high_water)
|
||||
init = s->window_size - s->high_water;
|
||||
memset(s->window + s->high_water, 0, (unsigned)init);
|
||||
s->high_water += init;
|
||||
}
|
||||
}
|
||||
|
||||
Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
|
||||
}
|
||||
#endif
|
||||
@@ -5,52 +5,42 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
/* ===========================================================================
|
||||
* Insert string str in the dictionary and set match_head to the previous head
|
||||
* of the hash chain (the most recent string with same hash key). Return
|
||||
* the previous length of the hash chain.
|
||||
* IN assertion: all calls to INSERT_STRING are made with consecutive
|
||||
* input characters and the first MIN_MATCH bytes of str are valid
|
||||
* (except for the last MIN_MATCH-1 bytes of the input file).
|
||||
*/
|
||||
#ifdef X86_SSE4_2_CRC_HASH
|
||||
ZLIB_INTERNAL Pos insert_string_sse(deflate_state *const s, const Pos str, unsigned int count) {
|
||||
Pos ret = 0;
|
||||
unsigned int idx;
|
||||
unsigned int *ip, val, h;
|
||||
|
||||
for (idx = 0; idx < count; idx++) {
|
||||
ip = (unsigned *)&s->window[str+idx];
|
||||
memcpy(&val, ip, sizeof(val));
|
||||
h = 0;
|
||||
|
||||
if (s->level >= TRIGGER_LEVEL)
|
||||
val &= 0xFFFFFF;
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include <immintrin.h>
|
||||
#ifdef _MSC_VER
|
||||
h = _mm_crc32_u32(h, val);
|
||||
#elif defined(X86_SSE4_2_CRC_INTRIN)
|
||||
h = __builtin_ia32_crc32si(h, val);
|
||||
# include <nmmintrin.h>
|
||||
#endif
|
||||
#include "../../deflate.h"
|
||||
|
||||
#ifdef X86_SSE42_CRC_INTRIN
|
||||
# ifdef _MSC_VER
|
||||
# define UPDATE_HASH(s, h, val)\
|
||||
h = _mm_crc32_u32(h, val)
|
||||
# else
|
||||
# define UPDATE_HASH(s, h, val)\
|
||||
h = __builtin_ia32_crc32si(h, val)
|
||||
# endif
|
||||
#else
|
||||
__asm__ __volatile__ (
|
||||
"crc32 %1,%0\n\t"
|
||||
: "+r" (h)
|
||||
: "r" (val)
|
||||
);
|
||||
#endif
|
||||
Pos head = s->head[h & s->hash_mask];
|
||||
if (head != str+idx) {
|
||||
s->prev[(str+idx) & s->w_mask] = head;
|
||||
s->head[h & s->hash_mask] = str+idx;
|
||||
if (idx == count-1)
|
||||
ret = head;
|
||||
} else if (idx == count - 1) {
|
||||
ret = str + idx;
|
||||
}
|
||||
# ifdef _MSC_VER
|
||||
# define UPDATE_HASH(s, h, val) {\
|
||||
__asm mov edx, h\
|
||||
__asm mov eax, val\
|
||||
__asm crc32 eax, edx\
|
||||
__asm mov val, eax\
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
# else
|
||||
# define UPDATE_HASH(s, h, val) \
|
||||
__asm__ __volatile__ (\
|
||||
"crc32 %1,%0\n\t"\
|
||||
: "+r" (h)\
|
||||
: "r" (val)\
|
||||
);
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define INSERT_STRING insert_string_sse4
|
||||
#define QUICK_INSERT_STRING quick_insert_string_sse4
|
||||
|
||||
#ifdef X86_SSE42_CRC_HASH
|
||||
# include "../../insert_string_tpl.h"
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* AVX2 optimized hash slide, based on Intel's slide_sse implementation
|
||||
*
|
||||
* Copyright (C) 2017 Intel Corporation
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
* Mika T. Lindqvist <postmaster@raasu.org>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
/* Slide the hash tables after the window has moved by w_size bytes.
 *
 * Subtracts w_size from every entry of s->head (HASH_SIZE entries) and of
 * s->prev (w_size entries). The subtraction is an unsigned saturating 16-bit
 * subtract, so entries smaller than w_size become 0 rather than wrapping.
 * Each iteration handles one 256-bit vector, i.e. 16 entries.
 * NOTE(review): this assumes Pos is a 16-bit type -- confirm in deflate.h.
 *
 * Both tables are walked from their last 16 entries down to the first.
 * NOTE(review): the do/while loops assume HASH_SIZE and w_size are non-zero
 * multiples of 16; otherwise the unsigned counter n would wrap -- confirm.
 */
Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
|
||||
Pos *p;
|
||||
unsigned n;
|
||||
/* w_size is truncated to 16 bits to match the lane width used below. */
uint16_t wsize = (uint16_t)s->w_size;
|
||||
/* Broadcast wsize into all sixteen 16-bit lanes. */
const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
|
||||
|
||||
/* Pass 1: the head table, starting at its last 16 entries. */
n = HASH_SIZE;
|
||||
p = &s->head[n] - 16;
|
||||
do {
|
||||
__m256i value, result;
|
||||
|
||||
/* Unaligned load/store: no alignment requirement is placed on the table. */
value = _mm256_loadu_si256((__m256i *)p);
|
||||
result= _mm256_subs_epu16(value, ymm_wsize);
|
||||
_mm256_storeu_si256((__m256i *)p, result);
|
||||
p -= 16;
|
||||
n -= 16;
|
||||
} while (n > 0);
|
||||
|
||||
/* Pass 2: the prev table, which has wsize entries. */
n = wsize;
|
||||
p = &s->prev[n] - 16;
|
||||
do {
|
||||
__m256i value, result;
|
||||
|
||||
value = _mm256_loadu_si256((__m256i *)p);
|
||||
result= _mm256_subs_epu16(value, ymm_wsize);
|
||||
_mm256_storeu_si256((__m256i *)p, result);
|
||||
|
||||
p -= 16;
|
||||
n -= 16;
|
||||
} while (n > 0);
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* SSE optimized hash slide
|
||||
*
|
||||
* Copyright (C) 2017 Intel Corporation
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
/* Slide the hash tables after the window has moved by w_size bytes.
 *
 * Subtracts w_size from every entry of s->head (HASH_SIZE entries) and of
 * s->prev (w_size entries). The subtraction is an unsigned saturating 16-bit
 * subtract, so entries smaller than w_size become 0 rather than wrapping.
 * Each iteration handles one 128-bit vector, i.e. 8 entries.
 * NOTE(review): this assumes Pos is a 16-bit type -- confirm in deflate.h.
 *
 * Both tables are walked from their last 8 entries down to the first.
 * NOTE(review): the do/while loops assume HASH_SIZE and w_size are non-zero
 * multiples of 8; otherwise the unsigned counter n would wrap -- confirm.
 */
Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
|
||||
Pos *p;
|
||||
unsigned n;
|
||||
/* w_size is truncated to 16 bits to match the lane width used below. */
uint16_t wsize = (uint16_t)s->w_size;
|
||||
/* Broadcast wsize into all eight 16-bit lanes. */
const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
|
||||
|
||||
/* Pass 1: the head table, starting at its last 8 entries. */
n = HASH_SIZE;
|
||||
p = &s->head[n] - 8;
|
||||
do {
|
||||
__m128i value, result;
|
||||
|
||||
/* Unaligned load/store: no alignment requirement is placed on the table. */
value = _mm_loadu_si128((__m128i *)p);
|
||||
result= _mm_subs_epu16(value, xmm_wsize);
|
||||
_mm_storeu_si128((__m128i *)p, result);
|
||||
p -= 8;
|
||||
n -= 8;
|
||||
} while (n > 0);
|
||||
|
||||
/* Pass 2: the prev table, which has wsize entries. */
n = wsize;
|
||||
p = &s->prev[n] - 8;
|
||||
do {
|
||||
__m128i value, result;
|
||||
|
||||
value = _mm_loadu_si128((__m128i *)p);
|
||||
result= _mm_subs_epu16(value, xmm_wsize);
|
||||
_mm_storeu_si128((__m128i *)p, result);
|
||||
|
||||
p -= 8;
|
||||
n -= 8;
|
||||
} while (n > 0);
|
||||
}
|
||||
+54
-42
@@ -8,61 +8,73 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zutil.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
# include <intrin.h>
|
||||
#else
|
||||
// Newer versions of GCC and clang come with cpuid.h
|
||||
#include <cpuid.h>
|
||||
# include <cpuid.h>
|
||||
#endif
|
||||
|
||||
ZLIB_INTERNAL int x86_cpu_has_sse2;
|
||||
ZLIB_INTERNAL int x86_cpu_has_sse42;
|
||||
ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
|
||||
ZLIB_INTERNAL int x86_cpu_has_tzcnt;
|
||||
Z_INTERNAL int x86_cpu_has_avx2;
|
||||
Z_INTERNAL int x86_cpu_has_sse2;
|
||||
Z_INTERNAL int x86_cpu_has_ssse3;
|
||||
Z_INTERNAL int x86_cpu_has_sse42;
|
||||
Z_INTERNAL int x86_cpu_has_pclmulqdq;
|
||||
Z_INTERNAL int x86_cpu_has_tzcnt;
|
||||
|
||||
static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
|
||||
#ifdef _MSC_VER
|
||||
unsigned int registers[4];
|
||||
__cpuid(registers, info);
|
||||
unsigned int registers[4];
|
||||
__cpuid((int *)registers, info);
|
||||
|
||||
*eax = registers[0];
|
||||
*ebx = registers[1];
|
||||
*ecx = registers[2];
|
||||
*edx = registers[3];
|
||||
*eax = registers[0];
|
||||
*ebx = registers[1];
|
||||
*ecx = registers[2];
|
||||
*edx = registers[3];
|
||||
#else
|
||||
unsigned int _eax;
|
||||
unsigned int _ebx;
|
||||
unsigned int _ecx;
|
||||
unsigned int _edx;
|
||||
__cpuid(info, _eax, _ebx, _ecx, _edx);
|
||||
*eax = _eax;
|
||||
*ebx = _ebx;
|
||||
*ecx = _ecx;
|
||||
*edx = _edx;
|
||||
__cpuid(info, *eax, *ebx, *ecx, *edx);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ZLIB_INTERNAL x86_check_features(void) {
|
||||
unsigned eax, ebx, ecx, edx;
|
||||
unsigned maxbasic;
|
||||
static void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
|
||||
#ifdef _MSC_VER
|
||||
unsigned int registers[4];
|
||||
__cpuidex((int *)registers, info, subinfo);
|
||||
|
||||
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
|
||||
|
||||
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
x86_cpu_has_sse2 = edx & 0x4000000;
|
||||
x86_cpu_has_sse42 = ecx & 0x100000;
|
||||
x86_cpu_has_pclmulqdq = ecx & 0x2;
|
||||
|
||||
if (maxbasic >= 7) {
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
// check BMI1 bit
|
||||
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
|
||||
x86_cpu_has_tzcnt = ebx & 0x8;
|
||||
} else {
|
||||
x86_cpu_has_tzcnt = 0;
|
||||
}
|
||||
*eax = registers[0];
|
||||
*ebx = registers[1];
|
||||
*ecx = registers[2];
|
||||
*edx = registers[3];
|
||||
#else
|
||||
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Query CPUID and record the results in the x86_cpu_has_* globals.
 *
 * Each flag is stored as the raw masked register value, so a supported
 * feature yields a non-zero value (not necessarily 1) and an unsupported
 * one yields 0. Leaf 7 is queried only when leaf 0 reports a maximum basic
 * leaf of at least 7; otherwise the tzcnt and avx2 flags are forced to 0.
 *
 * NOTE(review): the avx2 flag reflects only the CPUID feature bit. No
 * OSXSAVE/XGETBV check for OS support of the YMM state is visible in this
 * function -- confirm whether callers rely on one.
 */
void Z_INTERNAL x86_check_features(void) {
|
||||
unsigned eax, ebx, ecx, edx;
|
||||
unsigned maxbasic;
|
||||
|
||||
// Leaf 0: the EAX output (highest supported basic leaf) lands in maxbasic.
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
|
||||
|
||||
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
// Leaf 1 feature bits: EDX bit 26, then ECX bits 9, 20 and 1.
x86_cpu_has_sse2 = edx & 0x4000000;
|
||||
x86_cpu_has_ssse3 = ecx & 0x200;
|
||||
x86_cpu_has_sse42 = ecx & 0x100000;
|
||||
x86_cpu_has_pclmulqdq = ecx & 0x2;
|
||||
|
||||
if (maxbasic >= 7) {
|
||||
// Leaf 7 takes a sub-leaf index, hence cpuidex with subinfo 0.
cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
// check BMI1 bit
|
||||
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
|
||||
x86_cpu_has_tzcnt = ebx & 0x8;
|
||||
// check AVX2 bit
|
||||
x86_cpu_has_avx2 = ebx & 0x20;
|
||||
} else {
|
||||
x86_cpu_has_tzcnt = 0;
|
||||
x86_cpu_has_avx2 = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,18 @@
|
||||
/* cpu.h -- check for CPU features
|
||||
* Copyright (C) 2013 Intel Corporation Jim Kukunas
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
/* cpu.h -- check for CPU features
|
||||
* Copyright (C) 2013 Intel Corporation Jim Kukunas
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef CPU_H_
|
||||
#define CPU_H_
|
||||
|
||||
extern int x86_cpu_has_avx2;
|
||||
extern int x86_cpu_has_sse2;
|
||||
extern int x86_cpu_has_ssse3;
|
||||
extern int x86_cpu_has_sse42;
|
||||
extern int x86_cpu_has_pclmulqdq;
|
||||
extern int x86_cpu_has_tzcnt;
|
||||
|
||||
void ZLIB_INTERNAL x86_check_features(void);
|
||||
void Z_INTERNAL x86_check_features(void);
|
||||
|
||||
#endif /* CPU_H_ */
|
||||
|
||||
Reference in New Issue
Block a user