[Library] Update zlibng (#1255)

* Update zlibng

* Set cmake path more directly in zlibng to hopefully fix an issue with the build on drone

* I'm dumb, missing / in path

* Mackal helps with a dumb gitignore issue

* Adding all the files, not sure what's ignoring them and im tired of looking

* Some tweaks to zlibng build to hopefully get it to build properly. works on msvc now
This commit is contained in:
Alex
2021-02-23 17:00:26 -08:00
committed by GitHub
parent e6dee96266
commit 2957f5084d
184 changed files with 22029 additions and 11703 deletions
-3
View File
@@ -1,3 +0,0 @@
fill_window_sse.c SSE2 optimized fill_window
deflate_quick.c SSE4 optimized deflate strategy for use as level 1
crc_folding.c SSE4 + PCLMULQDQ optimized CRC folding implementation
+8
View File
@@ -0,0 +1,8 @@
Contents
--------
|Name|Description|
|:-|:-|
|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
|slide_sse2.c|SSE2 optimized slide_hash|
+58 -9
View File
@@ -8,7 +8,9 @@ SFLAGS=
INCLUDES=
SUFFIX=
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
SSE4FLAG=-msse4
PCLMULFLAG=-mpclmul
@@ -16,7 +18,18 @@ SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo
all: \
x86.o x86.lo \
adler32_avx.o adler32.lo \
adler32_ssse3.o adler32_ssse3.lo \
chunkset_avx.o chunkset_avx.lo \
chunkset_sse.o chunkset_sse.lo \
compare258_avx.o compare258_avx.lo \
compare258_sse.o compare258_sse.lo \
insert_string_sse.o insert_string_sse.lo \
crc_folding.o crc_folding.lo \
slide_avx.o slide_avx.lo \
slide_sse.o slide_sse.lo
x86.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
@@ -24,17 +37,29 @@ x86.o:
x86.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
fill_window_sse.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
chunkset_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
fill_window_sse.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
chunkset_avx.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
deflate_quick.o:
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
chunkset_sse.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
deflate_quick.lo:
$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
chunkset_sse.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
compare258_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
compare258_avx.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
compare258_sse.o:
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
compare258_sse.lo:
$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
insert_string_sse.o:
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
@@ -48,6 +73,30 @@ crc_folding.o:
crc_folding.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
slide_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
slide_avx.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
slide_sse.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
slide_sse.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
adler32_avx.o: $(SRCDIR)/adler32_avx.c
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
adler32_avx.lo: $(SRCDIR)/adler32_avx.c
$(CC) $(SFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
$(CC) $(CFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
$(CC) $(SFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
mostlyclean: clean
clean:
rm -f *.o *.lo *~
+117
View File
@@ -0,0 +1,117 @@
/* adler32.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../zutil.h"
#include "../../adler32_p.h"
#include <immintrin.h>
#ifdef X86_AVX2_ADLER32
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t sum2;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
uint32_t ALIGNED_(32) s1[8], s2[8];
memset(s1, 0, sizeof(s1)); s1[7] = adler; // TODO: would a masked load be faster?
memset(s2, 0, sizeof(s2)); s2[7] = sum2;
char ALIGNED_(32) dot1[32] = \
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
__m256i dot1v = _mm256_load_si256((__m256i*)dot1);
char ALIGNED_(32) dot2[32] = \
{32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
__m256i dot2v = _mm256_load_si256((__m256i*)dot2);
short ALIGNED_(32) dot3[16] = \
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
__m256i dot3v = _mm256_load_si256((__m256i*)dot3);
// We will need to multiply by
char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
__m128i shiftv = _mm_load_si128((__m128i*)shift);
while (len >= 32) {
__m256i vs1 = _mm256_load_si256((__m256i*)s1);
__m256i vs2 = _mm256_load_si256((__m256i*)s2);
__m256i vs1_0 = vs1;
int k = (len < NMAX ? (int)len : NMAX);
k -= k % 32;
len -= k;
while (k >= 32) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
*/
__m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
buf += 32;
k -= 32;
__m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
__m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
__m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v);
vs1 = _mm256_add_epi32(vsum1, vs1);
__m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);
vsum2 = _mm256_add_epi32(vsum2, vs2);
vs2 = _mm256_add_epi32(vsum2, vs1_0);
vs1_0 = vs1;
}
// At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
// would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
uint32_t ALIGNED_(32) s1_unpack[8];
uint32_t ALIGNED_(32) s2_unpack[8];
_mm256_store_si256((__m256i*)s1_unpack, vs1);
_mm256_store_si256((__m256i*)s2_unpack, vs2);
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
adler %= BASE;
s1[7] = adler;
sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) +
(s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
sum2 %= BASE;
s2[7] = sum2;
}
while (len) {
len--;
adler += *buf++;
sum2 += adler;
}
adler %= BASE;
sum2 %= BASE;
/* return recombined sums */
return adler | (sum2 << 16);
}
#endif
+118
View File
@@ -0,0 +1,118 @@
/* adler32.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../zutil.h"
#include "../../adler32_p.h"
#ifdef X86_SSSE3_ADLER32
#include <immintrin.h>
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t sum2;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
uint32_t ALIGNED_(16) s1[4], s2[4];
s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
__m128i dot1v = _mm_load_si128((__m128i*)dot1);
char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
__m128i dot2v = _mm_load_si128((__m128i*)dot2);
short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
__m128i dot3v = _mm_load_si128((__m128i*)dot3);
// We will need to multiply by
//char ALIGNED_(16) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
__m128i shiftv = _mm_load_si128((__m128i*)shift);
while (len >= 16) {
__m128i vs1 = _mm_load_si128((__m128i*)s1);
__m128i vs2 = _mm_load_si128((__m128i*)s2);
__m128i vs1_0 = vs1;
int k = (len < NMAX ? (int)len : NMAX);
k -= k % 16;
len -= k;
while (k >= 16) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
NOTE: 256-bit equivalents are:
_mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
_mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t.
We could rewrite the below to use 256-bit instructions instead of 128-bit.
*/
__m128i vbuf = _mm_loadu_si128((__m128i*)buf);
buf += 16;
k -= 16;
__m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
__m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
__m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
vs1 = _mm_add_epi32(vsum1, vs1);
__m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
vsum2 = _mm_add_epi32(vsum2, vs2);
vs2 = _mm_add_epi32(vsum2, vs1_0);
vs1_0 = vs1;
}
// At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
// would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
uint32_t ALIGNED_(16) s1_unpack[4];
uint32_t ALIGNED_(16) s2_unpack[4];
_mm_store_si128((__m128i*)s1_unpack, vs1);
_mm_store_si128((__m128i*)s2_unpack, vs2);
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
adler %= BASE;
s1[3] = adler;
sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
sum2 %= BASE;
s2[3] = sum2;
}
while (len) {
len--;
adler += *buf++;
sum2 += adler;
}
adler %= BASE;
sum2 %= BASE;
/* return recombined sums */
return adler | (sum2 << 16);
}
#endif
+50
View File
@@ -0,0 +1,50 @@
/* chunkset_avx.c -- AVX inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil.h"
#ifdef X86_AVX_CHUNKSET
#include <immintrin.h>
typedef __m256i chunk_t;
#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi8(*(int8_t *)from);
}
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi16(*(int16_t *)from);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi32(*(int32_t *)from);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi64x(*(int64_t *)from);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = _mm256_loadu_si256((__m256i *)s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm256_storeu_si256((__m256i *)out, *chunk);
}
#define CHUNKSIZE chunksize_avx
#define CHUNKCOPY chunkcopy_avx
#define CHUNKCOPY_SAFE chunkcopy_safe_avx
#define CHUNKUNROLL chunkunroll_avx
#define CHUNKMEMSET chunkmemset_avx
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx
#include "chunkset_tpl.h"
#endif
+51
View File
@@ -0,0 +1,51 @@
/* chunkset_sse.c -- SSE inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil.h"
#ifdef X86_SSE2
#include <immintrin.h>
typedef __m128i chunk_t;
#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
*chunk = _mm_set1_epi8(*(int8_t *)from);
}
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
*chunk = _mm_set1_epi16(*(int16_t *)from);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
*chunk = _mm_set1_epi32(*(int32_t *)from);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
*chunk = _mm_set1_epi64x(*(int64_t *)from);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = _mm_loadu_si128((__m128i *)s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm_storeu_si128((__m128i *)out, *chunk);
}
#define CHUNKSIZE chunksize_sse2
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKCOPY_SAFE chunkcopy_safe_sse2
#define CHUNKUNROLL chunkunroll_sse2
#define CHUNKMEMSET chunkmemset_sse2
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
#include "chunkset_tpl.h"
#endif
+67
View File
@@ -0,0 +1,67 @@
/* compare258_avx.c -- AVX2 version of compare258
* Copyright Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../zutil.h"
#include "fallback_builtins.h"
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
#include <immintrin.h>
#ifdef _MSC_VER
# include <nmmintrin.h>
#endif
/* UNALIGNED_OK, AVX2 intrinsic comparison */
static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
uint32_t len = 0;
do {
__m256i ymm_src0, ymm_src1, ymm_cmp;
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
if (mask != 0xFFFFFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
return len + match_byte;
}
src0 += 32, src1 += 32, len += 32;
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
if (mask != 0xFFFFFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
return len + match_byte;
}
src0 += 32, src1 += 32, len += 32;
} while (len < 256);
return 256;
}
static inline uint32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
if (*(uint16_t *)src0 != *(uint16_t *)src1)
return (*src0 == *src1);
return compare256_unaligned_avx2_static(src0+2, src1+2) + 2;
}
Z_INTERNAL uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) {
return compare258_unaligned_avx2_static(src0, src1);
}
#define LONGEST_MATCH longest_match_unaligned_avx2
#define COMPARE256 compare256_unaligned_avx2_static
#define COMPARE258 compare258_unaligned_avx2_static
#include "match_tpl.h"
#endif
+74
View File
@@ -0,0 +1,74 @@
/* compare258_sse.c -- SSE4.2 version of compare258
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* Portions are Copyright (C) 2016 12Sided Technology, LLC.
* Author:
* Phil Vachon <pvachon@12sidedtech.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../zutil.h"
#ifdef X86_SSE42_CMP_STR
#include <immintrin.h>
#ifdef _MSC_VER
# include <nmmintrin.h>
#endif
/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
static inline uint32_t compare256_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
uint32_t len = 0;
do {
#define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
__m128i xmm_src0, xmm_src1;
uint32_t ret;
xmm_src0 = _mm_loadu_si128((__m128i *)src0);
xmm_src1 = _mm_loadu_si128((__m128i *)src1);
ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
return len + ret;
}
src0 += 16, src1 += 16, len += 16;
xmm_src0 = _mm_loadu_si128((__m128i *)src0);
xmm_src1 = _mm_loadu_si128((__m128i *)src1);
ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
return len + ret;
}
src0 += 16, src1 += 16, len += 16;
} while (len < 256);
return 256;
}
static inline uint32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
if (*(uint16_t *)src0 != *(uint16_t *)src1)
return (*src0 == *src1);
return compare256_unaligned_sse4_static(src0+2, src1+2) + 2;
}
Z_INTERNAL uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) {
return compare258_unaligned_sse4_static(src0, src1);
}
#define LONGEST_MATCH longest_match_unaligned_sse4
#define COMPARE256 compare256_unaligned_sse4_static
#define COMPARE258 compare258_unaligned_sse4_static
#include "match_tpl.h"
#endif
+20 -13
View File
@@ -1,5 +1,5 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
@@ -18,14 +18,14 @@
#ifdef X86_PCLMULQDQ_CRC
#include "zbuild.h"
#include "../../zbuild.h"
#include <inttypes.h>
#include <immintrin.h>
#include <wmmintrin.h>
#include "crc_folding.h"
ZLIB_INTERNAL void crc_fold_init(deflate_state *const s) {
Z_INTERNAL void crc_fold_init(deflate_state *const s) {
/* CRC_SAVE */
_mm_storeu_si128((__m128i *)s->crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
_mm_storeu_si128((__m128i *)s->crc0 + 1, _mm_setzero_si128());
@@ -227,9 +227,10 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
*xmm_crc3 = _mm_castps_si128(ps_res);
}
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
char ALIGNED_(16) partial_buf[16] = { 0 };
/* CRC_LOAD */
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
@@ -241,11 +242,14 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
if (len < 16) {
if (len == 0)
return;
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
memcpy(partial_buf, src, len);
xmm_crc_part = _mm_loadu_si128((const __m128i *)partial_buf);
memcpy(dst, partial_buf, len);
goto partial;
}
algn_diff = (0 - (uintptr_t)src) & 0xF;
algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
if (algn_diff) {
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
@@ -255,6 +259,8 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
len -= algn_diff;
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
} else {
xmm_crc_part = _mm_setzero_si128();
}
while ((len -= 64) >= 0) {
@@ -305,7 +311,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
goto done;
dst += 48;
xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
memcpy(&xmm_crc_part, (__m128i *)src + 3, len);
} else if (len + 32 >= 0) {
len += 32;
@@ -324,7 +330,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
goto done;
dst += 32;
xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
memcpy(&xmm_crc_part, (__m128i *)src + 2, len);
} else if (len + 48 >= 0) {
len += 48;
@@ -340,16 +346,18 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
goto done;
dst += 16;
xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
memcpy(&xmm_crc_part, (__m128i *)src + 1, len);
} else {
len += 64;
if (len == 0)
goto done;
xmm_crc_part = _mm_load_si128((__m128i *)src);
memcpy(&xmm_crc_part, src, len);
}
_mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
memcpy(dst, partial_buf, len);
partial:
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
done:
/* CRC_SAVE */
@@ -377,7 +385,7 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
uint32_t Z_INTERNAL crc_fold_512to32(deflate_state *const s) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
@@ -447,4 +455,3 @@ uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
}
#endif
+4 -4
View File
@@ -10,10 +10,10 @@
#ifndef CRC_FOLDING_H_
#define CRC_FOLDING_H_
#include "deflate.h"
#include "../../deflate.h"
ZLIB_INTERNAL void crc_fold_init(deflate_state *const);
ZLIB_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
Z_INTERNAL void crc_fold_init(deflate_state *const);
Z_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
Z_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
#endif
-25
View File
@@ -1,25 +0,0 @@
#ifndef X86_CTZL_H
#define X86_CTZL_H
#include <intrin.h>
#ifdef X86_CPUID
# include "x86.h"
#endif
#if defined(_MSC_VER) && !defined(__clang__)
/* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0
* Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
*/
static __forceinline unsigned long __builtin_ctzl(unsigned long value)
{
#ifdef X86_CPUID
if (x86_cpu_has_tzcnt)
return _tzcnt_u32(value);
#endif
unsigned long trailing_zero;
_BitScanForward(&trailing_zero, value);
return trailing_zero;
}
#endif
#endif
File diff suppressed because it is too large Load Diff
-175
View File
@@ -1,175 +0,0 @@
/*
* Fill Window with SSE2-optimized hash shifting
*
* Copyright (C) 2013 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_SSE2
#include "zbuild.h"
#include <immintrin.h>
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
register unsigned n;
register Pos *p;
unsigned more; /* Amount of free space at the end of the window. */
unsigned int wsize = s->w_size;
Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
do {
more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
/* Deal with !@#$% 64K limit: */
if (sizeof(int) <= 2) {
if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
more = wsize;
} else if (more == (unsigned)(-1)) {
/* Very unlikely, but possible on 16 bit machine if
* strstart == 0 && lookahead == 1 (input done a byte at time)
*/
more--;
}
}
/* If the window is almost full and there is insufficient lookahead,
* move the upper half to the lower one to make room in the upper half.
*/
if (s->strstart >= wsize+MAX_DIST(s)) {
memcpy(s->window, s->window+wsize, (unsigned)wsize);
s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
s->block_start -= (long) wsize;
/* Slide the hash table (could be avoided with 32 bit values
at the expense of memory usage). We slide even when level == 0
to keep the hash table consistent if we switch back to level > 0
later. (Using level 0 permanently is not an optimal usage of
zlib, so we don't care about this pathological case.)
*/
n = s->hash_size;
p = &s->head[n];
p -= 8;
do {
__m128i value, result;
value = _mm_loadu_si128((__m128i *)p);
result = _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
} while (n > 0);
n = wsize;
p = &s->prev[n];
p -= 8;
do {
__m128i value, result;
value = _mm_loadu_si128((__m128i *)p);
result = _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
} while (n > 0);
more += wsize;
}
if (s->strm->avail_in == 0) break;
/* If there was no sliding:
* strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
* more == window_size - lookahead - strstart
* => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
* => more >= window_size - 2*WSIZE + 2
* In the BIG_MEM or MMAP case (not yet supported),
* window_size == input_size + MIN_LOOKAHEAD &&
* strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
* Otherwise, window_size == 2*WSIZE so more >= 2.
* If there was sliding, more >= WSIZE. So in all cases, more >= 2.
*/
Assert(more >= 2, "more < 2");
n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
s->lookahead += n;
/* Initialize the hash value now that we have some input: */
if (s->lookahead + s->insert >= MIN_MATCH) {
unsigned int str = s->strstart - s->insert;
s->ins_h = s->window[str];
if (str >= 1)
functable.insert_string(s, str + 2 - MIN_MATCH, 1);
#if MIN_MATCH != 3
#error Call insert_string() MIN_MATCH-3 more times
while (s->insert) {
functable.insert_string(s, str, 1);
str++;
s->insert--;
if (s->lookahead + s->insert < MIN_MATCH)
break;
}
#else
unsigned int count;
if (unlikely(s->lookahead == 1)){
count = s->insert - 1;
}else{
count = s->insert;
}
functable.insert_string(s, str, count);
s->insert -= count;
#endif
}
/* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
* but this is not important since only literal bytes will be emitted.
*/
} while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
/* If the WIN_INIT bytes after the end of the current data have never been
* written, then zero those bytes in order to avoid memory check reports of
* the use of uninitialized (or uninitialised as Julian writes) bytes by
* the longest match routines. Update the high water mark for the next
* time through here. WIN_INIT is set to MAX_MATCH since the longest match
* routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
*/
if (s->high_water < s->window_size) {
unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
unsigned long init;
if (s->high_water < curr) {
/* Previous high water mark below current data -- zero WIN_INIT
* bytes or up to end of window, whichever is less.
*/
init = s->window_size - curr;
if (init > WIN_INIT)
init = WIN_INIT;
memset(s->window + curr, 0, (unsigned)init);
s->high_water = curr + init;
} else if (s->high_water < (unsigned long)curr + WIN_INIT) {
/* High water mark at or above current data, but below current data
* plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
* to end of window, whichever is less.
*/
init = (unsigned long)curr + WIN_INIT - s->high_water;
if (init > s->window_size - s->high_water)
init = s->window_size - s->high_water;
memset(s->window + s->high_water, 0, (unsigned)init);
s->high_water += init;
}
}
Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
}
#endif
+35 -45
View File
@@ -5,52 +5,42 @@
*
*/
#include "zbuild.h"
#include "deflate.h"
/* ===========================================================================
* Insert string str in the dictionary and set match_head to the previous head
* of the hash chain (the most recent string with same hash key). Return
* the previous length of the hash chain.
* IN assertion: all calls to to INSERT_STRING are made with consecutive
* input characters and the first MIN_MATCH bytes of str are valid
* (except for the last MIN_MATCH-1 bytes of the input file).
*/
#ifdef X86_SSE4_2_CRC_HASH
ZLIB_INTERNAL Pos insert_string_sse(deflate_state *const s, const Pos str, unsigned int count) {
Pos ret = 0;
unsigned int idx;
unsigned int *ip, val, h;
for (idx = 0; idx < count; idx++) {
ip = (unsigned *)&s->window[str+idx];
memcpy(&val, ip, sizeof(val));
h = 0;
if (s->level >= TRIGGER_LEVEL)
val &= 0xFFFFFF;
#include "../../zbuild.h"
#include <immintrin.h>
#ifdef _MSC_VER
h = _mm_crc32_u32(h, val);
#elif defined(X86_SSE4_2_CRC_INTRIN)
h = __builtin_ia32_crc32si(h, val);
# include <nmmintrin.h>
#endif
#include "../../deflate.h"
#ifdef X86_SSE42_CRC_INTRIN
# ifdef _MSC_VER
# define UPDATE_HASH(s, h, val)\
h = _mm_crc32_u32(h, val)
# else
# define UPDATE_HASH(s, h, val)\
h = __builtin_ia32_crc32si(h, val)
# endif
#else
__asm__ __volatile__ (
"crc32 %1,%0\n\t"
: "+r" (h)
: "r" (val)
);
#endif
Pos head = s->head[h & s->hash_mask];
if (head != str+idx) {
s->prev[(str+idx) & s->w_mask] = head;
s->head[h & s->hash_mask] = str+idx;
if (idx == count-1)
ret = head;
} else if (idx == count - 1) {
ret = str + idx;
}
# ifdef _MSC_VER
# define UPDATE_HASH(s, h, val) {\
__asm mov edx, h\
__asm mov eax, val\
__asm crc32 eax, edx\
__asm mov val, eax\
}
return ret;
}
# else
# define UPDATE_HASH(s, h, val) \
__asm__ __volatile__ (\
"crc32 %1,%0\n\t"\
: "+r" (h)\
: "r" (val)\
);
# endif
#endif
#define INSERT_STRING insert_string_sse4
#define QUICK_INSERT_STRING quick_insert_string_sse4
#ifdef X86_SSE42_CRC_HASH
# include "../../insert_string_tpl.h"
#endif
+47
View File
@@ -0,0 +1,47 @@
/*
* AVX2 optimized hash slide, based on Intel's slide_sse implementation
*
* Copyright (C) 2017 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
* Mika T. Lindqvist <postmaster@raasu.org>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include <immintrin.h>
Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
Pos *p;
unsigned n;
uint16_t wsize = (uint16_t)s->w_size;
const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
n = HASH_SIZE;
p = &s->head[n] - 16;
do {
__m256i value, result;
value = _mm256_loadu_si256((__m256i *)p);
result= _mm256_subs_epu16(value, ymm_wsize);
_mm256_storeu_si256((__m256i *)p, result);
p -= 16;
n -= 16;
} while (n > 0);
n = wsize;
p = &s->prev[n] - 16;
do {
__m256i value, result;
value = _mm256_loadu_si256((__m256i *)p);
result= _mm256_subs_epu16(value, ymm_wsize);
_mm256_storeu_si256((__m256i *)p, result);
p -= 16;
n -= 16;
} while (n > 0);
}
+46
View File
@@ -0,0 +1,46 @@
/*
* SSE optimized hash slide
*
* Copyright (C) 2017 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include <immintrin.h>
Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
Pos *p;
unsigned n;
uint16_t wsize = (uint16_t)s->w_size;
const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
n = HASH_SIZE;
p = &s->head[n] - 8;
do {
__m128i value, result;
value = _mm_loadu_si128((__m128i *)p);
result= _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
} while (n > 0);
n = wsize;
p = &s->prev[n] - 8;
do {
__m128i value, result;
value = _mm_loadu_si128((__m128i *)p);
result= _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
} while (n > 0);
}
+54 -42
View File
@@ -8,61 +8,73 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zutil.h"
#include "../../zutil.h"
#ifdef _MSC_VER
#include <intrin.h>
# include <intrin.h>
#else
// Newer versions of GCC and clang come with cpuid.h
#include <cpuid.h>
# include <cpuid.h>
#endif
ZLIB_INTERNAL int x86_cpu_has_sse2;
ZLIB_INTERNAL int x86_cpu_has_sse42;
ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
ZLIB_INTERNAL int x86_cpu_has_tzcnt;
Z_INTERNAL int x86_cpu_has_avx2;
Z_INTERNAL int x86_cpu_has_sse2;
Z_INTERNAL int x86_cpu_has_ssse3;
Z_INTERNAL int x86_cpu_has_sse42;
Z_INTERNAL int x86_cpu_has_pclmulqdq;
Z_INTERNAL int x86_cpu_has_tzcnt;
static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
unsigned int registers[4];
__cpuid(registers, info);
unsigned int registers[4];
__cpuid((int *)registers, info);
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
#else
unsigned int _eax;
unsigned int _ebx;
unsigned int _ecx;
unsigned int _edx;
__cpuid(info, _eax, _ebx, _ecx, _edx);
*eax = _eax;
*ebx = _ebx;
*ecx = _ecx;
*edx = _edx;
__cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}
void ZLIB_INTERNAL x86_check_features(void) {
unsigned eax, ebx, ecx, edx;
unsigned maxbasic;
static void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
unsigned int registers[4];
__cpuidex((int *)registers, info, subinfo);
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
x86_cpu_has_sse2 = edx & 0x4000000;
x86_cpu_has_sse42 = ecx & 0x100000;
x86_cpu_has_pclmulqdq = ecx & 0x2;
if (maxbasic >= 7) {
cpuid(7, &eax, &ebx, &ecx, &edx);
// check BMI1 bit
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
x86_cpu_has_tzcnt = ebx & 0x8;
} else {
x86_cpu_has_tzcnt = 0;
}
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
#else
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}
void Z_INTERNAL x86_check_features(void) {
unsigned eax, ebx, ecx, edx;
unsigned maxbasic;
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
x86_cpu_has_sse2 = edx & 0x4000000;
x86_cpu_has_ssse3 = ecx & 0x200;
x86_cpu_has_sse42 = ecx & 0x100000;
x86_cpu_has_pclmulqdq = ecx & 0x2;
if (maxbasic >= 7) {
cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
// check BMI1 bit
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
x86_cpu_has_tzcnt = ebx & 0x8;
// check AVX2 bit
x86_cpu_has_avx2 = ebx & 0x20;
} else {
x86_cpu_has_tzcnt = 0;
x86_cpu_has_avx2 = 0;
}
}
+7 -5
View File
@@ -1,16 +1,18 @@
/* cpu.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* cpu.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CPU_H_
#define CPU_H_
extern int x86_cpu_has_avx2;
extern int x86_cpu_has_sse2;
extern int x86_cpu_has_ssse3;
extern int x86_cpu_has_sse42;
extern int x86_cpu_has_pclmulqdq;
extern int x86_cpu_has_tzcnt;
void ZLIB_INTERNAL x86_check_features(void);
void Z_INTERNAL x86_check_features(void);
#endif /* CPU_H_ */