mirror of
https://github.com/EQEmu/Server.git
synced 2025-12-11 21:01:29 +00:00
Fix for x86 zlib issue
This commit is contained in:
parent
e086f53b97
commit
f50d502f3d
2
.gitignore
vendored
2
.gitignore
vendored
@ -24,8 +24,6 @@ Makefile
|
||||
cmake_install.cmake
|
||||
install_manifest.txt
|
||||
[Bb]uild*/
|
||||
x64/
|
||||
x86/
|
||||
log/
|
||||
logs/
|
||||
vcpkg/
|
||||
|
||||
3
libs/zlibng/arch/x86/INDEX
Normal file
3
libs/zlibng/arch/x86/INDEX
Normal file
@ -0,0 +1,3 @@
|
||||
fill_window_sse.c SSE2 optimized fill_window
|
||||
deflate_quick.c SSE4 optimized deflate strategy for use as level 1
|
||||
crc_folding.c SSE4 + PCLMULQDQ optimized CRC folding implementation
|
||||
58
libs/zlibng/arch/x86/Makefile.in
Normal file
58
libs/zlibng/arch/x86/Makefile.in
Normal file
@ -0,0 +1,58 @@
|
||||
# Makefile for zlib
# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
# For conditions of distribution and use, see copyright notice in zlib.h

# The empty variables below are placeholders; they are expected to be filled
# in when this template is turned into a Makefile.
CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=

# Per-file instruction-set flags: only the objects that need an ISA extension
# are compiled with it.
SSE2FLAG=-msse2
SSE4FLAG=-msse4
PCLMULFLAG=-mpclmul

SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

# None of these targets produce a file of the same name.
.PHONY: all mostlyclean clean distclean

all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo

# Each object lists its source as a prerequisite so that editing the source
# triggers a rebuild. The .o rules use CFLAGS, the .lo rules use SFLAGS -DPIC.

x86.o: $(SRCDIR)/x86.c
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c

x86.lo: $(SRCDIR)/x86.c
	$(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c

fill_window_sse.o: $(SRCDIR)/fill_window_sse.c
	$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c

fill_window_sse.lo: $(SRCDIR)/fill_window_sse.c
	$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c

deflate_quick.o: $(SRCDIR)/deflate_quick.c
	$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c

deflate_quick.lo: $(SRCDIR)/deflate_quick.c
	$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c

insert_string_sse.o: $(SRCDIR)/insert_string_sse.c
	$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c

insert_string_sse.lo: $(SRCDIR)/insert_string_sse.c
	$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c

crc_folding.o: $(SRCDIR)/crc_folding.c
	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c

crc_folding.lo: $(SRCDIR)/crc_folding.c
	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c

mostlyclean: clean
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean:
	rm -f Makefile
|
||||
450
libs/zlibng/arch/x86/crc_folding.c
Normal file
450
libs/zlibng/arch/x86/crc_folding.c
Normal file
@ -0,0 +1,450 @@
|
||||
/*
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* instruction.
|
||||
*
|
||||
* A white paper describing this algorithm can be found at:
|
||||
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* Jim Guilford <james.guilford@intel.com>
|
||||
* Vinodh Gopal <vinodh.gopal@intel.com>
|
||||
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef X86_PCLMULQDQ_CRC
|
||||
|
||||
#include "zbuild.h"
|
||||
#include <inttypes.h>
|
||||
#include <immintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
|
||||
#include "crc_folding.h"
|
||||
|
||||
/*
 * Reset the CRC folding state stored in s->crc0: the first 128-bit lane is
 * seeded with the constant 0x9db42487 (in its low 32 bits), the next three
 * lanes are cleared, and the stream's adler field is set to 0.
 */
ZLIB_INTERNAL void crc_fold_init(deflate_state *const s) {
    __m128i *const state = (__m128i *)s->crc0;
    const __m128i zero = _mm_setzero_si128();
    int lane;

    /* CRC_SAVE */
    _mm_storeu_si128(state, _mm_cvtsi32_si128(0x9db42487));
    for (lane = 1; lane < 4; lane++)
        _mm_storeu_si128(state + lane, zero);

    s->strm->adler = 0;
}
|
||||
|
||||
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp3;
|
||||
__m128 ps_crc0, ps_crc3, ps_res;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc3 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
|
||||
|
||||
*xmm_crc0 = *xmm_crc1;
|
||||
*xmm_crc1 = *xmm_crc2;
|
||||
*xmm_crc2 = x_tmp3;
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res);
|
||||
}
|
||||
|
||||
static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp3, x_tmp2;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
x_tmp2 = *xmm_crc2;
|
||||
|
||||
*xmm_crc3 = *xmm_crc1;
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
|
||||
|
||||
*xmm_crc2 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
|
||||
|
||||
*xmm_crc0 = x_tmp2;
|
||||
*xmm_crc1 = x_tmp3;
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res20);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res31);
|
||||
}
|
||||
|
||||
static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp3;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc3 = *xmm_crc2;
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
|
||||
|
||||
*xmm_crc2 = *xmm_crc1;
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
|
||||
|
||||
*xmm_crc1 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
|
||||
|
||||
*xmm_crc0 = x_tmp3;
|
||||
*xmm_crc1 = _mm_castps_si128(ps_res10);
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res21);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res32);
|
||||
}
|
||||
|
||||
static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
|
||||
__m128 ps_t0, ps_t1, ps_t2, ps_t3;
|
||||
__m128 ps_res0, ps_res1, ps_res2, ps_res3;
|
||||
|
||||
x_tmp0 = *xmm_crc0;
|
||||
x_tmp1 = *xmm_crc1;
|
||||
x_tmp2 = *xmm_crc2;
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_t0 = _mm_castsi128_ps(x_tmp0);
|
||||
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
|
||||
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_t1 = _mm_castsi128_ps(x_tmp1);
|
||||
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
|
||||
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
|
||||
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_t2 = _mm_castsi128_ps(x_tmp2);
|
||||
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
|
||||
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
|
||||
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_t3 = _mm_castsi128_ps(x_tmp3);
|
||||
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
|
||||
|
||||
*xmm_crc0 = _mm_castps_si128(ps_res0);
|
||||
*xmm_crc1 = _mm_castps_si128(ps_res1);
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res2);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res3);
|
||||
}
|
||||
|
||||
/*
 * Byte-shuffle control vectors, one 16-byte row per partial length 1..15
 * (row index = length - 1). partial_fold() uses a row directly as one
 * shuffle mask and the same row XORed with 0x80 in every byte as the
 * complementary mask.
 */
static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
    0x84838281u, 0x88878685u, 0x8c8b8a89u, 0x008f8e8du, /* shl 15 (16 -  1) / shr  1 */
    0x85848382u, 0x89888786u, 0x8d8c8b8au, 0x01008f8eu, /* shl 14 (16 -  2) / shr  2 */
    0x86858483u, 0x8a898887u, 0x8e8d8c8bu, 0x0201008fu, /* shl 13 (16 -  3) / shr  3 */
    0x87868584u, 0x8b8a8988u, 0x8f8e8d8cu, 0x03020100u, /* shl 12 (16 -  4) / shr  4 */
    0x88878685u, 0x8c8b8a89u, 0x008f8e8du, 0x04030201u, /* shl 11 (16 -  5) / shr  5 */
    0x89888786u, 0x8d8c8b8au, 0x01008f8eu, 0x05040302u, /* shl 10 (16 -  6) / shr  6 */
    0x8a898887u, 0x8e8d8c8bu, 0x0201008fu, 0x06050403u, /* shl  9 (16 -  7) / shr  7 */
    0x8b8a8988u, 0x8f8e8d8cu, 0x03020100u, 0x07060504u, /* shl  8 (16 -  8) / shr  8 */
    0x8c8b8a89u, 0x008f8e8du, 0x04030201u, 0x08070605u, /* shl  7 (16 -  9) / shr  9 */
    0x8d8c8b8au, 0x01008f8eu, 0x05040302u, 0x09080706u, /* shl  6 (16 - 10) / shr 10 */
    0x8e8d8c8bu, 0x0201008fu, 0x06050403u, 0x0a090807u, /* shl  5 (16 - 11) / shr 11 */
    0x8f8e8d8cu, 0x03020100u, 0x07060504u, 0x0b0a0908u, /* shl  4 (16 - 12) / shr 12 */
    0x008f8e8du, 0x04030201u, 0x08070605u, 0x0c0b0a09u, /* shl  3 (16 - 13) / shr 13 */
    0x01008f8eu, 0x05040302u, 0x09080706u, 0x0d0c0b0au, /* shl  2 (16 - 14) / shr 14 */
    0x0201008fu, 0x06050403u, 0x0a090807u, 0x0e0d0c0bu  /* shl  1 (16 - 15) / shr 15 */
};
|
||||
|
||||
static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
|
||||
__m128i *xmm_crc3, __m128i *xmm_crc_part) {
|
||||
|
||||
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
|
||||
|
||||
__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
|
||||
__m128i xmm_a0_0, xmm_a0_1;
|
||||
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
|
||||
|
||||
xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
|
||||
xmm_shr = xmm_shl;
|
||||
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
|
||||
|
||||
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
|
||||
|
||||
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
|
||||
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
|
||||
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
|
||||
|
||||
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
|
||||
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
|
||||
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
|
||||
|
||||
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
|
||||
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
|
||||
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
|
||||
|
||||
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
|
||||
*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
|
||||
*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
|
||||
|
||||
xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
|
||||
xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
|
||||
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
psa0_0 = _mm_castsi128_ps(xmm_a0_0);
|
||||
psa0_1 = _mm_castsi128_ps(xmm_a0_1);
|
||||
|
||||
ps_res = _mm_xor_ps(ps_crc3, psa0_0);
|
||||
ps_res = _mm_xor_ps(ps_res, psa0_1);
|
||||
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res);
|
||||
}
|
||||
|
||||
/*
 * Copy `len` bytes from src to dst while folding them into the CRC state
 * kept in s->crc0 (four 128-bit lanes, plus a fifth slot that receives the
 * last partial block).
 *
 * Flow: inputs shorter than 16 bytes go straight to the partial fold.
 * Otherwise the bytes up to the next 16-byte boundary of src are folded as a
 * partial block, full 64-byte groups are folded with fold_4, the remaining
 * 48/32/16 bytes with fold_3/fold_2/fold_1, and any final 1..15 bytes with
 * partial_fold.
 *
 * NOTE(review): partial blocks are loaded from src and stored to dst as full
 * 16-byte vectors even when fewer bytes remain, so both buffers presumably
 * need slack past `len` - confirm against the callers.
 */
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
    unsigned long algn_diff;
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;

    /* CRC_LOAD */
    __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
    __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
    __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
    __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);
    /* Start from a defined value: when src is already aligned and len is a
     * multiple of 16, control reaches `done` without loading a partial block,
     * and `done` stores this variable. */
    __m128i xmm_crc_part = _mm_setzero_si128();

    if (len < 16) {
        if (len == 0)
            return;
        xmm_crc_part = _mm_loadu_si128((const __m128i *)src);
        goto partial;
    }

    /* Number of bytes needed to bring src up to a 16-byte boundary. */
    algn_diff = (0 - (uintptr_t)src) & 0xF;
    if (algn_diff) {
        xmm_crc_part = _mm_loadu_si128((const __m128i *)src);
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);

        dst += algn_diff;
        src += algn_diff;
        len -= algn_diff;

        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
    }

    while ((len -= 64) >= 0) {
        /* CRC_LOAD */
        xmm_t0 = _mm_load_si128((const __m128i *)src);
        xmm_t1 = _mm_load_si128((const __m128i *)src + 1);
        xmm_t2 = _mm_load_si128((const __m128i *)src + 2);
        xmm_t3 = _mm_load_si128((const __m128i *)src + 3);

        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        /* CRC_SAVE */
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);

        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);

        src += 64;
        dst += 64;
    }

    /*
     * len = num bytes left - 64
     */
    if (len + 16 >= 0) {
        /* 48..63 bytes left: three full blocks, then 0..15 bytes. */
        len += 16;

        xmm_t0 = _mm_load_si128((const __m128i *)src);
        xmm_t1 = _mm_load_si128((const __m128i *)src + 1);
        xmm_t2 = _mm_load_si128((const __m128i *)src + 2);

        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);

        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);

        if (len == 0)
            goto done;

        dst += 48;
        xmm_crc_part = _mm_load_si128((const __m128i *)src + 3);
    } else if (len + 32 >= 0) {
        /* 32..47 bytes left: two full blocks, then 0..15 bytes. */
        len += 32;

        xmm_t0 = _mm_load_si128((const __m128i *)src);
        xmm_t1 = _mm_load_si128((const __m128i *)src + 1);

        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);

        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);

        if (len == 0)
            goto done;

        dst += 32;
        xmm_crc_part = _mm_load_si128((const __m128i *)src + 2);
    } else if (len + 48 >= 0) {
        /* 16..31 bytes left: one full block, then 0..15 bytes. */
        len += 48;

        xmm_t0 = _mm_load_si128((const __m128i *)src);

        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        _mm_storeu_si128((__m128i *)dst, xmm_t0);

        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);

        if (len == 0)
            goto done;

        dst += 16;
        xmm_crc_part = _mm_load_si128((const __m128i *)src + 1);
    } else {
        /* 0..15 bytes left. */
        len += 64;
        if (len == 0)
            goto done;
        xmm_crc_part = _mm_load_si128((const __m128i *)src);
    }

partial:
    _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
    partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
done:
    /* CRC_SAVE */
    _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);
    _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);
    _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);
    _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);
    _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);
}
|
||||
|
||||
/*
 * Constants for the final reduction in crc_fold_512to32(), which loads them
 * as three 128-bit pairs: (rk1, rk2), (rk5, rk6) and (rk7, rk8).
 */
static const unsigned ALIGNED_(16) crc_k[] = {
    0xccaa009eu, 0x00000000u, /* rk1 */
    0x751997d0u, 0x00000001u, /* rk2 */
    0xccaa009eu, 0x00000000u, /* rk5 */
    0x63cd6124u, 0x00000001u, /* rk6 */
    0xf7011640u, 0x00000001u, /* rk7 */
    0xdb710640u, 0x00000001u  /* rk8 */
};

/* Keeps the two low 32-bit words of a vector and clears the two high ones. */
static const unsigned ALIGNED_(16) crc_mask[4] = {
    0xFFFFFFFFu, 0xFFFFFFFFu, 0x00000000u, 0x00000000u
};

/* Clears the lowest 32-bit word of a vector and keeps the other three. */
static const unsigned ALIGNED_(16) crc_mask2[4] = {
    0x00000000u, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu
};
|
||||
|
||||
/*
 * Collapse the four 128-bit lanes stored in s->crc0 into a single 32-bit
 * value and return its bitwise complement.
 *
 * Stages, in order: the lanes are merged into one vector using the first
 * constant pair of crc_k; that vector is narrowed using the second pair and
 * masked with crc_mask2; the third pair and crc_mask are then applied, and
 * 32-bit element 2 of the result is extracted.
 */
uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
    const __m128i low64_mask = _mm_load_si128((__m128i *)crc_mask);
    const __m128i high96_mask = _mm_load_si128((__m128i *)crc_mask2);

    /* CRC_LOAD */
    const __m128i lane0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
    const __m128i lane1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
    const __m128i lane2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
    const __m128i lane3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);

    __m128i k, acc, saved;
    uint32_t crc;

    /*
     * k1
     */
    k = _mm_load_si128((__m128i *)crc_k);

    acc = lane0;
    acc = _mm_xor_si128(_mm_xor_si128(lane1, _mm_clmulepi64_si128(acc, k, 0x10)),
                        _mm_clmulepi64_si128(acc, k, 0x01));
    acc = _mm_xor_si128(_mm_xor_si128(lane2, _mm_clmulepi64_si128(acc, k, 0x10)),
                        _mm_clmulepi64_si128(acc, k, 0x01));
    acc = _mm_xor_si128(_mm_xor_si128(lane3, _mm_clmulepi64_si128(acc, k, 0x10)),
                        _mm_clmulepi64_si128(acc, k, 0x01));

    /*
     * k5
     */
    k = _mm_load_si128((__m128i *)crc_k + 1);

    acc = _mm_xor_si128(_mm_clmulepi64_si128(acc, k, 0), _mm_srli_si128(acc, 8));

    acc = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(acc, 4), k, 0x10), acc);
    acc = _mm_and_si128(acc, high96_mask);

    /*
     * k7
     */
    saved = acc;
    k = _mm_load_si128((__m128i *)crc_k + 2);

    acc = _mm_xor_si128(_mm_clmulepi64_si128(acc, k, 0), acc);
    acc = _mm_and_si128(acc, low64_mask);

    acc = _mm_xor_si128(_mm_clmulepi64_si128(acc, k, 0x10), acc);
    acc = _mm_xor_si128(acc, saved);

    crc = _mm_extract_epi32(acc, 2);
    return ~crc;
}
|
||||
|
||||
#endif
|
||||
|
||||
19
libs/zlibng/arch/x86/crc_folding.h
Normal file
19
libs/zlibng/arch/x86/crc_folding.h
Normal file
@ -0,0 +1,19 @@
|
||||
/* crc_folding.h
|
||||
*
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* instruction.
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation Jim Kukunas
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef CRC_FOLDING_H_
|
||||
#define CRC_FOLDING_H_
|
||||
|
||||
#include "deflate.h"
|
||||
|
||||
ZLIB_INTERNAL void crc_fold_init(deflate_state *const);
|
||||
ZLIB_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
|
||||
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
|
||||
|
||||
#endif
|
||||
25
libs/zlibng/arch/x86/ctzl.h
Normal file
25
libs/zlibng/arch/x86/ctzl.h
Normal file
@ -0,0 +1,25 @@
|
||||
#ifndef X86_CTZL_H
|
||||
#define X86_CTZL_H
|
||||
|
||||
#include <intrin.h>
|
||||
#ifdef X86_CPUID
|
||||
# include "x86.h"
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
/* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0
|
||||
* Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
|
||||
*/
|
||||
/* Returns the index of the lowest set bit of value; value must be non-zero
 * (see the note above: the _BitScanForward result flag is deliberately
 * ignored and the output is not pre-initialized).
 */
static __forceinline unsigned long __builtin_ctzl(unsigned long value)
{
    unsigned long bit_index;

#ifdef X86_CPUID
    /* Use TZCNT when the runtime CPU check reported it. */
    if (x86_cpu_has_tzcnt)
        return _tzcnt_u32(value);
#endif
    _BitScanForward(&bit_index, value);
    return bit_index;
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
2405
libs/zlibng/arch/x86/deflate_quick.c
Normal file
2405
libs/zlibng/arch/x86/deflate_quick.c
Normal file
File diff suppressed because it is too large
Load Diff
175
libs/zlibng/arch/x86/fill_window_sse.c
Normal file
175
libs/zlibng/arch/x86/fill_window_sse.c
Normal file
@ -0,0 +1,175 @@
|
||||
/*
|
||||
* Fill Window with SSE2-optimized hash shifting
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#ifdef X86_SSE2
|
||||
|
||||
#include "zbuild.h"
|
||||
#include <immintrin.h>
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
|
||||
extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
|
||||
|
||||
ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
|
||||
const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
|
||||
|
||||
register unsigned n;
|
||||
register Pos *p;
|
||||
unsigned more; /* Amount of free space at the end of the window. */
|
||||
unsigned int wsize = s->w_size;
|
||||
|
||||
Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
|
||||
|
||||
do {
|
||||
more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
|
||||
|
||||
/* Deal with !@#$% 64K limit: */
|
||||
if (sizeof(int) <= 2) {
|
||||
if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
|
||||
more = wsize;
|
||||
|
||||
} else if (more == (unsigned)(-1)) {
|
||||
/* Very unlikely, but possible on 16 bit machine if
|
||||
* strstart == 0 && lookahead == 1 (input done a byte at time)
|
||||
*/
|
||||
more--;
|
||||
}
|
||||
}
|
||||
|
||||
/* If the window is almost full and there is insufficient lookahead,
|
||||
* move the upper half to the lower one to make room in the upper half.
|
||||
*/
|
||||
if (s->strstart >= wsize+MAX_DIST(s)) {
|
||||
memcpy(s->window, s->window+wsize, (unsigned)wsize);
|
||||
s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
|
||||
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
|
||||
s->block_start -= (long) wsize;
|
||||
|
||||
/* Slide the hash table (could be avoided with 32 bit values
|
||||
at the expense of memory usage). We slide even when level == 0
|
||||
to keep the hash table consistent if we switch back to level > 0
|
||||
later. (Using level 0 permanently is not an optimal usage of
|
||||
zlib, so we don't care about this pathological case.)
|
||||
*/
|
||||
n = s->hash_size;
|
||||
p = &s->head[n];
|
||||
p -= 8;
|
||||
do {
|
||||
__m128i value, result;
|
||||
|
||||
value = _mm_loadu_si128((__m128i *)p);
|
||||
result = _mm_subs_epu16(value, xmm_wsize);
|
||||
_mm_storeu_si128((__m128i *)p, result);
|
||||
|
||||
p -= 8;
|
||||
n -= 8;
|
||||
} while (n > 0);
|
||||
|
||||
n = wsize;
|
||||
p = &s->prev[n];
|
||||
p -= 8;
|
||||
do {
|
||||
__m128i value, result;
|
||||
|
||||
value = _mm_loadu_si128((__m128i *)p);
|
||||
result = _mm_subs_epu16(value, xmm_wsize);
|
||||
_mm_storeu_si128((__m128i *)p, result);
|
||||
|
||||
p -= 8;
|
||||
n -= 8;
|
||||
} while (n > 0);
|
||||
more += wsize;
|
||||
}
|
||||
if (s->strm->avail_in == 0) break;
|
||||
|
||||
/* If there was no sliding:
|
||||
* strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
|
||||
* more == window_size - lookahead - strstart
|
||||
* => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
|
||||
* => more >= window_size - 2*WSIZE + 2
|
||||
* In the BIG_MEM or MMAP case (not yet supported),
|
||||
* window_size == input_size + MIN_LOOKAHEAD &&
|
||||
* strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
|
||||
* Otherwise, window_size == 2*WSIZE so more >= 2.
|
||||
* If there was sliding, more >= WSIZE. So in all cases, more >= 2.
|
||||
*/
|
||||
Assert(more >= 2, "more < 2");
|
||||
|
||||
n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
|
||||
s->lookahead += n;
|
||||
|
||||
/* Initialize the hash value now that we have some input: */
|
||||
if (s->lookahead + s->insert >= MIN_MATCH) {
|
||||
unsigned int str = s->strstart - s->insert;
|
||||
s->ins_h = s->window[str];
|
||||
if (str >= 1)
|
||||
functable.insert_string(s, str + 2 - MIN_MATCH, 1);
|
||||
#if MIN_MATCH != 3
|
||||
#error Call insert_string() MIN_MATCH-3 more times
|
||||
while (s->insert) {
|
||||
functable.insert_string(s, str, 1);
|
||||
str++;
|
||||
s->insert--;
|
||||
if (s->lookahead + s->insert < MIN_MATCH)
|
||||
break;
|
||||
}
|
||||
#else
|
||||
unsigned int count;
|
||||
if (unlikely(s->lookahead == 1)){
|
||||
count = s->insert - 1;
|
||||
}else{
|
||||
count = s->insert;
|
||||
}
|
||||
functable.insert_string(s, str, count);
|
||||
s->insert -= count;
|
||||
#endif
|
||||
}
|
||||
/* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
|
||||
* but this is not important since only literal bytes will be emitted.
|
||||
*/
|
||||
} while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
|
||||
|
||||
/* If the WIN_INIT bytes after the end of the current data have never been
|
||||
* written, then zero those bytes in order to avoid memory check reports of
|
||||
* the use of uninitialized (or uninitialised as Julian writes) bytes by
|
||||
* the longest match routines. Update the high water mark for the next
|
||||
* time through here. WIN_INIT is set to MAX_MATCH since the longest match
|
||||
* routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
|
||||
*/
|
||||
if (s->high_water < s->window_size) {
|
||||
unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
|
||||
unsigned long init;
|
||||
|
||||
if (s->high_water < curr) {
|
||||
/* Previous high water mark below current data -- zero WIN_INIT
|
||||
* bytes or up to end of window, whichever is less.
|
||||
*/
|
||||
init = s->window_size - curr;
|
||||
if (init > WIN_INIT)
|
||||
init = WIN_INIT;
|
||||
memset(s->window + curr, 0, (unsigned)init);
|
||||
s->high_water = curr + init;
|
||||
} else if (s->high_water < (unsigned long)curr + WIN_INIT) {
|
||||
/* High water mark at or above current data, but below current data
|
||||
* plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
|
||||
* to end of window, whichever is less.
|
||||
*/
|
||||
init = (unsigned long)curr + WIN_INIT - s->high_water;
|
||||
if (init > s->window_size - s->high_water)
|
||||
init = s->window_size - s->high_water;
|
||||
memset(s->window + s->high_water, 0, (unsigned)init);
|
||||
s->high_water += init;
|
||||
}
|
||||
}
|
||||
|
||||
Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
|
||||
}
|
||||
#endif
|
||||
56
libs/zlibng/arch/x86/insert_string_sse.c
Normal file
56
libs/zlibng/arch/x86/insert_string_sse.c
Normal file
@ -0,0 +1,56 @@
|
||||
/* insert_string_sse -- insert_string variant using SSE4.2's CRC instructions
|
||||
*
|
||||
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
/* ===========================================================================
|
||||
* Insert string str in the dictionary and set match_head to the previous head
|
||||
* of the hash chain (the most recent string with same hash key). Return
|
||||
* the previous length of the hash chain.
|
||||
* IN assertion: all calls to INSERT_STRING are made with consecutive
|
||||
* input characters and the first MIN_MATCH bytes of str are valid
|
||||
* (except for the last MIN_MATCH-1 bytes of the input file).
|
||||
*/
|
||||
#ifdef X86_SSE4_2_CRC_HASH
|
||||
/*
 * Insert `count` consecutive strings, starting at window position `str`,
 * into the hash table. Each hash is a CRC32 instruction applied to the four
 * bytes at that position (only the low three when s->level >= TRIGGER_LEVEL).
 *
 * Returns, for the last inserted position, the head that was previously
 * stored in its hash slot (or that position itself when the slot already
 * pointed at it); returns 0 when count is 0.
 */
ZLIB_INTERNAL Pos insert_string_sse(deflate_state *const s, const Pos str, unsigned int count) {
    Pos last_head = 0;
    unsigned int i;

    for (i = 0; i < count; i++) {
        const unsigned int pos = str + i;
        const int is_last = (i == count - 1);
        unsigned int val, h = 0;
        Pos head;

        /* memcpy avoids an unaligned 32-bit load through a cast pointer. */
        memcpy(&val, &s->window[pos], sizeof(val));

        if (s->level >= TRIGGER_LEVEL)
            val &= 0xFFFFFF;

#ifdef _MSC_VER
        h = _mm_crc32_u32(h, val);
#elif defined(X86_SSE4_2_CRC_INTRIN)
        h = __builtin_ia32_crc32si(h, val);
#else
        __asm__ __volatile__ (
            "crc32 %1,%0\n\t"
            : "+r" (h)
            : "r" (val)
        );
#endif

        head = s->head[h & s->hash_mask];
        if (head == pos) {
            /* Slot already points here: leave the chain untouched. */
            if (is_last)
                last_head = pos;
            continue;
        }

        s->prev[pos & s->w_mask] = head;
        s->head[h & s->hash_mask] = pos;
        if (is_last)
            last_head = head;
    }
    return last_head;
}
|
||||
#endif
|
||||
68
libs/zlibng/arch/x86/x86.c
Normal file
68
libs/zlibng/arch/x86/x86.c
Normal file
@ -0,0 +1,68 @@
|
||||
/*
|
||||
* x86 feature check
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Author:
|
||||
* Jim Kukunas
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zutil.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#else
|
||||
// Newer versions of GCC and clang come with cpuid.h
|
||||
#include <cpuid.h>
|
||||
#endif
|
||||
|
||||
/* CPU feature flags; written by x86_check_features(). */
ZLIB_INTERNAL int x86_cpu_has_sse2;
ZLIB_INTERNAL int x86_cpu_has_sse42;
ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
ZLIB_INTERNAL int x86_cpu_has_tzcnt;

/*
 * Execute CPUID for leaf `info` and hand back the four result registers.
 * Wraps the compiler-specific __cpuid: the MSVC intrinsic fills an array,
 * the <cpuid.h> macro fills four lvalues.
 */
static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
    unsigned int regs[4];

    __cpuid(regs, info);

    *eax = regs[0];
    *ebx = regs[1];
    *ecx = regs[2];
    *edx = regs[3];
#else
    unsigned int a, b, c, d;

    __cpuid(info, a, b, c, d);

    *eax = a;
    *ebx = b;
    *ecx = c;
    *edx = d;
#endif
}
|
||||
|
||||
/*
 * Execute CPUID for leaf `info` with sub-leaf `subinfo` loaded into ECX.
 * Needed for leaves such as 7 whose output depends on the ECX input; the
 * plain <cpuid.h> __cpuid macro does not set ECX.
 */
static void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
    int regs[4];

    __cpuidex(regs, info, subinfo);

    *eax = (unsigned)regs[0];
    *ebx = (unsigned)regs[1];
    *ecx = (unsigned)regs[2];
    *edx = (unsigned)regs[3];
#else
    unsigned int a, b, c, d;

    __cpuid_count(info, subinfo, a, b, c, d);

    *eax = a;
    *ebx = b;
    *ecx = c;
    *edx = d;
#endif
}

/*
 * Query CPUID and record which optional instruction sets are available in
 * the x86_cpu_has_* globals. Each flag is zero when the feature is absent and
 * the raw (non-zero) CPUID bit when it is present.
 */
void ZLIB_INTERNAL x86_check_features(void) {
    unsigned eax, ebx, ecx, edx;
    unsigned maxbasic;

    /* Leaf 0: EAX holds the highest supported basic leaf. */
    cpuid(0, &maxbasic, &ebx, &ecx, &edx);

    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);

    x86_cpu_has_sse2 = edx & 0x4000000;     /* EDX bit 26 */
    x86_cpu_has_sse42 = ecx & 0x100000;     /* ECX bit 20 */
    x86_cpu_has_pclmulqdq = ecx & 0x2;      /* ECX bit 1 */

    if (maxbasic >= 7) {
        /* Leaf 7 is sub-leaf indexed; the feature bits live in sub-leaf 0. */
        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);

        // check BMI1 bit
        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
        x86_cpu_has_tzcnt = ebx & 0x8;
    } else {
        x86_cpu_has_tzcnt = 0;
    }
}
|
||||
16
libs/zlibng/arch/x86/x86.h
Normal file
16
libs/zlibng/arch/x86/x86.h
Normal file
@ -0,0 +1,16 @@
|
||||
/* x86.h -- check for x86 CPU features
|
||||
* Copyright (C) 2013 Intel Corporation Jim Kukunas
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef CPU_H_
|
||||
#define CPU_H_
|
||||
|
||||
extern int x86_cpu_has_sse2;
|
||||
extern int x86_cpu_has_sse42;
|
||||
extern int x86_cpu_has_pclmulqdq;
|
||||
extern int x86_cpu_has_tzcnt;
|
||||
|
||||
void ZLIB_INTERNAL x86_check_features(void);
|
||||
|
||||
#endif /* CPU_H_ */
|
||||
Loading…
x
Reference in New Issue
Block a user