Mirror of https://github.com/EQEmu/Server.git
/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zendian.h"
#include "deflate.h"
#include "deflate_p.h"

#include "functable.h"

#ifdef X86_FEATURES
#  include "fallback_builtins.h"
#endif

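/*
 * Overview (illustrative note, not part of the upstream source): "functable"
 * is a table of function pointers (struct functable_s, see functable.h).
 * Every slot starts out pointing at one of the *_stub functions defined
 * below; on its first call a stub runs CPU feature detection, rewrites its
 * own slot to the best implementation available on this machine, and then
 * forwards the call. Callers elsewhere in the library simply write e.g.
 *
 *     functable.insert_string(s, str, count);
 *
 * and never need to know which scalar or SIMD variant was selected.
 */
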
/* insert_string */
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
#ifdef X86_SSE42_CRC_HASH
extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
#elif defined(ARM_ACLE_CRC_HASH)
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
#endif

/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
#ifdef X86_SSE42_CRC_HASH
extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
#elif defined(ARM_ACLE_CRC_HASH)
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
#endif

/* slide_hash */
#ifdef X86_SSE2
void slide_hash_sse2(deflate_state *s);
#elif defined(ARM_NEON_SLIDEHASH)
void slide_hash_neon(deflate_state *s);
#elif defined(POWER8_VSX_SLIDEHASH)
void slide_hash_power8(deflate_state *s);
#endif
#ifdef X86_AVX2
void slide_hash_avx2(deflate_state *s);
#endif

/* adler32 */
extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
#ifdef ARM_NEON_ADLER32
extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_AVX2_ADLER32
extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef POWER8_VSX_ADLER32
extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
#endif

/* memory chunking */
extern uint32_t chunksize_c(void);
extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#ifdef X86_SSE2_CHUNKSET
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_AVX_CHUNKSET
extern uint32_t chunksize_avx(void);
extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ARM_NEON_CHUNKSET
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif

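/*
 * Note (illustrative, inferred from the signatures above): the chunk* helpers
 * copy or fill output in wide blocks, with chunksize() reporting the block
 * width of the selected variant. The *_safe forms take an extra bound (the
 * "safe" limit pointer or the "left" byte count) so they can be used near the
 * end of a buffer without writing past it. They are dispatched through the
 * same stub mechanism as every other entry, e.g.
 *
 *     functable.chunkmemset_safe(out, dist, len, left);
 */
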
/* CRC32 */
Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);

#ifdef ARM_ACLE_CRC_HASH
extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t);
#endif

#if BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t);
#elif BYTE_ORDER == BIG_ENDIAN
extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
#endif

/* compare258 */
extern uint32_t compare258_c(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED_OK
extern uint32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1);
extern uint32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED64_OK
extern uint32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
#endif
#endif

/* longest_match */
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED_OK
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED64_OK
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
#endif
#endif

Z_INTERNAL Z_TLS struct functable_s functable;

Z_INTERNAL void cpu_check_features(void)
{
    static int features_checked = 0;
    if (features_checked)
        return;
#if defined(X86_FEATURES)
    x86_check_features();
#elif defined(ARM_FEATURES)
    arm_check_features();
#elif defined(POWER_FEATURES)
    power_check_features();
#endif
    features_checked = 1;
}

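/*
 * Note (illustrative): cpu_check_features() is idempotent thanks to the static
 * flag above, so the stubs below can call it unconditionally before consulting
 * the x86_cpu_has_* / arm_cpu_has_* / power_cpu_has_* flags that the
 * platform-specific check routine fills in.
 */
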
/* stub functions */
Z_INTERNAL void insert_string_stub(deflate_state *const s, const uint32_t str, uint32_t count) {
    // Initialize default

    functable.insert_string = &insert_string_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.insert_string = &insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.insert_string = &insert_string_acle;
#endif

    functable.insert_string(s, str, count);
}

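/*
 * Note (illustrative sketch of the pattern, not additional behaviour): every
 * stub below follows the same self-patching shape as insert_string_stub:
 *
 *     functable.<entry> = &<entry>_c;   // safe scalar default
 *     cpu_check_features();             // where the stub needs the CPU flags
 *     // upgrade the slot if a SIMD variant was compiled in and the CPU has it
 *     return functable.<entry>(...);    // forward the original call
 *
 * After the first call the stub drops out of the path and callers dispatch
 * straight to the selected implementation.
 */
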
Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
    functable.quick_insert_string = &quick_insert_string_c;

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.quick_insert_string = &quick_insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.quick_insert_string = &quick_insert_string_acle;
#endif

    return functable.quick_insert_string(s, str);
}

Z_INTERNAL void slide_hash_stub(deflate_state *s) {

    functable.slide_hash = &slide_hash_c;
    cpu_check_features();

#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.slide_hash = &slide_hash_sse2;
#elif defined(ARM_NEON_SLIDEHASH)
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.slide_hash = &slide_hash_neon;
#endif
#ifdef X86_AVX2
    if (x86_cpu_has_avx2)
        functable.slide_hash = &slide_hash_avx2;
#endif
#ifdef POWER8_VSX_SLIDEHASH
    if (power_cpu_has_arch_2_07)
        functable.slide_hash = &slide_hash_power8;
#endif

    functable.slide_hash(s);
}

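/*
 * Note (illustrative): on x86-64 builds (__x86_64__ / _M_X64) SSE2 is part of
 * the baseline instruction set, so the runtime x86_cpu_has_sse2 check is
 * compiled out and the SSE2 variant is installed unconditionally.
 * X86_NOCHECK_SSE2 and ARM_NOCHECK_NEON give the same "assume present"
 * behaviour on builds that opt in to it.
 */
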
Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    // Initialize default
    functable.adler32 = &adler32_c;
    cpu_check_features();

#ifdef ARM_NEON_ADLER32
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.adler32 = &adler32_neon;
#endif
#ifdef X86_SSSE3_ADLER32
    if (x86_cpu_has_ssse3)
        functable.adler32 = &adler32_ssse3;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32 = &adler32_avx2;
#endif
#ifdef POWER8_VSX_ADLER32
    if (power_cpu_has_arch_2_07)
        functable.adler32 = &adler32_power8;
#endif

    return functable.adler32(adler, buf, len);
}

Z_INTERNAL uint32_t chunksize_stub(void) {
    // Initialize default
    functable.chunksize = &chunksize_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunksize = &chunksize_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunksize = &chunksize_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunksize = &chunksize_neon;
#endif

    return functable.chunksize();
}

Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
    // Initialize default
    functable.chunkcopy = &chunkcopy_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy = &chunkcopy_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkcopy = &chunkcopy_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy = &chunkcopy_neon;
#endif

    return functable.chunkcopy(out, from, len);
}

Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
    // Initialize default
    functable.chunkcopy_safe = &chunkcopy_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy_safe = &chunkcopy_safe_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkcopy_safe = &chunkcopy_safe_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy_safe = &chunkcopy_safe_neon;
#endif

    return functable.chunkcopy_safe(out, from, len, safe);
}

Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
    // Initialize default
    functable.chunkunroll = &chunkunroll_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkunroll = &chunkunroll_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkunroll = &chunkunroll_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkunroll = &chunkunroll_neon;
#endif

    return functable.chunkunroll(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
    // Initialize default
    functable.chunkmemset = &chunkmemset_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset = &chunkmemset_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset = &chunkmemset_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset = &chunkmemset_neon;
#endif

    return functable.chunkmemset(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
    // Initialize default
    functable.chunkmemset_safe = &chunkmemset_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset_safe = &chunkmemset_safe_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset_safe = &chunkmemset_safe_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset_safe = &chunkmemset_safe_neon;
#endif

    return functable.chunkmemset_safe(out, dist, len, left);
}

Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {
    int32_t use_byfour = sizeof(void *) == sizeof(ptrdiff_t);

    Assert(sizeof(uint64_t) >= sizeof(size_t),
           "crc32_z takes size_t but internally we have a uint64_t len");
    /* return a function pointer for optimized arches here after a capability test */

    cpu_check_features();

    if (use_byfour) {
#if BYTE_ORDER == LITTLE_ENDIAN
        functable.crc32 = crc32_little;
# if defined(ARM_ACLE_CRC_HASH)
        if (arm_cpu_has_crc32)
            functable.crc32 = crc32_acle;
# endif
#elif BYTE_ORDER == BIG_ENDIAN
        functable.crc32 = crc32_big;
#else
# error No endian defined
#endif
    } else {
        functable.crc32 = crc32_generic;
    }

    return functable.crc32(crc, buf, len);
}

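/*
 * Note (illustrative): crc32 is resolved a little differently from the other
 * entries. The word-at-a-time implementations (crc32_little / crc32_big) are
 * chosen by byte order, and only when pointer arithmetic through ptrdiff_t is
 * usable (use_byfour); otherwise the byte-at-a-time crc32_generic fallback is
 * kept. On little-endian builds with ARM ACLE support, the hardware CRC
 * variant (crc32_acle) overrides that choice when the CPU reports it.
 */
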
Z_INTERNAL uint32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) {

    functable.compare258 = &compare258_c;

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.compare258 = &compare258_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.compare258 = &compare258_unaligned_32;
# else
    functable.compare258 = &compare258_unaligned_16;
# endif
# ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.compare258 = &compare258_unaligned_sse4;
# endif
# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.compare258 = &compare258_unaligned_avx2;
# endif
#endif

    return functable.compare258(src0, src1);
}

Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {

    functable.longest_match = &longest_match_c;

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match = &longest_match_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match = &longest_match_unaligned_32;
# else
    functable.longest_match = &longest_match_unaligned_16;
# endif
# ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.longest_match = &longest_match_unaligned_sse4;
# endif
# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match = &longest_match_unaligned_avx2;
# endif
#endif

    return functable.longest_match(s, cur_match);
}

/* functable init */
Z_INTERNAL Z_TLS struct functable_s functable = {
    insert_string_stub,
    quick_insert_string_stub,
    adler32_stub,
    crc32_stub,
    slide_hash_stub,
    compare258_stub,
    longest_match_stub,
    chunksize_stub,
    chunkcopy_stub,
    chunkcopy_safe_stub,
    chunkunroll_stub,
    chunkmemset_stub,
    chunkmemset_safe_stub
};
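
/*
 * Illustrative usage (hypothetical caller, not part of this file): the first
 * call through any slot resolves it, later calls go straight to the selected
 * implementation.
 *
 *     uint32_t a = functable.adler32(1, buf, len);   // runs adler32_stub, patches the slot
 *     a = functable.adler32(a, buf2, len2);          // direct call to the chosen variant
 */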