[Library] Update zlibng (#1255)

* Update zlibng

* Set cmake path more directly in zlibng to hopefully fix an issue with the build on drone

* I'm dumb, missing / in path

* Mackal helps with a dumb gitignore issue

* Adding all the files, not sure what's ignoring them and im tired of looking

* Some tweaks to zlibng build to hopefully get it to build properly. works on msvc now
This commit is contained in:
Alex
2021-02-23 17:00:26 -08:00
committed by GitHub
parent e6dee96266
commit 2957f5084d
184 changed files with 22029 additions and 11703 deletions
+25 -11
View File
@@ -6,19 +6,27 @@ CC=
CFLAGS=
SFLAGS=
INCLUDES=
ACLEFLAG=
NEONFLAG=
SUFFIX=
SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
all: \
adler32_neon.o adler32_neon.lo \
armfeature.o armfeature.lo \
chunkset_neon.o chunkset_neon.lo \
crc32_acle.o crc32_acle.lo \
slide_neon.o slide_neon.lo \
insert_string_acle.o insert_string_acle.lo
adler32_neon.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
$(CC) $(CFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
adler32_neon.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
$(CC) $(SFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
armfeature.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/armfeature.c
@@ -26,23 +34,29 @@ armfeature.o:
armfeature.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/armfeature.c
chunkset_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
chunkset_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
crc32_acle.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
$(CC) $(CFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
crc32_acle.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
$(CC) $(SFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
fill_window_arm.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
slide_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c
fill_window_arm.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
slide_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c
insert_string_acle.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
$(CC) $(CFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
insert_string_acle.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
$(CC) $(SFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
mostlyclean: clean
clean:
+10 -18
View File
@@ -2,24 +2,16 @@
* Copyright (C) 2017 ARM Holdings Inc.
* Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "adler32_neon.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include "adler32_p.h"
#ifdef ARM_NEON_ADLER32
#ifdef _M_ARM64
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
#include "../../zutil.h"
#include "../../adler32_p.h"
static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
static const uint8_t taps[32] = {
@@ -109,7 +101,7 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
for (i = 0; i < len; i += n) {
if ((i + n) > len)
n = len - i;
n = (int)(len - i);
if (n < 16)
break;
-29
View File
@@ -1,29 +0,0 @@
/* Copyright (C) 1995-2011, 2016 Mark Adler
* Copyright (C) 2017 ARM Holdings Inc.
* Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#ifndef __ADLER32_NEON__
#define __ADLER32_NEON__
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// Depending on the compiler flavor, size_t may be defined in one or the other header. See:
// http://stackoverflow.com/questions/26410466/gcc-linaro-compiler-throws-error-unknown-type-name-size-t
#include <stdint.h>
#include <stddef.h>
uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#endif
+1 -1
View File
@@ -8,6 +8,6 @@
extern int arm_cpu_has_neon;
extern int arm_cpu_has_crc32;
void ZLIB_INTERNAL arm_check_features(void);
void Z_INTERNAL arm_check_features(void);
#endif /* ARM_H_ */
+49 -30
View File
@@ -1,50 +1,69 @@
#include "zutil.h"
#include "../../zutil.h"
#if defined(__linux__)
# include <sys/auxv.h>
# include <asm/hwcap.h>
# include <sys/auxv.h>
# include <asm/hwcap.h>
#elif defined(__FreeBSD__) && defined(__aarch64__)
# include <machine/armreg.h>
# ifndef ID_AA64ISAR0_CRC32_VAL
# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
# endif
#elif defined(__APPLE__)
# include <sys/sysctl.h>
#elif defined(_WIN32)
# include <winapifamily.h>
# include <winapifamily.h>
#endif
static int arm_has_crc32() {
#if defined(__linux__) && defined(HWCAP2_CRC32)
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
#elif defined(__FreeBSD__) && defined(__aarch64__)
return getenv("QEMU_EMULATING") == NULL
&& ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
#elif defined(__APPLE__)
int hascrc32;
size_t size = sizeof(hascrc32);
return sysctlbyname("hw.optional.armv8_crc32", &hascrc32, &size, NULL, 0) == 0
&& hascrc32 == 1;
#elif defined(ARM_NOCHECK_ACLE)
return 1;
return 1;
#else
return 0;
return 0;
#endif
}
/* AArch64 has neon. */
#if !defined(__aarch64__)
static inline int arm_has_neon()
{
#if defined(__linux__) && defined(HWCAP_NEON)
#if !defined(__aarch64__) && !defined(_M_ARM64)
static inline int arm_has_neon() {
#if defined(__linux__) && defined(HWCAP_NEON)
return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0;
#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
#elif defined(__APPLE__)
int hasneon;
size_t size = sizeof(hasneon);
return sysctlbyname("hw.optional.neon", &hasneon, &size, NULL, 0) == 0
&& hasneon == 1;
#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
return 1; /* Always supported */
#endif
#endif
# endif
#endif
#if defined(ARM_NOCHECK_NEON)
#if defined(ARM_NOCHECK_NEON)
return 1;
#else
return 0;
#endif
}
#endif
ZLIB_INTERNAL int arm_cpu_has_neon;
ZLIB_INTERNAL int arm_cpu_has_crc32;
void ZLIB_INTERNAL arm_check_features(void) {
#if defined(__aarch64__)
arm_cpu_has_neon = 1; /* always available */
#else
arm_cpu_has_neon = arm_has_neon();
return 0;
#endif
arm_cpu_has_crc32 = arm_has_crc32();
}
#endif
Z_INTERNAL int arm_cpu_has_neon;
Z_INTERNAL int arm_cpu_has_crc32;
void Z_INTERNAL arm_check_features(void) {
#if defined(__aarch64__) || defined(_M_ARM64)
arm_cpu_has_neon = 1; /* always available */
#else
arm_cpu_has_neon = arm_has_neon();
#endif
arm_cpu_has_crc32 = arm_has_crc32();
}
+54
View File
@@ -0,0 +1,54 @@
/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef ARM_NEON_CHUNKSET
#ifdef _M_ARM64
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
#include "../../zbuild.h"
#include "../../zutil.h"
#include <string.h>
typedef uint8x16_t chunk_t;
#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
/* Replicate the single byte at `from` into all 16 byte lanes of *chunk. */
static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
    const chunk_t splat = vld1q_dup_u8(from);

    *chunk = splat;
}
/* Replicate the 2-byte pattern at `from` into all eight 16-bit lanes of *chunk.
 *
 * `from` is a plain byte pointer with no visible alignment guarantee, so the
 * pattern is fetched with memcpy instead of dereferencing a casted pointer
 * (which would be an unaligned access and a strict-aliasing violation). */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = vreinterpretq_u8_s16(vdupq_n_s16(pattern));
}
/* Replicate the 4-byte pattern at `from` into all four 32-bit lanes of *chunk.
 *
 * `from` is a plain byte pointer with no visible alignment guarantee, so the
 * pattern is fetched with memcpy instead of dereferencing a casted pointer
 * (which would be an unaligned access and a strict-aliasing violation). */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t pattern;

    memcpy(&pattern, from, sizeof(pattern));
    *chunk = vreinterpretq_u8_s32(vdupq_n_s32(pattern));
}
/* Replicate the 8-byte pattern at `from` into both 64-bit halves of *chunk. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    const uint8x8_t pattern = vld1_u8(from);

    *chunk = vcombine_u8(pattern, pattern);
}
#define CHUNKSIZE chunksize_neon
#define CHUNKCOPY chunkcopy_neon
#define CHUNKCOPY_SAFE chunkcopy_safe_neon
#define CHUNKUNROLL chunkunroll_neon
#define CHUNKMEMSET chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
/* Read 16 bytes starting at `s` into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    const chunk_t loaded = vld1q_u8(s);

    *chunk = loaded;
}
/* Write the 16 bytes held in *chunk to memory starting at `out`. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    const chunk_t value = *chunk;

    vst1q_u8(out, value);
}
#include "chunkset_tpl.h"
#endif
+14 -19
View File
@@ -5,21 +5,16 @@
*
*/
#ifdef __ARM_FEATURE_CRC32
# include <arm_acle.h>
# ifdef ZLIB_COMPAT
# include <zconf.h>
# else
# include <zconf-ng.h>
# endif
# ifdef __linux__
# include <stddef.h>
# endif
#ifdef ARM_ACLE_CRC_HASH
#ifndef _MSC_VER
# include <arm_acle.h>
#endif
#include "../../zutil.h"
uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
register uint32_t c;
register const uint16_t *buf2;
register const uint32_t *buf4;
Z_REGISTER uint32_t c;
Z_REGISTER const uint16_t *buf2;
Z_REGISTER const uint32_t *buf4;
c = ~crc;
if (len && ((ptrdiff_t)buf & 1)) {
@@ -36,7 +31,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
buf4 = (const uint32_t *) buf;
}
# if defined(__aarch64__)
#if defined(__aarch64__)
if ((len > sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
c = __crc32w(c, *buf4++);
len -= sizeof(uint32_t);
@@ -44,7 +39,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
const uint64_t *buf8 = (const uint64_t *) buf4;
# ifdef UNROLL_MORE
#ifdef UNROLL_MORE
while (len >= 4 * sizeof(uint64_t)) {
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
@@ -52,7 +47,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
c = __crc32d(c, *buf8++);
len -= 4 * sizeof(uint64_t);
}
# endif
#endif
while (len >= sizeof(uint64_t)) {
c = __crc32d(c, *buf8++);
@@ -74,7 +69,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
}
buf = (const unsigned char *) buf2;
# else /* __aarch64__ */
#else /* __aarch64__ */
# ifdef UNROLL_MORE
while (len >= 8 * sizeof(uint32_t)) {
@@ -103,7 +98,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
} else {
buf = (const unsigned char *) buf4;
}
# endif /* __aarch64__ */
#endif /* __aarch64__ */
if (len) {
c = __crc32b(c, *buf);
@@ -112,4 +107,4 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
c = ~c;
return c;
}
#endif /* __ARM_FEATURE_CRC32 */
#endif
+1 -1
View File
@@ -5,7 +5,7 @@
#if defined(_MSC_VER) && !defined(__clang__)
static __forceinline unsigned long __builtin_ctzl(unsigned long value) {
return _arm_clz(_arm_rbit(value));
return _arm_clz(_arm_rbit(value));
}
#endif
-169
View File
@@ -1,169 +0,0 @@
/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions
* Copyright (C) 2017 Mika T. Lindqvist
*
* Authors:
* Mika T. Lindqvist <postmaster@raasu.org>
* Jun He <jun.he@arm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* @(#) $Id$ */
#include "zbuild.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
extern ZLIB_INTERNAL int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
/* SIMD version of hash_chain rebase */
/* Rebase a position table: subtract window_size from each of the `entries`
 * elements of `table`, saturating at 0 (vqsubq_u16).
 *
 * Each loop iteration handles eight 128-bit vectors, i.e. 128 bytes of table,
 * so the table size in bytes must be a multiple of that stride. */
static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
    register uint16x8_t v, *p;
    register size_t n;

    size_t size = entries*sizeof(table[0]);
    /* Parenthesised so that the check matches the loop stride below; without
     * the parentheses `%` binds first and only size % 16 is tested. */
    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");
    Assert(sizeof(Pos) == 2, "Wrong Pos size");
    v = vdupq_n_u16(window_size);

    p = (uint16x8_t *)table;
    /* Counted loop (rather than do/while) so that n == 0 does nothing instead
     * of wrapping the counter around. */
    for (n = size / (sizeof(uint16x8_t) * 8); n > 0; --n) {
        p[0] = vqsubq_u16(p[0], v);
        p[1] = vqsubq_u16(p[1], v);
        p[2] = vqsubq_u16(p[2], v);
        p[3] = vqsubq_u16(p[3], v);
        p[4] = vqsubq_u16(p[4], v);
        p[5] = vqsubq_u16(p[5], v);
        p[6] = vqsubq_u16(p[6], v);
        p[7] = vqsubq_u16(p[7], v);
        p += 8;
    }
}
#else
/* generic version for hash rebase */
/* Scalar rebase of a position table: every element that is at least
 * window_size is reduced by window_size, every other element becomes NIL. */
static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
    Pos *cur = table;
    Pos *const end = table + entries;

    for (; cur != end; ++cur) {
        if (*cur >= window_size)
            *cur = *cur - window_size;
        else
            *cur = NIL;
    }
}
#endif
/* Refill the deflate window of `s` from its input stream.
 *
 * Loops until lookahead reaches MIN_LOOKAHEAD or the stream has no more input.
 * When strstart has advanced past wsize + MAX_DIST(s), the upper half of the
 * window is copied down and the head/prev tables are rebased with
 * slide_hash_chain(). After reading, pending strings are (re)inserted through
 * functable.insert_string, and finally up to WIN_INIT bytes past the current
 * data are zeroed and high_water is advanced.
 */
void fill_window_arm(deflate_state *s) {
register unsigned n;
unsigned long more; /* Amount of free space at the end of the window. */
unsigned int wsize = s->w_size;
Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
do {
more = s->window_size - s->lookahead - s->strstart;
/* If the window is almost full and there is insufficient lookahead,
* move the upper half to the lower one to make room in the upper half.
*/
if (s->strstart >= wsize+MAX_DIST(s)) {
memcpy(s->window, s->window+wsize, wsize);
s->match_start -= wsize;
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
s->block_start -= wsize;
/* Slide the hash table (could be avoided with 32 bit values
at the expense of memory usage). We slide even when level == 0
to keep the hash table consistent if we switch back to level > 0
later. (Using level 0 permanently is not an optimal usage of
zlib, so we don't care about this pathological case.)
*/
slide_hash_chain(s->head, s->hash_size, wsize);
slide_hash_chain(s->prev, wsize, wsize);
/* The slide freed wsize bytes at the end of the window. */
more += wsize;
}
/* Nothing left to read: leave the loop and go on to the high-water update. */
if (s->strm->avail_in == 0)
break;
/* If there was no sliding:
* strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
* more == window_size - lookahead - strstart
* => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
* => more >= window_size - 2*WSIZE + 2
* In the BIG_MEM or MMAP case (not yet supported),
* window_size == input_size + MIN_LOOKAHEAD &&
* strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
* Otherwise, window_size == 2*WSIZE so more >= 2.
* If there was sliding, more >= WSIZE. So in all cases, more >= 2.
*/
Assert(more >= 2, "more < 2");
/* n is the value returned by read_buf; it is added to lookahead as the
* number of new bytes now available after strstart + lookahead. */
n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
s->lookahead += n;
/* Initialize the hash value now that we have some input: */
if (s->lookahead + s->insert >= MIN_MATCH) {
unsigned int str = s->strstart - s->insert;
unsigned int insert_cnt = s->insert;
unsigned int slen;
s->ins_h = s->window[str];
/* NOTE(review): with lookahead < MIN_MATCH the added term is an unsigned
* wrap-around, i.e. it reduces insert_cnt -- presumably intentional. */
if (unlikely(s->lookahead < MIN_MATCH))
insert_cnt += s->lookahead - MIN_MATCH;
/* slen keeps the count before the adjustment below; it is what gets
* subtracted from s->insert. */
slen = insert_cnt;
if (str >= (MIN_MATCH - 2))
{
str += 2 - MIN_MATCH;
insert_cnt += MIN_MATCH - 2;
}
if (insert_cnt > 0)
{
functable.insert_string(s, str, insert_cnt);
s->insert -= slen;
}
}
/* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
* but this is not important since only literal bytes will be emitted.
*/
} while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
/* If the WIN_INIT bytes after the end of the current data have never been
* written, then zero those bytes in order to avoid memory check reports of
* the use of uninitialized (or uninitialised as Julian writes) bytes by
* the longest match routines. Update the high water mark for the next
* time through here. WIN_INIT is set to MAX_MATCH since the longest match
* routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
*/
if (s->high_water < s->window_size) {
unsigned long curr = s->strstart + (unsigned long)s->lookahead;
unsigned long init;
if (s->high_water < curr) {
/* Previous high water mark below current data -- zero WIN_INIT
* bytes or up to end of window, whichever is less.
*/
init = s->window_size - curr;
if (init > WIN_INIT)
init = WIN_INIT;
memset(s->window + curr, 0, init);
s->high_water = curr + init;
} else if (s->high_water < curr + WIN_INIT) {
/* High water mark at or above current data, but below current data
* plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
* to end of window, whichever is less.
*/
init = curr + WIN_INIT;
if (init > s->window_size)
init = s->window_size;
init -= s->high_water;
memset(s->window + s->high_water, 0, init);
s->high_water += init;
}
}
Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
}
+14 -45
View File
@@ -5,49 +5,18 @@
*
*/
#if defined(__ARM_FEATURE_CRC32) && defined(ARM_ACLE_CRC_HASH)
#include <arm_acle.h>
#include "zbuild.h"
#include "deflate.h"
/* ===========================================================================
* Insert string str in the dictionary and set match_head to the previous head
* of the hash chain (the most recent string with same hash key). Return
* the previous length of the hash chain.
* IN assertion: all calls to to INSERT_STRING are made with consecutive
* input characters and the first MIN_MATCH bytes of str are valid
* (except for the last MIN_MATCH-1 bytes of the input file).
*/
/* Insert `count` consecutive strings starting at position `str` into the hash
 * table of `s`, hashing each one with the CRC32 instruction (__crc32w).
 * Returns the previous head of the chain for the last inserted position (or
 * that position itself if it already was the head); for count == 0 returns
 * the prev[] entry of `str` without modifying anything.
 */
Pos insert_string_acle(deflate_state *const s, const Pos str, unsigned int count) {
Pos p, lp, ret;
if (unlikely(count == 0)) {
return s->prev[str & s->w_mask];
}
ret = 0;
lp = str + count - 1; /* last position */
for (p = str; p <= lp; p++) {
uint32_t val, h, hm;
/* Fetch 4 window bytes via memcpy (no alignment assumption on window + p). */
memcpy(&val, &s->window[p], sizeof(val));
/* At or above TRIGGER_LEVEL only the low 24 bits of val are hashed. */
if (s->level >= TRIGGER_LEVEL)
val &= 0xFFFFFF;
h = __crc32w(0, val);
hm = h & s->hash_mask;
Pos head = s->head[hm];
/* Link p in front of the existing chain unless it is already the head. */
if (head != p) {
s->prev[p & s->w_mask] = head;
s->head[hm] = p;
if (p == lp)
ret = head;
} else if (p == lp) {
ret = p;
}
}
return ret;
}
#ifdef ARM_ACLE_CRC_HASH
#ifndef _MSC_VER
# include <arm_acle.h>
#endif
#include "../../zbuild.h"
#include "../../deflate.h"
#define UPDATE_HASH(s, h, val) \
h = __crc32w(0, val)
#define INSERT_STRING insert_string_acle
#define QUICK_INSERT_STRING quick_insert_string_acle
#include "../../insert_string_tpl.h"
#endif
+52
View File
@@ -0,0 +1,52 @@
/* slide_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
* Copyright (C) 2017-2020 Mika T. Lindqvist
*
* Authors:
* Mika T. Lindqvist <postmaster@raasu.org>
* Jun He <jun.he@arm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#if defined(ARM_NEON_SLIDEHASH)
#ifdef _M_ARM64
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
#include "../../zbuild.h"
#include "../../deflate.h"
/* SIMD version of hash_chain rebase */
/* Rebase a position table: subtract window_size from each of the `entries`
 * elements of `table`, saturating at 0 (vqsubq_u16).
 *
 * Each loop iteration handles eight 128-bit vectors, i.e. 128 bytes of table,
 * so the table size in bytes must be a multiple of that stride. */
static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
    Z_REGISTER uint16x8_t v, *p;
    Z_REGISTER size_t n;

    size_t size = entries*sizeof(table[0]);
    /* Parenthesised so that the check matches the loop stride below; without
     * the parentheses `%` binds first and only size % 16 is tested. */
    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");
    Assert(sizeof(Pos) == 2, "Wrong Pos size");
    v = vdupq_n_u16(window_size);

    p = (uint16x8_t *)table;
    /* Counted loop (rather than do/while) so that n == 0 does nothing instead
     * of wrapping the counter around. */
    for (n = size / (sizeof(uint16x8_t) * 8); n > 0; --n) {
        p[0] = vqsubq_u16(p[0], v);
        p[1] = vqsubq_u16(p[1], v);
        p[2] = vqsubq_u16(p[2], v);
        p[3] = vqsubq_u16(p[3], v);
        p[4] = vqsubq_u16(p[4], v);
        p[5] = vqsubq_u16(p[5], v);
        p[6] = vqsubq_u16(p[6], v);
        p[7] = vqsubq_u16(p[7], v);
        p += 8;
    }
}
/* Rebase both position tables of `s` by its window size: first the head table
 * (HASH_SIZE entries), then the prev table (w_size entries). */
Z_INTERNAL void slide_hash_neon(deflate_state *s) {
    const unsigned int window = s->w_size;

    slide_hash_chain(s->head, HASH_SIZE, window);
    slide_hash_chain(s->prev, window, window);
}
#endif
+49
View File
@@ -0,0 +1,49 @@
# Makefile for POWER-specific files
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
# For conditions of distribution and use, see copyright notice in zlib.h
#
# Each source is compiled twice: to a .o using CFLAGS and to a .lo using
# SFLAGS. The POWER8 sources additionally get P8FLAGS.
# NOTE(review): the empty variables below are presumably filled in by the
# configure step -- confirm against the top-level build scripts.

CC=
CFLAGS=
SFLAGS=
INCLUDES=
SUFFIX=

SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)

P8FLAGS=-mcpu=power8

# These names are commands, not files; keep a stray file called "clean" (etc.)
# from masking them.
.PHONY: all mostlyclean clean distclean

all: power.o \
     power.lo \
     adler32_power8.o \
     adler32_power8.lo \
     slide_hash_power8.o \
     slide_hash_power8.lo

# Every object lists its source as a prerequisite so that editing the source
# triggers a rebuild of an already existing object.
power.o: $(SRCDIR)/power.c
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c

power.lo: $(SRCDIR)/power.c
	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c

adler32_power8.o: $(SRCDIR)/adler32_power8.c
	$(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c

adler32_power8.lo: $(SRCDIR)/adler32_power8.c
	$(CC) $(SFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c

slide_hash_power8.o: $(SRCDIR)/slide_hash_power8.c
	$(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c

slide_hash_power8.lo: $(SRCDIR)/slide_hash_power8.c
	$(CC) $(SFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c

mostlyclean: clean
clean:
	rm -f *.o *.lo *~
	rm -rf objs
	rm -f *.gcda *.gcno *.gcov

distclean:
	rm -f Makefile
+154
View File
@@ -0,0 +1,154 @@
/* Adler32 for POWER8 using VSX instructions.
* Copyright (C) 2020 IBM Corporation
* Author: Rogerio Alves <rcardoso@linux.ibm.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
* instructions.
*
* When adler32 processes one byte at a time, s1_n denotes the value of s1
* after iteration n, and s1_0 is the low half of the initial adler value
* (1 unless the caller passed a different starting checksum). With the
* input bytes numbered c[1], c[2], ..., the first iteration gives
* s1_1 = s1_0 + c[1], the second gives s1_2 = s1_1 + c[2], and so on.
* Hence s1_N = s1_(N-1) + c[N] is the value of s1 after iteration N.
*
* Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_0 + N*c[1] +
* (N-1)*c[2] + ... + c[N]
*
* In a more general way:
*
* s1_N = s1_0 + sum(i=1 to N)c[i]
* s2_N = s2_0 + N*s1_0 + sum(i=1 to N)(N-i+1)*c[i]
*
* Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
* can process N bytes at a time we can do this at once.
*
* Since a VSX vector register holds 16 bytes, we can process 16 bytes at a
* time; using N = 16 we have:
*
* s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
* s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i]
*
* After the first iteration we calculate the adler32 checksum for 16 bytes.
*
* For more background about adler32 please check the RFC:
* https://www.ietf.org/rfc/rfc1950.txt
*/
#ifdef POWER8_VSX_ADLER32
#include <altivec.h>
#include "zbuild.h"
#include "zutil.h"
#include "adler32_p.h"
/* Sum across the four unsigned int lanes of a vector (plain vec_add, no saturation). */
/* Horizontal add: returns a vector in which every lane holds the sum of the
 * four unsigned int lanes of `a`.
 *
 * The incoming value of `b` is ignored; the parameter is only used as scratch
 * space and is kept so existing two-argument call sites still compile.
 *
 * Declared `static inline` because a plain C99 `inline` definition emits no
 * external symbol, which can fail at link time when the call is not inlined. */
static inline vector unsigned int vec_sumsu(vector unsigned int a, vector unsigned int b) {
    /* Rotate by two lanes and add: lanes become {a0+a2, a1+a3, a0+a2, a1+a3}. */
    b = vec_sld(a, a, 8);
    b = vec_add(b, a);
    /* Rotate by one lane and add: every lane becomes a0+a1+a2+a3. */
    a = vec_sld(b, b, 4);
    a = vec_add(a, b);
    return a;
}
/* Compute the Adler-32 checksum of buf[0..len) continuing from `adler`.
 *
 * Inputs of length 1, a NULL buffer and inputs shorter than 64 bytes are
 * delegated to scalar paths. Longer inputs are consumed 16 bytes per vector
 * step: first in blocks of NMAX bytes (one modulo BASE per block), then the
 * remaining whole 16-byte groups, and finally the < 16 byte tail through
 * adler32_len_16(). Returns the updated checksum.
 */
uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len) {
/* Split the running checksum into its two 16-bit halves. */
uint32_t s1 = adler & 0xffff;
uint32_t s2 = (adler >> 16) & 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(s1, buf, s2);
/* A NULL buffer returns the initial Adler-32 value (1). */
if (UNLIKELY(buf == NULL))
return 1;
/* This is faster than VSX code for len < 64. */
if (len < 64)
return adler32_len_64(s1, buf, len, s2);
/* Use POWER VSX instructions for len >= 64. */
const vector unsigned int v_zeros = { 0 };
/* Per-byte weights 16..1 used for the s2 contribution of each 16-byte group. */
const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
6, 5, 4, 3, 2, 1};
/* Shift count 4, i.e. multiply by 16 when used with vec_sll below. */
const vector unsigned char vsh = vec_splat_u8(4);
/* Keeps only element 0 of a vector. */
const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
vector unsigned int vs1 = { 0 };
vector unsigned int vs2 = { 0 };
vector unsigned int vs1_save = { 0 };
vector unsigned int vsum1, vsum2;
vector unsigned char vbuf;
int n;
/* Seed element 0 of the accumulators with the incoming s1/s2. */
vs1[0] = s1;
vs2[0] = s2;
/* Do length bigger than NMAX in blocks of NMAX size. */
while (len >= NMAX) {
len -= NMAX;
n = NMAX / 16;
do {
vbuf = vec_xl(0, (unsigned char *) buf);
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
/* sum(i=1 to 16) buf[i]*(16-i+1). */
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
/* Save vs1. */
vs1_save = vec_add(vs1_save, vs1);
/* Accumulate the sums. */
vs1 = vec_add(vsum1, vs1);
vs2 = vec_add(vsum2, vs2);
buf += 16;
} while (--n);
/* Once each block of NMAX size. */
vs1 = vec_sumsu(vs1, vsum1);
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
vs2 = vec_add(vs1_save, vs2);
vs2 = vec_sumsu(vs2, vsum2);
/* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
vs1[0] = vs1[0] % BASE;
/* vs2[0] = s2_i + 16*s1_save +
sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
vs2[0] = vs2[0] % BASE;
/* Clear elements 1..3 so the next block starts from the reduced totals only. */
vs1 = vec_and(vs1, vmask);
vs2 = vec_and(vs2, vmask);
vs1_save = v_zeros;
}
/* len is less than NMAX one modulo is needed. */
if (len >= 16) {
while (len >= 16) {
len -= 16;
vbuf = vec_xl(0, (unsigned char *) buf);
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
/* sum(i=1 to 16) buf[i]*(16-i+1). */
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
/* Save vs1. */
vs1_save = vec_add(vs1_save, vs1);
/* Accumulate the sums. */
vs1 = vec_add(vsum1, vs1);
vs2 = vec_add(vsum2, vs2);
buf += 16;
}
/* Since the size will be always less than NMAX we do this once. */
vs1 = vec_sumsu(vs1, vsum1);
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
vs2 = vec_add(vs1_save, vs2);
vs2 = vec_sumsu(vs2, vsum2);
}
/* Copy result back to s1, s2 (mod 65521). */
s1 = vs1[0] % BASE;
s2 = vs2[0] % BASE;
/* Process tail (len < 16) and return. */
return adler32_len_16(s1, buf, len, s2);
}
#endif /* POWER8_VSX_ADLER32 */
+19
View File
@@ -0,0 +1,19 @@
/* POWER feature check
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <sys/auxv.h>
#include "../../zutil.h"
Z_INTERNAL int power_cpu_has_arch_2_07;
/* Query AT_HWCAP2 from the auxiliary vector and, when built with POWER8
 * defined, set power_cpu_has_arch_2_07 if PPC_FEATURE2_ARCH_2_07 is present.
 * The flag is never cleared here. */
void Z_INTERNAL power_check_features(void) {
    unsigned long hwcap2 = getauxval(AT_HWCAP2);

#ifdef POWER8
    if ((hwcap2 & PPC_FEATURE2_ARCH_2_07) != 0)
        power_cpu_has_arch_2_07 = 1;
#endif
}
+13
View File
@@ -0,0 +1,13 @@
/* power.h -- check for POWER CPU features
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_H_
#define POWER_H_
extern int power_cpu_has_arch_2_07;
void Z_INTERNAL power_check_features(void);
#endif /* POWER_H_ */
@@ -0,0 +1,60 @@
/* Optimized slide_hash for POWER processors
* Copyright (C) 2019-2020 IBM Corporation
* Author: Matheus Castanho <msc@linux.ibm.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef POWER8_VSX_SLIDEHASH
#include <altivec.h>
#include "zbuild.h"
#include "deflate.h"
/* Subtract s->w_size (saturating at 0) from the n_elems table entries that end
 * at table_end, walking backwards eight entries (one vector) at a time. */
static inline void slide_hash_power8_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
    vector unsigned short vw, vm, *vp;
    unsigned chunks;

    /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
     * so instead of processing each of the n_elems in the hash table
     * individually, we can do it in chunks of 8 with vector instructions.
     *
     * This function is only called from slide_hash_power8(), and both calls
     * pass n_elems as a power of 2 higher than 2^7, as defined by
     * deflateInit2_(), so n_elems will always be a multiple of 8. */
    chunks = n_elems >> 3;
    Assert(n_elems % 8 == 0, "Weird hash table size!");

    /* This type casting is safe since s->w_size is always <= 64KB
     * as defined by deflateInit2_() and Posf == unsigned short.
     *
     * vec_splats broadcasts the scalar into every lane directly, which avoids
     * writing a single element of a still-uninitialised vector first. */
    vw = vec_splats((unsigned short)(Pos) s->w_size);

    vp = (vector unsigned short *) table_end;

    do {
        /* Processing 8 elements at a time */
        vp--;
        vm = *vp;

        /* This is equivalent to: m >= w_size ? m - w_size : 0
         * Since we are using a saturated unsigned subtraction, any
         * values that are < w_size will be set to 0, while the others
         * will have w_size subtracted from them. */
        *vp = vec_subs(vm,vw);
    } while (--chunks);
}
/* Rebase both position tables of `s`: the head table (HASH_SIZE entries)
 * followed by the prev table (w_size entries). Each call receives a pointer
 * one past the table's last entry. */
void Z_INTERNAL slide_hash_power8(deflate_state *s) {
    unsigned int entries = HASH_SIZE;

    slide_hash_power8_loop(s, entries, &s->head[entries]);

    entries = s->w_size;
    slide_hash_power8_loop(s, entries, &s->prev[entries]);
}
#endif /* POWER8_VSX_SLIDEHASH */
+187 -40
View File
@@ -1,6 +1,7 @@
This directory contains IBM Z DEFLATE CONVERSION CALL support for
zlib-ng. In order to enable it, the following build commands should be
used:
# Introduction
This directory contains SystemZ deflate hardware acceleration support.
It can be enabled using the following build commands:
$ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
$ make
@@ -10,60 +11,206 @@ or
$ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
$ make
When built like this, zlib-ng would compress in hardware on level 1,
and in software on all other levels. Decompression will always happen
in hardware. In order to enable DFLTCC compression for levels 1-6 (i.e.
to make it used by default) one could add -DDFLTCC_LEVEL_MASK=0x7e to
CFLAGS when building zlib-ng.
When built like this, zlib-ng would compress using hardware on level 1,
and using software on all other levels. Decompression will always happen
in hardware. In order to enable hardware compression for levels 1-6
(i.e. to make it used by default) one could add
`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.
Two DFLTCC compression calls produce the same results only when they
both are made on machines of the same generation, and when the
respective buffers have the same offset relative to the start of the
page. Therefore care should be taken when using hardware compression
when reproducible results are desired.
SystemZ deflate hardware acceleration is available on [IBM z15](
https://www.ibm.com/products/z15) and newer machines under the name [
"Integrated Accelerator for zEnterprise Data Compression"](
https://www.ibm.com/support/z-content-solutions/compression/). The
programming interface to it is a machine instruction called DEFLATE
CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
of Operation](http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
the code and the rest of this document refer to this feature simply as
"DFLTCC".
# Performance
Performance figures are published [here](
https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
). The compression speed-up can be as high as 110x and the decompression
speed-up can be as high as 15x.
# Limitations
Two DFLTCC compression calls with identical inputs are not guaranteed to
produce identical outputs. Therefore, care should be taken when using
hardware compression if reproducible results are desired. In
particular, the zlib-ng-specific `zng_deflateSetParams` call allows
setting the `Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC
support for a particular stream.
DFLTCC does not support every single zlib-ng feature, in particular:
* inflate(Z_BLOCK) and inflate(Z_TREES)
* inflateMark()
* inflatePrime()
* deflateParams() after the first deflate() call
* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
* `inflateMark()`
* `inflatePrime()`
* `inflateSyncPoint()`
When used, these functions will either switch to software, or, in case
this is not possible, gracefully fail.
All SystemZ-specific code lives in a separate file and is integrated
with the rest of zlib-ng using hook macros, which are explained below.
# Code structure
All SystemZ-specific code lives in `arch/s390` directory and is
integrated with the rest of zlib-ng using hook macros.
## Hook macros
DFLTCC takes as arguments a parameter block, an input buffer, an output
buffer and a window. ZALLOC_STATE, ZFREE_STATE, ZCOPY_STATE,
ZALLOC_WINDOW and TRY_FREE_WINDOW macros encapsulate allocation details
for the parameter block (which is allocated alongside zlib-ng state)
and the window (which must be page-aligned).
buffer and a window. `ZALLOC_STATE()`, `ZFREE_STATE()`, `ZCOPY_STATE()`,
`ZALLOC_WINDOW()` and `TRY_FREE_WINDOW()` macros encapsulate allocation
details for the parameter block (which is allocated alongside zlib-ng
state) and the window (which must be page-aligned).
While for inflate software and hardware window formats match, this is
not the case for deflate. Therefore, deflateSetDictionary and
deflateGetDictionary need special handling, which is triggered using
the DEFLATE_SET_DICTIONARY_HOOK and DEFLATE_GET_DICTIONARY_HOOK macros.
While inflate software and hardware window formats match, this is not
the case for deflate. Therefore, `deflateSetDictionary()` and
`deflateGetDictionary()` need special handling, which is triggered using
`DEFLATE_SET_DICTIONARY_HOOK()` and `DEFLATE_GET_DICTIONARY_HOOK()`
macros.
deflateResetKeep() and inflateResetKeep() update the DFLTCC parameter
block using DEFLATE_RESET_KEEP_HOOK and INFLATE_RESET_KEEP_HOOK macros.
`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
`INFLATE_RESET_KEEP_HOOK()` macros.
DEFLATE_PARAMS_HOOK, INFLATE_PRIME_HOOK and INFLATE_MARK_HOOK macros
make the unsupported deflateParams(), inflatePrime() and inflateMark()
calls fail gracefully.
`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
calls gracefully fail.
`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
software compression mid-stream using `deflateParams()`. Switching
normally entails flushing the current block, which might not be possible
in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
in order to detect and gracefully handle such situations.
The algorithm implemented in hardware has a different compression ratio
than the one implemented in software. DEFLATE_BOUND_ADJUST_COMPLEN and
DEFLATE_NEED_CONSERVATIVE_BOUND macros make deflateBound() return the
correct results for the hardware implementation.
than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()`
and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
return the correct results for the hardware implementation.
Actual compression and decompression are handled by DEFLATE_HOOK and
INFLATE_TYPEDO_HOOK macros. Since inflation with DFLTCC manages the
window on its own, calling updatewindow() is suppressed using
INFLATE_NEED_UPDATEWINDOW() macro.
Actual compression and decompression are handled by `DEFLATE_HOOK()` and
`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
window on its own, calling `updatewindow()` is suppressed using
`INFLATE_NEED_UPDATEWINDOW()` macro.
In addition to compression, DFLTCC computes CRC-32 and Adler-32
checksums, therefore, whenever it's used, software checksumming is
suppressed using DEFLATE_NEED_CHECKSUM and INFLATE_NEED_CHECKSUM
suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
macros.
While software always produces reproducible compression results, this
is not the case for DFLTCC. Therefore, zlib-ng users are given the
ability to specify whether or not reproducible compression results
are required. While it is always possible to specify this setting
before the compression begins, it is not always possible to do so in
the middle of a deflate stream - the exact conditions for that are
determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
## SystemZ-specific code
When zlib-ng is built with DFLTCC, the hooks described above are
converted to calls to functions, which are implemented in
`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
categories:
* Base DFLTCC support, e.g. wrapping the machine instruction -
`dfltcc()` and allocating aligned memory - `dfltcc_alloc_state()`.
* Translating between software and hardware data formats, e.g.
`dfltcc_deflate_set_dictionary()`.
* Translating between software and hardware state machines, e.g.
`dfltcc_deflate()` and `dfltcc_inflate()`.
The functions from the first two categories are fairly simple; however,
various quirks in both software and hardware state machines make the
functions from the third category quite complicated.
### `dfltcc_deflate()` function
This function is called by `deflate()` and has the following
responsibilities:
* Checking whether DFLTCC can be used with the current stream. If this
is not the case, then it returns `0`, making `deflate()` use some
other function in order to compress in software. Otherwise it returns
`1`.
* Block management and Huffman table generation. DFLTCC ends blocks only
when explicitly instructed to do so by the software. Furthermore,
whether to use fixed or dynamic Huffman tables must also be determined
by the software. Since looking at data in order to gather statistics
would negate performance benefits, the following approach is used: the
first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
dynamic blocks.
* Writing EOBS. Block Closing Control bit in the parameter block
instructs DFLTCC to write EOBS, however, certain conditions need to be
met: input data length must be non-zero or Continuation Flag must be
set. To put this in simpler terms, DFLTCC will silently refuse to
write EOBS if this is the only thing that it is asked to do. Since the
code has to be able to emit EOBS in software anyway, in order to avoid
tricky corner cases Block Closing Control is never used. Whether to
write EOBS is instead controlled by `soft_bcc` variable.
* Triggering block post-processing. Depending on flush mode, `deflate()`
must perform various additional actions when a block or a stream ends.
`dfltcc_deflate()` informs `deflate()` about this using
`block_state *result` parameter.
* Converting software state fields into hardware parameter block fields,
and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
and Sub-Byte Boundary. Certain fields cannot be translated and must
persist untouched in the parameter block between calls, for example,
Continuation Flag or Continuation State Buffer.
* Handling flush modes and low-memory situations. These aspects are
quite intertwined and pervasive. The general idea here is that the
code must not do anything in software - whether explicitly by e.g.
calling `send_eobs()`, or implicitly - by returning to `deflate()`
with certain return and `*result` values, when Continuation Flag is
set.
* Ending streams. When a new block is started and flush mode is
`Z_FINISH`, Block Header Final parameter block bit is used to mark
this block as final. However, sometimes an empty final block is
needed, and, unfortunately, just like with EOBS, DFLTCC will silently
refuse to do this. The general idea of DFLTCC implementation is to
rely as much as possible on the existing code. Here in order to do
this, the code pretends that it does not support DFLTCC, which makes
`deflate()` call a software compression function, which writes an
empty final block. Whether this is required is controlled by
`need_empty_block` variable.
* Error handling. This simply converts the
Operation-Ending-Supplemental Code to a string. Errors can only happen
due to things like memory corruption, and therefore they don't affect
the `deflate()` return code.
### `dfltcc_inflate()` function
This function is called by `inflate()` from the `TYPEDO` state (that is,
when all the metadata is parsed and the stream is positioned at the type
bits of deflate block header) and it's responsible for the following:
* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
Unfortunately, there is no way to ask DFLTCC to stop decompressing on
block or tree boundary.
* `inflate()` decompression loop management. This is controlled using
the return value, which can be either `DFLTCC_INFLATE_BREAK` or
`DFLTCC_INFLATE_CONTINUE`.
* Converting software state fields into hardware parameter block fields,
and vice versa. For example, `whave` and History Length or `wnext` and
History Offset.
* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
and is controlled by `last` state field.
* Error handling. Like deflate, error handling comprises
Operation-Ending-Supplemental Code to string conversion. Unlike
deflate, errors may happen due to bad inputs, therefore they are
propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
# Testing
Given the complexity of the DFLTCC machine instruction, it is not clear
whether QEMU TCG will ever support it. At the time of writing, one has to
have access to an IBM z15+ VM or LPAR in order to test DFLTCC support.
Since DFLTCC is a non-privileged instruction, neither special VM/LPAR
configuration nor root access is required.
Still, zlib-ng CI has a few QEMU TCG-based configurations that check
whether fallback to software is working.
+25 -22
View File
@@ -1,6 +1,6 @@
/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL general support. */
#include "zbuild.h"
#include "../../zbuild.h"
#include "dfltcc_common.h"
#include "dfltcc_detail.h"
@@ -12,20 +12,31 @@
`posix_memalign' is not an option. Thus, we overallocate and take the
aligned portion of the buffer.
*/
static inline int is_dfltcc_enabled(void)
{
static inline int is_dfltcc_enabled(void) {
uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
register uint8_t r0 __asm__("r0");
Z_REGISTER uint8_t r0 __asm__("r0");
memset(facilities, 0, sizeof(facilities));
r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
__asm__ volatile("stfle %[facilities]\n" : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
/* STFLE is supported since z9-109 and only in z/Architecture mode. When
* compiling with -m31, gcc defaults to ESA mode, however, since the kernel
* is 64-bit, it's always z/Architecture mode at runtime.
*/
__asm__ volatile(
#ifndef __clang__
".machinemode push\n"
".machinemode zarch\n"
#endif
"stfle %[facilities]\n"
#ifndef __clang__
".machinemode pop\n"
#endif
: [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
}
void ZLIB_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size)
{
struct dfltcc_state *dfltcc_state = (struct dfltcc_state *)((char *)strm->state + size);
void Z_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size) {
struct dfltcc_state *dfltcc_state = (struct dfltcc_state *)((char *)strm->state + ALIGN_UP(size, 8));
struct dfltcc_qaf_param *param = (struct dfltcc_qaf_param *)&dfltcc_state->param;
/* Initialize available functions */
@@ -47,24 +58,17 @@ void ZLIB_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size)
dfltcc_state->param.ribm = DFLTCC_RIBM;
}
void ZLIB_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size)
{
Assert((items * size) % 8 == 0,
"The size of zlib-ng state must be a multiple of 8");
return ZALLOC(strm, items * size + sizeof(struct dfltcc_state), sizeof(unsigned char));
void Z_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size) {
return ZALLOC(strm, ALIGN_UP(items * size, 8) + sizeof(struct dfltcc_state), sizeof(unsigned char));
}
void ZLIB_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size)
{
memcpy(dst, src, size + sizeof(struct dfltcc_state));
void Z_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size) {
memcpy(dst, src, ALIGN_UP(size, 8) + sizeof(struct dfltcc_state));
}
static const int PAGE_ALIGN = 0x1000;
#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
void ZLIB_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size)
{
void Z_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size) {
void *p;
void *w;
@@ -79,8 +83,7 @@ void ZLIB_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt
return w;
}
void ZLIB_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w)
{
void Z_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w) {
if (w)
ZFREE(strm, *(void **)((unsigned char *)w - sizeof(void *)));
}
+8 -8
View File
@@ -2,17 +2,17 @@
#define DFLTCC_COMMON_H
#ifdef ZLIB_COMPAT
#include "zlib.h"
#include "../../zlib.h"
#else
#include "zlib-ng.h"
#include "../../zlib-ng.h"
#endif
#include "zutil.h"
#include "../../zutil.h"
void ZLIB_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size);
void ZLIB_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size);
void ZLIB_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size);
void ZLIB_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size);
void ZLIB_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w);
void Z_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size);
void Z_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size);
void Z_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size);
void Z_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size);
void Z_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w);
#define ZALLOC_STATE dfltcc_alloc_state
+100 -59
View File
@@ -13,27 +13,26 @@
$ make
*/
#include "zbuild.h"
#include "zutil.h"
#include "deflate.h"
#include "../../zbuild.h"
#include "../../zutil.h"
#include "../../deflate.h"
#include "../../trees_emit.h"
#include "dfltcc_deflate.h"
#include "dfltcc_detail.h"
static inline int dfltcc_are_params_ok(int level, uInt window_bits, int strategy, uint16_t level_mask)
{
return (level_mask & ((uint16_t)1 << level)) != 0 &&
(window_bits == HB_BITS) &&
(strategy == Z_FIXED || strategy == Z_DEFAULT_STRATEGY);
}
int ZLIB_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm)
{
static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
int reproducible) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
/* Unsupported compression settings */
if (!dfltcc_are_params_ok(state->level, state->w_bits, state->strategy, dfltcc_state->level_mask))
if ((dfltcc_state->level_mask & (1 << level)) == 0)
return 0;
if (window_bits != HB_BITS)
return 0;
if (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY)
return 0;
if (reproducible)
return 0;
/* Unsupported hardware */
@@ -45,8 +44,13 @@ int ZLIB_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm)
return 1;
}
static inline void dfltcc_gdht(PREFIX3(streamp) strm)
{
/* Report whether DFLTCC can be used for this stream with its current
 * settings. This forwards the stream's own level, window bits, strategy
 * and reproducible flag to dfltcc_can_deflate_with_params() and returns
 * that function's result unchanged. */
int Z_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm) {
    deflate_state *cur = (deflate_state *)strm->state;

    return dfltcc_can_deflate_with_params(strm,
                                          cur->level,
                                          cur->w_bits,
                                          cur->strategy,
                                          cur->reproducible);
}
static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
size_t avail_in = strm->avail_in;
@@ -54,8 +58,7 @@ static inline void dfltcc_gdht(PREFIX3(streamp) strm)
dfltcc(DFLTCC_GDHT, param, NULL, NULL, &strm->next_in, &avail_in, NULL);
}
static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm)
{
static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
size_t avail_in = strm->avail_in;
@@ -72,11 +75,10 @@ static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm)
return cc;
}
static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param)
{
static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
deflate_state *state = (deflate_state *)strm->state;
send_bits(state, bi_reverse(param->eobs >> (15 - param->eobl), param->eobl), param->eobl);
send_bits(state, bi_reverse(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid);
flush_pending(strm);
if (state->pending != 0) {
/* The remaining data is located in pending_out[0:pending]. If someone
@@ -93,8 +95,7 @@ static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0
#endif
}
int ZLIB_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result)
{
int Z_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
struct dfltcc_param_v0 *param = &dfltcc_state->param;
@@ -104,31 +105,38 @@ int ZLIB_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *
int soft_bcc;
int no_flush;
if (!dfltcc_can_deflate(strm))
if (!dfltcc_can_deflate(strm)) {
/* Clear history. */
if (flush == Z_FULL_FLUSH)
param->hl = 0;
return 0;
}
again:
masked_avail_in = 0;
soft_bcc = 0;
no_flush = flush == Z_NO_FLUSH;
/* Trailing empty block. Switch to software, except when Continuation Flag
* is set, which means that DFLTCC has buffered some output in the
* parameter block and needs to be called again in order to flush it.
/* No input data. Return, except when Continuation Flag is set, which means
* that DFLTCC has buffered some output in the parameter block and needs to
* be called again in order to flush it.
*/
if (flush == Z_FINISH && strm->avail_in == 0 && !param->cf) {
if (param->bcf) {
/* A block is still open, and the hardware does not support closing
* blocks without adding data. Thus, close it manually.
*/
if (strm->avail_in == 0 && !param->cf) {
/* A block is still open, and the hardware does not support closing
* blocks without adding data. Thus, close it manually.
*/
if (!no_flush && param->bcf) {
send_eobs(strm, param);
param->bcf = 0;
}
return 0;
}
if (strm->avail_in == 0 && !param->cf) {
*result = need_more;
/* Let one of deflate_* functions write a trailing empty block. */
if (flush == Z_FINISH)
return 0;
/* Clear history. */
if (flush == Z_FULL_FLUSH)
param->hl = 0;
/* Trigger block post-processing if necessary. */
*result = no_flush ? need_more : block_done;
return 1;
}
@@ -154,13 +162,18 @@ again:
send_eobs(strm, param);
param->bcf = 0;
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
if (strm->avail_out == 0) {
*result = need_more;
return 1;
}
}
}
/* No space for compressed data. If we proceed, dfltcc_cmpr() will return
* DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
* set BCF=1, which is wrong. Avoid complications and return early.
*/
if (strm->avail_out == 0) {
*result = need_more;
return 1;
}
/* The caller gave us too much data. Pass only one block worth of
* uncompressed data to DFLTCC and mask the rest, so that on the next
* iteration we start a new block.
@@ -180,7 +193,7 @@ again:
param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
if (!no_flush)
/* We need to close a block. Always do this in software - when there is
* no input data, the hardware will not nohor BCC. */
* no input data, the hardware will not honor BCC. */
soft_bcc = 1;
if (flush == Z_FINISH && !param->bcf)
/* We are about to open a BFINAL block, set Block Header Final bit
@@ -195,8 +208,8 @@ again:
param->sbb = (unsigned int)state->bi_valid;
if (param->sbb > 0)
*strm->next_out = (unsigned char)state->bi_buf;
if (param->hl)
param->nt = 0; /* Honor history */
/* Honor history and check value */
param->nt = 0;
param->cv = state->wrap == 2 ? ZSWAP32(strm->adler) : strm->adler;
/* When opening a block, choose a Huffman-Table Type */
@@ -277,31 +290,60 @@ again:
fly with deflateParams, we need to convert between hardware and software
window formats.
*/
int ZLIB_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy)
{
/* Return 1 if any of the following holds, 0 otherwise:
 *   - the stream has already consumed input (total_in > 0),
 *   - the parameter block's nt field is 0,
 *   - the parameter block's hl field is greater than 0.
 * NOTE(review): nt/hl are presumably the "new task" flag and the history
 * length of the DFLTCC parameter block - confirm against dfltcc_detail.h. */
static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
    deflate_state *state = (deflate_state *)strm->state;
    struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;

    if (strm->total_in > 0)
        return 1;
    if (param->nt == 0)
        return 1;
    return param->hl > 0;
}
int Z_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
struct dfltcc_param_v0 *param = &dfltcc_state->param;
int could_deflate = dfltcc_can_deflate(strm);
int can_deflate = dfltcc_are_params_ok(level, state->w_bits, strategy, dfltcc_state->level_mask);
int can_deflate = dfltcc_can_deflate_with_params(strm, level, state->w_bits, strategy, state->reproducible);
if (can_deflate == could_deflate)
/* We continue to work in the same mode - no changes needed */
return Z_OK;
if (strm->total_in == 0 && param->nt == 1 && param->hl == 0)
if (!dfltcc_was_deflate_used(strm))
/* DFLTCC was not used yet - no changes needed */
return Z_OK;
/* Switching between hardware and software is not implemented */
return Z_STREAM_ERROR;
/* For now, do not convert between window formats - simply get rid of the old data instead */
*flush = Z_FULL_FLUSH;
return Z_OK;
}
/* Tell the caller whether deflation for the given flush mode is complete.
 * Returns 1 when done, 0 when deflate() has to be called again. */
int Z_INTERNAL dfltcc_deflate_done(PREFIX3(streamp) strm, int flush) {
    deflate_state *state = (deflate_state *)strm->state;
    struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
    struct dfltcc_param_v0 *param = &dfltcc_state->param;

    /* When deflate(Z_FULL_FLUSH) is called with small avail_out, it might
     * close the block without resetting the compression state. Detect this
     * situation and return that deflation is not done.
     */
    if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
        return 0;

    /* Software path: nothing can be pending inside DFLTCC. */
    if (!dfltcc_can_deflate(strm))
        return 1;

    /* DFLTCC is in use: deflation is not done while it still buffers data
     * (Continuation Flag is set) or has not written EOBS yet
     * (Block-Continuation Flag is set).
     */
    return !(param->cf || param->bcf);
}
/* Return 1 only when the requested reproducible value differs from the one
 * stored in the deflate state and dfltcc_was_deflate_used() reports 0 for
 * this stream; return 0 in every other case. */
int Z_INTERNAL dfltcc_can_set_reproducible(PREFIX3(streamp) strm, int reproducible) {
    deflate_state *state = (deflate_state *)strm->state;

    /* Same value as already stored: reported as 0. */
    if (reproducible == state->reproducible)
        return 0;

    return !dfltcc_was_deflate_used(strm);
}
/*
Preloading history.
*/
static void append_history(struct dfltcc_param_v0 *param, unsigned char *history, const unsigned char *buf, uInt count)
{
static void append_history(struct dfltcc_param_v0 *param, unsigned char *history, const unsigned char *buf, uInt count) {
size_t offset;
size_t n;
@@ -331,20 +373,19 @@ static void append_history(struct dfltcc_param_v0 *param, unsigned char *history
}
}
int ZLIB_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length)
{
int Z_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
struct dfltcc_param_v0 *param = &dfltcc_state->param;
append_history(param, state->window, dictionary, dict_length);
state->strstart = 1; /* Add FDICT to zlib header */
state->block_start = state->strstart; /* Make deflate_stored happy */
return Z_OK;
}
int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length)
{
int Z_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
deflate_state *state = (deflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
struct dfltcc_param_v0 *param = &dfltcc_state->param;
+13 -7
View File
@@ -3,12 +3,14 @@
#include "dfltcc_common.h"
int ZLIB_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm);
int ZLIB_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result);
int ZLIB_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy);
int ZLIB_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
int Z_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm);
int Z_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result);
int Z_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy, int *flush);
int Z_INTERNAL dfltcc_deflate_done(PREFIX3(streamp) strm, int flush);
int Z_INTERNAL dfltcc_can_set_reproducible(PREFIX3(streamp) strm, int reproducible);
int Z_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
const unsigned char *dictionary, uInt dict_length);
int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
int Z_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
do { \
@@ -25,15 +27,17 @@ int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned
#define DEFLATE_RESET_KEEP_HOOK(strm) \
dfltcc_reset((strm), sizeof(deflate_state))
#define DEFLATE_PARAMS_HOOK(strm, level, strategy) \
#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
do { \
int err; \
\
err = dfltcc_deflate_params((strm), (level), (strategy)); \
err = dfltcc_deflate_params((strm), (level), (strategy), (hook_flush)); \
if (err == Z_STREAM_ERROR) \
return err; \
} while (0)
#define DEFLATE_DONE dfltcc_deflate_done
#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
do { \
if (dfltcc_can_deflate((strm))) \
@@ -47,4 +51,6 @@ int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned
#define DEFLATE_NEED_CHECKSUM(strm) (!dfltcc_can_deflate((strm)))
#define DEFLATE_CAN_SET_REPRODUCIBLE dfltcc_can_set_reproducible
#endif
+14 -16
View File
@@ -46,18 +46,17 @@ typedef enum {
#define DFLTCC_FACILITY 151
static inline dfltcc_cc dfltcc(int fn, void *param,
unsigned char **op1, size_t *len1, const unsigned char **op2, size_t *len2, void *hist)
{
unsigned char **op1, size_t *len1, z_const unsigned char **op2, size_t *len2, void *hist) {
unsigned char *t2 = op1 ? *op1 : NULL;
size_t t3 = len1 ? *len1 : 0;
const unsigned char *t4 = op2 ? *op2 : NULL;
z_const unsigned char *t4 = op2 ? *op2 : NULL;
size_t t5 = len2 ? *len2 : 0;
register int r0 __asm__("r0") = fn;
register void *r1 __asm__("r1") = param;
register unsigned char *r2 __asm__("r2") = t2;
register size_t r3 __asm__("r3") = t3;
register const unsigned char *r4 __asm__("r4") = t4;
register size_t r5 __asm__("r5") = t5;
Z_REGISTER int r0 __asm__("r0") = fn;
Z_REGISTER void *r1 __asm__("r1") = param;
Z_REGISTER unsigned char *r2 __asm__("r2") = t2;
Z_REGISTER size_t r3 __asm__("r3") = t3;
Z_REGISTER z_const unsigned char *r4 __asm__("r4") = t4;
Z_REGISTER size_t r5 __asm__("r5") = t5;
int cc;
__asm__ volatile(
@@ -108,13 +107,11 @@ struct dfltcc_qaf_param {
static_assert(sizeof(struct dfltcc_qaf_param) == 32, sizeof_struct_dfltcc_qaf_param_is_32);
static inline int is_bit_set(const char *bits, int n)
{
/* Test bit n of a bit string in which bit 0 is the most significant bit
 * of bits[0]. Returns a non-zero value if the bit is set, 0 otherwise. */
static inline int is_bit_set(const char *bits, int n) {
    const int byte_index = n / 8;
    const int mask = 1 << (7 - (n % 8));

    return bits[byte_index] & mask;
}
static inline void clear_bit(char *bits, int n)
{
/* Clear bit n of a bit string in which bit 0 is the most significant bit
 * of bits[0]. All other bits of the affected byte are left unchanged. */
static inline void clear_bit(char *bits, int n) {
    char *target = bits + n / 8;
    const int mask = 1 << (7 - (n % 8));

    *target &= ~mask;
}
@@ -175,8 +172,7 @@ struct dfltcc_param_v0 {
static_assert(sizeof(struct dfltcc_param_v0) == 1536, sizeof_struct_dfltcc_param_v0_is_1536);
static inline const char *oesc_msg(char *buf, int oesc)
{
static inline z_const char *oesc_msg(char *buf, int oesc) {
if (oesc == 0x00)
return NULL; /* Successful completion */
else {
@@ -198,4 +194,6 @@ struct dfltcc_state {
char msg[64]; /* Buffer for strm->msg */
};
#define GET_DFLTCC_STATE(state) ((struct dfltcc_state *)((state) + 1))
#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
#define GET_DFLTCC_STATE(state) ((struct dfltcc_state *)((char *)(state) + ALIGN_UP(sizeof(*state), 8)))
+9 -14
View File
@@ -13,15 +13,14 @@
$ make
*/
#include "zbuild.h"
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "../../zbuild.h"
#include "../../zutil.h"
#include "../../inftrees.h"
#include "../../inflate.h"
#include "dfltcc_inflate.h"
#include "dfltcc_detail.h"
int ZLIB_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm)
{
int Z_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
@@ -33,8 +32,7 @@ int ZLIB_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm)
return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0);
}
static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm)
{
static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
size_t avail_in = strm->avail_in;
@@ -49,8 +47,7 @@ static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm)
return cc;
}
dfltcc_inflate_action ZLIB_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret)
{
dfltcc_inflate_action Z_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
struct dfltcc_param_v0 *param = &dfltcc_state->param;
@@ -115,16 +112,14 @@ dfltcc_inflate_action ZLIB_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int fl
DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
}
int ZLIB_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm)
{
int Z_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
return !param->nt;
}
int ZLIB_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm)
{
int Z_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm) {
struct inflate_state *state = (struct inflate_state *)strm->state;
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
+9 -4
View File
@@ -3,15 +3,15 @@
#include "dfltcc_common.h"
int ZLIB_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm);
int Z_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm);
typedef enum {
DFLTCC_INFLATE_CONTINUE,
DFLTCC_INFLATE_BREAK,
DFLTCC_INFLATE_SOFTWARE,
} dfltcc_inflate_action;
dfltcc_inflate_action ZLIB_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret);
int ZLIB_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm);
int ZLIB_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm);
dfltcc_inflate_action Z_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret);
int Z_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm);
int Z_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm);
#define INFLATE_RESET_KEEP_HOOK(strm) \
dfltcc_reset((strm), sizeof(struct inflate_state))
@@ -41,4 +41,9 @@ int ZLIB_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm);
if (dfltcc_was_inflate_used((strm))) return -(1L << 16); \
} while (0)
/* Hook expanded inside a function body: when dfltcc_was_inflate_used() reports
 * nonzero for strm, the enclosing function returns Z_STREAM_ERROR immediately.
 * The do/while(0) wrapper makes the expansion behave as a single statement. */
#define INFLATE_SYNC_POINT_HOOK(strm) \
do { \
if (dfltcc_was_inflate_used((strm))) return Z_STREAM_ERROR; \
} while (0)
#endif
-3
View File
@@ -1,3 +0,0 @@
fill_window_sse.c SSE2 optimized fill_window
deflate_quick.c SSE4 optimized deflate strategy for use as level 1
crc_folding.c SSE4 + PCLMULQDQ optimized CRC folding implementation
+8
View File
@@ -0,0 +1,8 @@
Contents
--------
|Name|Description|
|:-|:-|
|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
|slide_sse.c|SSE2 optimized slide_hash|
+58 -9
View File
@@ -8,7 +8,9 @@ SFLAGS=
INCLUDES=
SUFFIX=
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
SSE4FLAG=-msse4
PCLMULFLAG=-mpclmul
@@ -16,7 +18,18 @@ SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo
# Default goal: build the static (.o) and PIC (.lo) object for every x86
# source in this directory. Each name listed here must have a matching rule
# below; the AVX2 Adler-32 PIC object is adler32_avx.lo.
all: \
	x86.o x86.lo \
	adler32_avx.o adler32_avx.lo \
	adler32_ssse3.o adler32_ssse3.lo \
	chunkset_avx.o chunkset_avx.lo \
	chunkset_sse.o chunkset_sse.lo \
	compare258_avx.o compare258_avx.lo \
	compare258_sse.o compare258_sse.lo \
	insert_string_sse.o insert_string_sse.lo \
	crc_folding.o crc_folding.lo \
	slide_avx.o slide_avx.lo \
	slide_sse.o slide_sse.lo
x86.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
@@ -24,17 +37,29 @@ x86.o:
x86.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
fill_window_sse.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
chunkset_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
fill_window_sse.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
chunkset_avx.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
deflate_quick.o:
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
chunkset_sse.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
deflate_quick.lo:
$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
chunkset_sse.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
compare258_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
compare258_avx.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
compare258_sse.o:
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
compare258_sse.lo:
$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
insert_string_sse.o:
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
@@ -48,6 +73,30 @@ crc_folding.o:
crc_folding.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
# Hash-slide objects. The .o rules compile with CFLAGS, the .lo rules compile
# with SFLAGS and additionally define PIC. slide_avx.c is built with
# $(AVX2FLAG) and slide_sse.c with $(SSE2FLAG).
slide_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
slide_avx.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
slide_sse.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
slide_sse.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
# AVX2 Adler-32. The .lo (shared-library) object defines PIC like every other
# .lo rule in this Makefile.
adler32_avx.o: $(SRCDIR)/adler32_avx.c
	$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c

adler32_avx.lo: $(SRCDIR)/adler32_avx.c
	$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c

# SSSE3 Adler-32. The .lo (shared-library) object defines PIC like every other
# .lo rule in this Makefile.
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
	$(CC) $(CFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c

adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
	$(CC) $(SFLAGS) $(SSSE3FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c

mostlyclean: clean
clean:
rm -f *.o *.lo *~
+117
View File
@@ -0,0 +1,117 @@
/* adler32.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../zutil.h"
#include "../../adler32_p.h"
#include <immintrin.h>
#ifdef X86_AVX2_ADLER32
/* Update a running Adler-32 checksum with len bytes from buf, processing
 * 32 bytes per step with AVX2 intrinsics.
 *
 * adler: previous checksum; the low 16 bits are the byte sum and the high
 *        16 bits are the running sum of sums.
 * buf:   input bytes; NULL (with len != 1) yields the initial value 1.
 * len:   number of bytes in buf.
 *
 * Returns the updated checksum, recombined as adler | (sum2 << 16).
 */
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t sum2;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
/* Seed vectors: every lane is zero except the last, which carries the
 * incoming scalar sums. */
uint32_t ALIGNED_(32) s1[8], s2[8];
memset(s1, 0, sizeof(s1)); s1[7] = adler; // TODO: would a masked load be faster?
memset(s2, 0, sizeof(s2)); s2[7] = sum2;
/* dot1: weight 1 for each of the 32 bytes (plain byte sum). */
char ALIGNED_(32) dot1[32] = \
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
__m256i dot1v = _mm256_load_si256((__m256i*)dot1);
/* dot2: descending weights 32..1, the per-byte contribution to sum2. */
char ALIGNED_(32) dot2[32] = \
{32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
__m256i dot2v = _mm256_load_si256((__m256i*)dot2);
/* dot3: all-ones 16-bit weights, used to pair-sum shorts into 32-bit lanes. */
short ALIGNED_(32) dot3[16] = \
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
__m256i dot3v = _mm256_load_si256((__m256i*)dot3);
// Shift count 5: the previous vs1 is multiplied by 32 (1 << 5), the number of bytes consumed per step.
char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
__m128i shiftv = _mm_load_si128((__m128i*)shift);
while (len >= 32) {
__m256i vs1 = _mm256_load_si256((__m256i*)s1);
__m256i vs2 = _mm256_load_si256((__m256i*)s2);
__m256i vs1_0 = vs1;
/* Process at most NMAX bytes, rounded down to a multiple of 32, before
 * the sums are reduced modulo BASE below. */
int k = (len < NMAX ? (int)len : NMAX);
k -= k % 32;
len -= k;
while (k >= 32) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
*/
__m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
buf += 32;
k -= 32;
__m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 16 shorts.
__m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v); // sum 16 shorts to 8 int32_t;
__m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v);
vs1 = _mm256_add_epi32(vsum1, vs1);
__m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);
vsum2 = _mm256_add_epi32(vsum2, vs2);
vs2 = _mm256_add_epi32(vsum2, vs1_0);
vs1_0 = vs1;
}
// At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
// would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
uint32_t ALIGNED_(32) s1_unpack[8];
uint32_t ALIGNED_(32) s2_unpack[8];
_mm256_store_si256((__m256i*)s1_unpack, vs1);
_mm256_store_si256((__m256i*)s2_unpack, vs2);
/* Fold the eight lanes into one scalar, reducing modulo BASE, and put the
 * result back into the last lane for the next outer iteration. */
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
adler %= BASE;
s1[7] = adler;
sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) +
(s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
sum2 %= BASE;
s2[7] = sum2;
}
/* Scalar tail: the remaining (fewer than 32) bytes, which also covers
 * inputs of 16..31 bytes that never enter the vector loop. */
while (len) {
len--;
adler += *buf++;
sum2 += adler;
}
adler %= BASE;
sum2 %= BASE;
/* return recombined sums */
return adler | (sum2 << 16);
}
#endif
+118
View File
@@ -0,0 +1,118 @@
/* adler32.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../zutil.h"
#include "../../adler32_p.h"
#ifdef X86_SSSE3_ADLER32
#include <immintrin.h>
/* Update a running Adler-32 checksum with len bytes from buf, processing
 * 16 bytes per step with 128-bit SSSE3 intrinsics.
 *
 * adler: previous checksum; the low 16 bits are the byte sum and the high
 *        16 bits are the running sum of sums.
 * buf:   input bytes; NULL (with len != 1) yields the initial value 1.
 * len:   number of bytes in buf.
 *
 * Returns the updated checksum, recombined as adler | (sum2 << 16).
 */
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t sum2;
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
/* Seed vectors: every lane is zero except the last, which carries the
 * incoming scalar sums. */
uint32_t ALIGNED_(16) s1[4], s2[4];
s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
/* dot1: weight 1 per byte; dot2: descending weights 16..1 (contribution to
 * sum2); dot3: all-ones shorts used to pair-sum into 32-bit lanes. */
char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
__m128i dot1v = _mm_load_si128((__m128i*)dot1);
char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
__m128i dot2v = _mm_load_si128((__m128i*)dot2);
short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
__m128i dot3v = _mm_load_si128((__m128i*)dot3);
// Shift count 4: the previous vs1 is multiplied by 16 (1 << 4), the number of bytes consumed per step.
// The count is read from the low bytes of the vector, so it is stored in element 0.
char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
__m128i shiftv = _mm_load_si128((__m128i*)shift);
while (len >= 16) {
__m128i vs1 = _mm_load_si128((__m128i*)s1);
__m128i vs2 = _mm_load_si128((__m128i*)s2);
__m128i vs1_0 = vs1;
/* Process at most NMAX bytes, rounded down to a multiple of 16, before
 * the sums are reduced modulo BASE below. */
int k = (len < NMAX ? (int)len : NMAX);
k -= k % 16;
len -= k;
while (k >= 16) {
/*
vs1 = adler + sum(c[i])
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
NOTE: 256-bit equivalents are:
_mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
_mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t.
We could rewrite the below to use 256-bit instructions instead of 128-bit.
*/
__m128i vbuf = _mm_loadu_si128((__m128i*)buf);
buf += 16;
k -= 16;
__m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
__m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
__m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
vs1 = _mm_add_epi32(vsum1, vs1);
__m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
vsum2 = _mm_add_epi32(vsum2, vs2);
vs2 = _mm_add_epi32(vsum2, vs1_0);
vs1_0 = vs1;
}
// At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
// would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
uint32_t ALIGNED_(16) s1_unpack[4];
uint32_t ALIGNED_(16) s2_unpack[4];
_mm_store_si128((__m128i*)s1_unpack, vs1);
_mm_store_si128((__m128i*)s2_unpack, vs2);
/* Fold the four lanes into one scalar, reducing modulo BASE, and put the
 * result back into the last lane for the next outer iteration. */
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
adler %= BASE;
s1[3] = adler;
sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
sum2 %= BASE;
s2[3] = sum2;
}
/* Scalar tail: the remaining (fewer than 16) bytes. */
while (len) {
len--;
adler += *buf++;
sum2 += adler;
}
adler %= BASE;
sum2 %= BASE;
/* return recombined sums */
return adler | (sum2 << 16);
}
#endif
+50
View File
@@ -0,0 +1,50 @@
/* chunkset_avx.c -- AVX inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil.h"
#ifdef X86_AVX_CHUNKSET
#include <immintrin.h>
typedef __m256i chunk_t;
#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi8(*(int8_t *)from);
}
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi16(*(int16_t *)from);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi32(*(int32_t *)from);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
*chunk = _mm256_set1_epi64x(*(int64_t *)from);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = _mm256_loadu_si256((__m256i *)s);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm256_storeu_si256((__m256i *)out, *chunk);
}
#define CHUNKSIZE chunksize_avx
#define CHUNKCOPY chunkcopy_avx
#define CHUNKCOPY_SAFE chunkcopy_safe_avx
#define CHUNKUNROLL chunkunroll_avx
#define CHUNKMEMSET chunkmemset_avx
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx
#include "chunkset_tpl.h"
#endif
+51
View File
@@ -0,0 +1,51 @@
/* chunkset_sse.c -- SSE inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
#include "zutil.h"
#ifdef X86_SSE2
#include <immintrin.h>
/* A chunk is one 128-bit SSE register (16 bytes). */
typedef __m128i chunk_t;

/* Tell the including template which pattern widths have a dedicated
 * broadcast helper below. */
#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

/* Fill *chunk with the single byte at from, repeated 16 times. */
static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
    *chunk = _mm_set1_epi8(*(int8_t *)from);
}

/* Fill *chunk with the 2-byte pattern at from, repeated 8 times.
 * from may be at any alignment, so the pattern is fetched with memcpy
 * instead of dereferencing a cast pointer. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t pattern;
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi16(pattern);
}

/* Fill *chunk with the 4-byte pattern at from, repeated 4 times
 * (alignment-safe load via memcpy). */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t pattern;
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi32(pattern);
}

/* Fill *chunk with the 8-byte pattern at from, repeated twice
 * (alignment-safe load via memcpy). */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t pattern;
    memcpy(&pattern, from, sizeof(pattern));
    *chunk = _mm_set1_epi64x(pattern);
}

/* Load 16 bytes from s (no alignment requirement) into *chunk. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm_loadu_si128((__m128i *)s);
}

/* Store the 16 bytes of *chunk to out (no alignment requirement). */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, *chunk);
}
#define CHUNKSIZE chunksize_sse2
#define CHUNKCOPY chunkcopy_sse2
#define CHUNKCOPY_SAFE chunkcopy_safe_sse2
#define CHUNKUNROLL chunkunroll_sse2
#define CHUNKMEMSET chunkmemset_sse2
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
#include "chunkset_tpl.h"
#endif
+67
View File
@@ -0,0 +1,67 @@
/* compare258_avx.c -- AVX2 version of compare258
* Copyright Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../zutil.h"
#include "fallback_builtins.h"
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
#include <immintrin.h>
#ifdef _MSC_VER
# include <nmmintrin.h>
#endif
/* UNALIGNED_OK, AVX2 intrinsic comparison.
 *
 * Compares src0 and src1 in 32-byte steps using unaligned loads and returns
 * the index of the first differing byte, or 256 when no difference is found
 * in the first 256 bytes. The loop body is unrolled twice, so each iteration
 * examines 64 bytes.
 */
static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
uint32_t len = 0;
do {
__m256i ymm_src0, ymm_src1, ymm_cmp;
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
/* One mask bit per byte: bit i is set when byte i matched. */
unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
if (mask != 0xFFFFFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
return len + match_byte;
}
src0 += 32, src1 += 32, len += 32;
/* Second half of the unrolled iteration: same test on the next 32 bytes. */
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
if (mask != 0xFFFFFFFF) {
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
return len + match_byte;
}
src0 += 32, src1 += 32, len += 32;
} while (len < 256);
return 256;
}
/* Match length of src0 against src1, capped at 258 bytes.
 * The first two bytes are checked with one 16-bit load per input; only when
 * they agree is the remaining span handed to the 256-byte AVX2 comparison. */
static inline uint32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
    const uint16_t head0 = *(uint16_t *)src0;
    const uint16_t head1 = *(uint16_t *)src1;

    if (head0 == head1)
        return 2 + compare256_unaligned_avx2_static(src0 + 2, src1 + 2);

    /* The pair differs: the match is 1 byte long if the first bytes agree, else 0. */
    return (src0[0] == src1[0]);
}
/* Externally visible (Z_INTERNAL) entry point; forwards to the static inline
 * implementation so the same code can also be inlined elsewhere in this file. */
Z_INTERNAL uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) {
return compare258_unaligned_avx2_static(src0, src1);
}
#define LONGEST_MATCH longest_match_unaligned_avx2
#define COMPARE256 compare256_unaligned_avx2_static
#define COMPARE258 compare258_unaligned_avx2_static
#include "match_tpl.h"
#endif
+74
View File
@@ -0,0 +1,74 @@
/* compare258_sse.c -- SSE4.2 version of compare258
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* Portions are Copyright (C) 2016 12Sided Technology, LLC.
* Author:
* Phil Vachon <pvachon@12sidedtech.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../zutil.h"
#ifdef X86_SSE42_CMP_STR
#include <immintrin.h>
#ifdef _MSC_VER
# include <nmmintrin.h>
#endif
/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
static inline uint32_t compare256_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
uint32_t len = 0;
do {
#define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
__m128i xmm_src0, xmm_src1;
uint32_t ret;
xmm_src0 = _mm_loadu_si128((__m128i *)src0);
xmm_src1 = _mm_loadu_si128((__m128i *)src1);
ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
return len + ret;
}
src0 += 16, src1 += 16, len += 16;
xmm_src0 = _mm_loadu_si128((__m128i *)src0);
xmm_src1 = _mm_loadu_si128((__m128i *)src1);
ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
return len + ret;
}
src0 += 16, src1 += 16, len += 16;
} while (len < 256);
return 256;
}
/* Match length of src0 against src1, capped at 258 bytes.
 * The first two bytes are checked with one 16-bit load per input; only when
 * they agree is the remaining span handed to the 256-byte SSE4.2 comparison. */
static inline uint32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
    const uint16_t head0 = *(uint16_t *)src0;
    const uint16_t head1 = *(uint16_t *)src1;

    if (head0 == head1)
        return 2 + compare256_unaligned_sse4_static(src0 + 2, src1 + 2);

    /* The pair differs: the match is 1 byte long if the first bytes agree, else 0. */
    return (src0[0] == src1[0]);
}
/* Externally visible (Z_INTERNAL) entry point; forwards to the static inline
 * implementation so the same code can also be inlined elsewhere in this file. */
Z_INTERNAL uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) {
return compare258_unaligned_sse4_static(src0, src1);
}
#define LONGEST_MATCH longest_match_unaligned_sse4
#define COMPARE256 compare256_unaligned_sse4_static
#define COMPARE258 compare258_unaligned_sse4_static
#include "match_tpl.h"
#endif
+20 -13
View File
@@ -1,5 +1,5 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
@@ -18,14 +18,14 @@
#ifdef X86_PCLMULQDQ_CRC
#include "zbuild.h"
#include "../../zbuild.h"
#include <inttypes.h>
#include <immintrin.h>
#include <wmmintrin.h>
#include "crc_folding.h"
ZLIB_INTERNAL void crc_fold_init(deflate_state *const s) {
Z_INTERNAL void crc_fold_init(deflate_state *const s) {
/* CRC_SAVE */
_mm_storeu_si128((__m128i *)s->crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
_mm_storeu_si128((__m128i *)s->crc0 + 1, _mm_setzero_si128());
@@ -227,9 +227,10 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
*xmm_crc3 = _mm_castps_si128(ps_res);
}
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
char ALIGNED_(16) partial_buf[16] = { 0 };
/* CRC_LOAD */
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
@@ -241,11 +242,14 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
if (len < 16) {
if (len == 0)
return;
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
memcpy(partial_buf, src, len);
xmm_crc_part = _mm_loadu_si128((const __m128i *)partial_buf);
memcpy(dst, partial_buf, len);
goto partial;
}
algn_diff = (0 - (uintptr_t)src) & 0xF;
algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
if (algn_diff) {
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
@@ -255,6 +259,8 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
len -= algn_diff;
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
} else {
xmm_crc_part = _mm_setzero_si128();
}
while ((len -= 64) >= 0) {
@@ -305,7 +311,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
goto done;
dst += 48;
xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
memcpy(&xmm_crc_part, (__m128i *)src + 3, len);
} else if (len + 32 >= 0) {
len += 32;
@@ -324,7 +330,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
goto done;
dst += 32;
xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
memcpy(&xmm_crc_part, (__m128i *)src + 2, len);
} else if (len + 48 >= 0) {
len += 48;
@@ -340,16 +346,18 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
goto done;
dst += 16;
xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
memcpy(&xmm_crc_part, (__m128i *)src + 1, len);
} else {
len += 64;
if (len == 0)
goto done;
xmm_crc_part = _mm_load_si128((__m128i *)src);
memcpy(&xmm_crc_part, src, len);
}
_mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
memcpy(dst, partial_buf, len);
partial:
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
done:
/* CRC_SAVE */
@@ -377,7 +385,7 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
uint32_t Z_INTERNAL crc_fold_512to32(deflate_state *const s) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
@@ -447,4 +455,3 @@ uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
}
#endif
+4 -4
View File
@@ -10,10 +10,10 @@
#ifndef CRC_FOLDING_H_
#define CRC_FOLDING_H_
#include "deflate.h"
#include "../../deflate.h"
ZLIB_INTERNAL void crc_fold_init(deflate_state *const);
ZLIB_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
Z_INTERNAL void crc_fold_init(deflate_state *const);
Z_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
Z_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
#endif
-25
View File
@@ -1,25 +0,0 @@
#ifndef X86_CTZL_H
#define X86_CTZL_H
#include <intrin.h>
#ifdef X86_CPUID
# include "x86.h"
#endif
#if defined(_MSC_VER) && !defined(__clang__)
/* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0
* Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
*/
static __forceinline unsigned long __builtin_ctzl(unsigned long value)
{
#ifdef X86_CPUID
if (x86_cpu_has_tzcnt)
return _tzcnt_u32(value);
#endif
unsigned long trailing_zero;
_BitScanForward(&trailing_zero, value);
return trailing_zero;
}
#endif
#endif
File diff suppressed because it is too large Load Diff
-175
View File
@@ -1,175 +0,0 @@
/*
* Fill Window with SSE2-optimized hash shifting
*
* Copyright (C) 2013 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef X86_SSE2
#include "zbuild.h"
#include <immintrin.h>
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
register unsigned n;
register Pos *p;
unsigned more; /* Amount of free space at the end of the window. */
unsigned int wsize = s->w_size;
Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
do {
more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
/* Deal with !@#$% 64K limit: */
if (sizeof(int) <= 2) {
if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
more = wsize;
} else if (more == (unsigned)(-1)) {
/* Very unlikely, but possible on 16 bit machine if
* strstart == 0 && lookahead == 1 (input done a byte at time)
*/
more--;
}
}
/* If the window is almost full and there is insufficient lookahead,
* move the upper half to the lower one to make room in the upper half.
*/
if (s->strstart >= wsize+MAX_DIST(s)) {
memcpy(s->window, s->window+wsize, (unsigned)wsize);
s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
s->block_start -= (long) wsize;
/* Slide the hash table (could be avoided with 32 bit values
at the expense of memory usage). We slide even when level == 0
to keep the hash table consistent if we switch back to level > 0
later. (Using level 0 permanently is not an optimal usage of
zlib, so we don't care about this pathological case.)
*/
n = s->hash_size;
p = &s->head[n];
p -= 8;
do {
__m128i value, result;
value = _mm_loadu_si128((__m128i *)p);
result = _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
} while (n > 0);
n = wsize;
p = &s->prev[n];
p -= 8;
do {
__m128i value, result;
value = _mm_loadu_si128((__m128i *)p);
result = _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
} while (n > 0);
more += wsize;
}
if (s->strm->avail_in == 0) break;
/* If there was no sliding:
* strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
* more == window_size - lookahead - strstart
* => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
* => more >= window_size - 2*WSIZE + 2
* In the BIG_MEM or MMAP case (not yet supported),
* window_size == input_size + MIN_LOOKAHEAD &&
* strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
* Otherwise, window_size == 2*WSIZE so more >= 2.
* If there was sliding, more >= WSIZE. So in all cases, more >= 2.
*/
Assert(more >= 2, "more < 2");
n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
s->lookahead += n;
/* Initialize the hash value now that we have some input: */
if (s->lookahead + s->insert >= MIN_MATCH) {
unsigned int str = s->strstart - s->insert;
s->ins_h = s->window[str];
if (str >= 1)
functable.insert_string(s, str + 2 - MIN_MATCH, 1);
#if MIN_MATCH != 3
#error Call insert_string() MIN_MATCH-3 more times
while (s->insert) {
functable.insert_string(s, str, 1);
str++;
s->insert--;
if (s->lookahead + s->insert < MIN_MATCH)
break;
}
#else
unsigned int count;
if (unlikely(s->lookahead == 1)){
count = s->insert - 1;
}else{
count = s->insert;
}
functable.insert_string(s, str, count);
s->insert -= count;
#endif
}
/* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
* but this is not important since only literal bytes will be emitted.
*/
} while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
/* If the WIN_INIT bytes after the end of the current data have never been
* written, then zero those bytes in order to avoid memory check reports of
* the use of uninitialized (or uninitialised as Julian writes) bytes by
* the longest match routines. Update the high water mark for the next
* time through here. WIN_INIT is set to MAX_MATCH since the longest match
* routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
*/
if (s->high_water < s->window_size) {
unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
unsigned long init;
if (s->high_water < curr) {
/* Previous high water mark below current data -- zero WIN_INIT
* bytes or up to end of window, whichever is less.
*/
init = s->window_size - curr;
if (init > WIN_INIT)
init = WIN_INIT;
memset(s->window + curr, 0, (unsigned)init);
s->high_water = curr + init;
} else if (s->high_water < (unsigned long)curr + WIN_INIT) {
/* High water mark at or above current data, but below current data
* plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
* to end of window, whichever is less.
*/
init = (unsigned long)curr + WIN_INIT - s->high_water;
if (init > s->window_size - s->high_water)
init = s->window_size - s->high_water;
memset(s->window + s->high_water, 0, (unsigned)init);
s->high_water += init;
}
}
Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
}
#endif
+35 -45
View File
@@ -5,52 +5,42 @@
*
*/
#include "zbuild.h"
#include "deflate.h"
/* ===========================================================================
* Insert string str in the dictionary and set match_head to the previous head
* of the hash chain (the most recent string with same hash key). Return
* the previous length of the hash chain.
* IN assertion: all calls to to INSERT_STRING are made with consecutive
* input characters and the first MIN_MATCH bytes of str are valid
* (except for the last MIN_MATCH-1 bytes of the input file).
*/
#ifdef X86_SSE4_2_CRC_HASH
ZLIB_INTERNAL Pos insert_string_sse(deflate_state *const s, const Pos str, unsigned int count) {
Pos ret = 0;
unsigned int idx;
unsigned int *ip, val, h;
for (idx = 0; idx < count; idx++) {
ip = (unsigned *)&s->window[str+idx];
memcpy(&val, ip, sizeof(val));
h = 0;
if (s->level >= TRIGGER_LEVEL)
val &= 0xFFFFFF;
#include "../../zbuild.h"
#include <immintrin.h>
#ifdef _MSC_VER
h = _mm_crc32_u32(h, val);
#elif defined(X86_SSE4_2_CRC_INTRIN)
h = __builtin_ia32_crc32si(h, val);
# include <nmmintrin.h>
#endif
#include "../../deflate.h"
#ifdef X86_SSE42_CRC_INTRIN
# ifdef _MSC_VER
# define UPDATE_HASH(s, h, val)\
h = _mm_crc32_u32(h, val)
# else
# define UPDATE_HASH(s, h, val)\
h = __builtin_ia32_crc32si(h, val)
# endif
#else
__asm__ __volatile__ (
"crc32 %1,%0\n\t"
: "+r" (h)
: "r" (val)
);
#endif
Pos head = s->head[h & s->hash_mask];
if (head != str+idx) {
s->prev[(str+idx) & s->w_mask] = head;
s->head[h & s->hash_mask] = str+idx;
if (idx == count-1)
ret = head;
} else if (idx == count - 1) {
ret = str + idx;
}
# ifdef _MSC_VER
# define UPDATE_HASH(s, h, val) {\
__asm mov edx, h\
__asm mov eax, val\
__asm crc32 eax, edx\
__asm mov val, eax\
}
return ret;
}
# else
# define UPDATE_HASH(s, h, val) \
__asm__ __volatile__ (\
"crc32 %1,%0\n\t"\
: "+r" (h)\
: "r" (val)\
);
# endif
#endif
#define INSERT_STRING insert_string_sse4
#define QUICK_INSERT_STRING quick_insert_string_sse4
#ifdef X86_SSE42_CRC_HASH
# include "../../insert_string_tpl.h"
#endif
+47
View File
@@ -0,0 +1,47 @@
/*
* AVX2 optimized hash slide, based on Intel's slide_sse implementation
*
* Copyright (C) 2017 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
* Mika T. Lindqvist <postmaster@raasu.org>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include <immintrin.h>
Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
Pos *p;
unsigned n;
uint16_t wsize = (uint16_t)s->w_size;
const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
n = HASH_SIZE;
p = &s->head[n] - 16;
do {
__m256i value, result;
value = _mm256_loadu_si256((__m256i *)p);
result= _mm256_subs_epu16(value, ymm_wsize);
_mm256_storeu_si256((__m256i *)p, result);
p -= 16;
n -= 16;
} while (n > 0);
n = wsize;
p = &s->prev[n] - 16;
do {
__m256i value, result;
value = _mm256_loadu_si256((__m256i *)p);
result= _mm256_subs_epu16(value, ymm_wsize);
_mm256_storeu_si256((__m256i *)p, result);
p -= 16;
n -= 16;
} while (n > 0);
}
+46
View File
@@ -0,0 +1,46 @@
/*
* SSE optimized hash slide
*
* Copyright (C) 2017 Intel Corporation
* Authors:
* Arjan van de Ven <arjan@linux.intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
#include "../../deflate.h"
#include <immintrin.h>
Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
Pos *p;
unsigned n;
uint16_t wsize = (uint16_t)s->w_size;
const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
n = HASH_SIZE;
p = &s->head[n] - 8;
do {
__m128i value, result;
value = _mm_loadu_si128((__m128i *)p);
result= _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
} while (n > 0);
n = wsize;
p = &s->prev[n] - 8;
do {
__m128i value, result;
value = _mm_loadu_si128((__m128i *)p);
result= _mm_subs_epu16(value, xmm_wsize);
_mm_storeu_si128((__m128i *)p, result);
p -= 8;
n -= 8;
} while (n > 0);
}
+54 -42
View File
@@ -8,61 +8,73 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zutil.h"
#include "../../zutil.h"
#ifdef _MSC_VER
#include <intrin.h>
# include <intrin.h>
#else
// Newer versions of GCC and clang come with cpuid.h
#include <cpuid.h>
# include <cpuid.h>
#endif
ZLIB_INTERNAL int x86_cpu_has_sse2;
ZLIB_INTERNAL int x86_cpu_has_sse42;
ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
ZLIB_INTERNAL int x86_cpu_has_tzcnt;
Z_INTERNAL int x86_cpu_has_avx2;
Z_INTERNAL int x86_cpu_has_sse2;
Z_INTERNAL int x86_cpu_has_ssse3;
Z_INTERNAL int x86_cpu_has_sse42;
Z_INTERNAL int x86_cpu_has_pclmulqdq;
Z_INTERNAL int x86_cpu_has_tzcnt;
/* Query CPUID for leaf `info` and hand the four result registers back through
 * the eax/ebx/ecx/edx out-pointers.
 *
 * With _MSC_VER defined the __cpuid intrinsic fills a four-element array that
 * is then copied out; otherwise the __cpuid from <cpuid.h> is used.
 *
 * NOTE(review): this span appears to carry both the removed and the added
 * lines of a diff, so several statements occur twice (for example the
 * `registers` declaration and the copies into the out-pointers). Confirm
 * which set is live before changing the code.
 */
static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
unsigned int registers[4];
__cpuid(registers, info);
unsigned int registers[4];
/* Same call as above, with the array cast to int* for the intrinsic. */
__cpuid((int *)registers, info);
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
#else
/* Variant using temporaries that are then copied to the out-pointers. */
unsigned int _eax;
unsigned int _ebx;
unsigned int _ecx;
unsigned int _edx;
__cpuid(info, _eax, _ebx, _ecx, _edx);
*eax = _eax;
*ebx = _ebx;
*ecx = _ecx;
*edx = _edx;
/* Variant writing straight through the out-pointers. */
__cpuid(info, *eax, *ebx, *ecx, *edx);
#endif
}
void ZLIB_INTERNAL x86_check_features(void) {
unsigned eax, ebx, ecx, edx;
unsigned maxbasic;
static void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#ifdef _MSC_VER
unsigned int registers[4];
__cpuidex((int *)registers, info, subinfo);
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
x86_cpu_has_sse2 = edx & 0x4000000;
x86_cpu_has_sse42 = ecx & 0x100000;
x86_cpu_has_pclmulqdq = ecx & 0x2;
if (maxbasic >= 7) {
cpuid(7, &eax, &ebx, &ecx, &edx);
// check BMI1 bit
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
x86_cpu_has_tzcnt = ebx & 0x8;
} else {
x86_cpu_has_tzcnt = 0;
}
*eax = registers[0];
*ebx = registers[1];
*ecx = registers[2];
*edx = registers[3];
#else
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#endif
}
/* Read the low 32 bits of extended control register `xcr` via XGETBV.
 *
 * Must only be called once CPUID.1:ECX.OSXSAVE has been seen set; on CPUs or
 * operating systems without XSAVE support the instruction is undefined.
 */
static unsigned x86_xgetbv_low(unsigned xcr) {
#if defined(_MSC_VER) && !defined(__clang__)
    /* _xgetbv is provided through the <intrin.h> include used for MSVC. */
    return (unsigned)_xgetbv(xcr);
#else
    unsigned lo, hi;
    /* Encoded as raw bytes (0F 01 D0 = XGETBV) so assemblers that do not know
     * the mnemonic still accept it. */
    __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (lo), "=d" (hi) : "c" (xcr));
    (void)hi;
    return lo;
#endif
}

/* Probe the CPU with CPUID and fill in the x86_cpu_has_* feature flags.
 *
 * Each flag is set to the masked CPUID bit (non-zero when present, zero when
 * absent). AVX2 is only reported when, in addition to the CPUID feature bit,
 * the operating system has enabled saving of XMM and YMM state (OSXSAVE set
 * and XCR0 bits 1 and 2 set); without that, executing AVX2 instructions
 * faults even though the CPU advertises them.
 */
void Z_INTERNAL x86_check_features(void) {
    unsigned eax, ebx, ecx, edx;
    unsigned maxbasic;
    unsigned osxsave;

    /* Leaf 0: EAX holds the highest supported basic leaf. */
    cpuid(0, &maxbasic, &ebx, &ecx, &edx);

    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);

    x86_cpu_has_sse2 = edx & 0x4000000;
    x86_cpu_has_ssse3 = ecx & 0x200;
    x86_cpu_has_sse42 = ecx & 0x100000;
    x86_cpu_has_pclmulqdq = ecx & 0x2;
    /* OSXSAVE (ECX bit 27): remember it now, leaf 7 below overwrites ecx. */
    osxsave = ecx & 0x8000000;

    if (maxbasic >= 7) {
        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);

        // check BMI1 bit
        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
        x86_cpu_has_tzcnt = ebx & 0x8;

        // check AVX2 bit, but only trust it when the OS preserves YMM state
        // (XCR0 bit 1 = SSE state, bit 2 = AVX state)
        if (osxsave && (x86_xgetbv_low(0) & 0x6) == 0x6)
            x86_cpu_has_avx2 = ebx & 0x20;
        else
            x86_cpu_has_avx2 = 0;
    } else {
        x86_cpu_has_tzcnt = 0;
        x86_cpu_has_avx2 = 0;
    }
}
+7 -5
View File
@@ -1,16 +1,18 @@
/* cpu.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* cpu.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CPU_H_
#define CPU_H_
extern int x86_cpu_has_avx2;
extern int x86_cpu_has_sse2;
extern int x86_cpu_has_ssse3;
extern int x86_cpu_has_sse42;
extern int x86_cpu_has_pclmulqdq;
extern int x86_cpu_has_tzcnt;
void ZLIB_INTERNAL x86_check_features(void);
void Z_INTERNAL x86_check_features(void);
#endif /* CPU_H_ */