mirror of
https://github.com/EQEmu/Server.git
synced 2026-05-31 00:46:46 +00:00
[Library] Update zlibng (#1255)
* Update zlibng * Set cmake path more directly in zlibng to hopefully fix an issue with the build on drone * I'm dumb, missing / in path * Mackal helps with a dumb gitignore issue * Adding all the files, not sure what's ignoring them and im tired of looking * Some tweaks to zlibng build to hopefully get it to build properly. works on msvc now
This commit is contained in:
@@ -6,19 +6,27 @@ CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
ACLEFLAG=
|
||||
NEONFLAG=
|
||||
SUFFIX=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
|
||||
all: \
|
||||
adler32_neon.o adler32_neon.lo \
|
||||
armfeature.o armfeature.lo \
|
||||
chunkset_neon.o chunkset_neon.lo \
|
||||
crc32_acle.o crc32_acle.lo \
|
||||
slide_neon.o slide_neon.lo \
|
||||
insert_string_acle.o insert_string_acle.lo
|
||||
|
||||
adler32_neon.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
|
||||
adler32_neon.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
|
||||
|
||||
armfeature.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/armfeature.c
|
||||
@@ -26,23 +34,29 @@ armfeature.o:
|
||||
armfeature.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/armfeature.c
|
||||
|
||||
chunkset_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
|
||||
|
||||
chunkset_neon.lo:
|
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
|
||||
|
||||
crc32_acle.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
|
||||
crc32_acle.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
|
||||
|
||||
fill_window_arm.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
|
||||
slide_neon.o:
|
||||
$(CC) $(CFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c
|
||||
|
||||
fill_window_arm.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
|
||||
slide_neon.lo:
|
||||
$(CC) $(SFLAGS) $(NEONFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c
|
||||
|
||||
insert_string_acle.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
$(CC) $(CFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
insert_string_acle.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
$(CC) $(SFLAGS) $(ACLEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
|
||||
@@ -2,24 +2,16 @@
|
||||
* Copyright (C) 2017 ARM Holdings Inc.
|
||||
* Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "adler32_neon.h"
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
#include "adler32_p.h"
|
||||
#ifdef ARM_NEON_ADLER32
|
||||
#ifdef _M_ARM64
|
||||
# include <arm64_neon.h>
|
||||
#else
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
#include "../../zutil.h"
|
||||
#include "../../adler32_p.h"
|
||||
|
||||
static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
|
||||
static const uint8_t taps[32] = {
|
||||
@@ -109,7 +101,7 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
|
||||
|
||||
for (i = 0; i < len; i += n) {
|
||||
if ((i + n) > len)
|
||||
n = len - i;
|
||||
n = (int)(len - i);
|
||||
|
||||
if (n < 16)
|
||||
break;
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
/* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||
* Copyright (C) 2017 ARM Holdings Inc.
|
||||
* Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
#ifndef __ADLER32_NEON__
|
||||
#define __ADLER32_NEON__
|
||||
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
// Depending on the compiler flavor, size_t may be defined in one or the other header. See:
|
||||
// http://stackoverflow.com/questions/26410466/gcc-linaro-compiler-throws-error-unknown-type-name-size-t
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
|
||||
#endif
|
||||
#endif
|
||||
@@ -8,6 +8,6 @@
|
||||
extern int arm_cpu_has_neon;
|
||||
extern int arm_cpu_has_crc32;
|
||||
|
||||
void ZLIB_INTERNAL arm_check_features(void);
|
||||
void Z_INTERNAL arm_check_features(void);
|
||||
|
||||
#endif /* ARM_H_ */
|
||||
|
||||
@@ -1,50 +1,69 @@
|
||||
#include "zutil.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
#if defined(__linux__)
|
||||
# include <sys/auxv.h>
|
||||
# include <asm/hwcap.h>
|
||||
# include <sys/auxv.h>
|
||||
# include <asm/hwcap.h>
|
||||
#elif defined(__FreeBSD__) && defined(__aarch64__)
|
||||
# include <machine/armreg.h>
|
||||
# ifndef ID_AA64ISAR0_CRC32_VAL
|
||||
# define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
|
||||
# endif
|
||||
#elif defined(__APPLE__)
|
||||
# include <sys/sysctl.h>
|
||||
#elif defined(_WIN32)
|
||||
# include <winapifamily.h>
|
||||
# include <winapifamily.h>
|
||||
#endif
|
||||
|
||||
static int arm_has_crc32() {
|
||||
#if defined(__linux__) && defined(HWCAP2_CRC32)
|
||||
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
|
||||
return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
|
||||
#elif defined(__FreeBSD__) && defined(__aarch64__)
|
||||
return getenv("QEMU_EMULATING") == NULL
|
||||
&& ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
|
||||
#elif defined(__APPLE__)
|
||||
int hascrc32;
|
||||
size_t size = sizeof(hascrc32);
|
||||
return sysctlbyname("hw.optional.armv8_crc32", &hascrc32, &size, NULL, 0) == 0
|
||||
&& hascrc32 == 1;
|
||||
#elif defined(ARM_NOCHECK_ACLE)
|
||||
return 1;
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* AArch64 has neon. */
|
||||
#if !defined(__aarch64__)
|
||||
static inline int arm_has_neon()
|
||||
{
|
||||
#if defined(__linux__) && defined(HWCAP_NEON)
|
||||
#if !defined(__aarch64__) && !defined(_M_ARM64)
|
||||
static inline int arm_has_neon() {
|
||||
#if defined(__linux__) && defined(HWCAP_NEON)
|
||||
return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0;
|
||||
#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
|
||||
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
|
||||
#elif defined(__APPLE__)
|
||||
int hasneon;
|
||||
size_t size = sizeof(hasneon);
|
||||
return sysctlbyname("hw.optional.neon", &hasneon, &size, NULL, 0) == 0
|
||||
&& hasneon == 1;
|
||||
#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
|
||||
# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
|
||||
return 1; /* Always supported */
|
||||
#endif
|
||||
#endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(ARM_NOCHECK_NEON)
|
||||
#if defined(ARM_NOCHECK_NEON)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
ZLIB_INTERNAL int arm_cpu_has_neon;
|
||||
ZLIB_INTERNAL int arm_cpu_has_crc32;
|
||||
|
||||
void ZLIB_INTERNAL arm_check_features(void) {
|
||||
#if defined(__aarch64__)
|
||||
arm_cpu_has_neon = 1; /* always available */
|
||||
#else
|
||||
arm_cpu_has_neon = arm_has_neon();
|
||||
return 0;
|
||||
#endif
|
||||
arm_cpu_has_crc32 = arm_has_crc32();
|
||||
}
|
||||
#endif
|
||||
|
||||
Z_INTERNAL int arm_cpu_has_neon;
|
||||
Z_INTERNAL int arm_cpu_has_crc32;
|
||||
|
||||
void Z_INTERNAL arm_check_features(void) {
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
arm_cpu_has_neon = 1; /* always available */
|
||||
#else
|
||||
arm_cpu_has_neon = arm_has_neon();
|
||||
#endif
|
||||
arm_cpu_has_crc32 = arm_has_crc32();
|
||||
}
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef ARM_NEON_CHUNKSET
|
||||
#ifdef _M_ARM64
|
||||
# include <arm64_neon.h>
|
||||
#else
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
typedef uint8x16_t chunk_t;
|
||||
|
||||
#define HAVE_CHUNKMEMSET_1
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = vld1q_dup_u8(from);
|
||||
}
|
||||
|
||||
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = vreinterpretq_u8_s16(vdupq_n_s16(*(int16_t *)from));
|
||||
}
|
||||
|
||||
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = vreinterpretq_u8_s32(vdupq_n_s32(*(int32_t *)from));
|
||||
}
|
||||
|
||||
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = vcombine_u8(vld1_u8(from), vld1_u8(from));
|
||||
}
|
||||
|
||||
#define CHUNKSIZE chunksize_neon
|
||||
#define CHUNKCOPY chunkcopy_neon
|
||||
#define CHUNKCOPY_SAFE chunkcopy_safe_neon
|
||||
#define CHUNKUNROLL chunkunroll_neon
|
||||
#define CHUNKMEMSET chunkmemset_neon
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
|
||||
|
||||
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
|
||||
*chunk = vld1q_u8(s);
|
||||
}
|
||||
|
||||
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
|
||||
vst1q_u8(out, *chunk);
|
||||
}
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#endif
|
||||
@@ -5,21 +5,16 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef __ARM_FEATURE_CRC32
|
||||
# include <arm_acle.h>
|
||||
# ifdef ZLIB_COMPAT
|
||||
# include <zconf.h>
|
||||
# else
|
||||
# include <zconf-ng.h>
|
||||
# endif
|
||||
# ifdef __linux__
|
||||
# include <stddef.h>
|
||||
# endif
|
||||
#ifdef ARM_ACLE_CRC_HASH
|
||||
#ifndef _MSC_VER
|
||||
# include <arm_acle.h>
|
||||
#endif
|
||||
#include "../../zutil.h"
|
||||
|
||||
uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
|
||||
register uint32_t c;
|
||||
register const uint16_t *buf2;
|
||||
register const uint32_t *buf4;
|
||||
Z_REGISTER uint32_t c;
|
||||
Z_REGISTER const uint16_t *buf2;
|
||||
Z_REGISTER const uint32_t *buf4;
|
||||
|
||||
c = ~crc;
|
||||
if (len && ((ptrdiff_t)buf & 1)) {
|
||||
@@ -36,7 +31,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
|
||||
buf4 = (const uint32_t *) buf;
|
||||
}
|
||||
|
||||
# if defined(__aarch64__)
|
||||
#if defined(__aarch64__)
|
||||
if ((len > sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
|
||||
c = __crc32w(c, *buf4++);
|
||||
len -= sizeof(uint32_t);
|
||||
@@ -44,7 +39,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
|
||||
|
||||
const uint64_t *buf8 = (const uint64_t *) buf4;
|
||||
|
||||
# ifdef UNROLL_MORE
|
||||
#ifdef UNROLL_MORE
|
||||
while (len >= 4 * sizeof(uint64_t)) {
|
||||
c = __crc32d(c, *buf8++);
|
||||
c = __crc32d(c, *buf8++);
|
||||
@@ -52,7 +47,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
|
||||
c = __crc32d(c, *buf8++);
|
||||
len -= 4 * sizeof(uint64_t);
|
||||
}
|
||||
# endif
|
||||
#endif
|
||||
|
||||
while (len >= sizeof(uint64_t)) {
|
||||
c = __crc32d(c, *buf8++);
|
||||
@@ -74,7 +69,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
|
||||
}
|
||||
|
||||
buf = (const unsigned char *) buf2;
|
||||
# else /* __aarch64__ */
|
||||
#else /* __aarch64__ */
|
||||
|
||||
# ifdef UNROLL_MORE
|
||||
while (len >= 8 * sizeof(uint32_t)) {
|
||||
@@ -103,7 +98,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
|
||||
} else {
|
||||
buf = (const unsigned char *) buf4;
|
||||
}
|
||||
# endif /* __aarch64__ */
|
||||
#endif /* __aarch64__ */
|
||||
|
||||
if (len) {
|
||||
c = __crc32b(c, *buf);
|
||||
@@ -112,4 +107,4 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
|
||||
c = ~c;
|
||||
return c;
|
||||
}
|
||||
#endif /* __ARM_FEATURE_CRC32 */
|
||||
#endif
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
static __forceinline unsigned long __builtin_ctzl(unsigned long value) {
|
||||
return _arm_clz(_arm_rbit(value));
|
||||
return _arm_clz(_arm_rbit(value));
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1,169 +0,0 @@
|
||||
/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions
|
||||
* Copyright (C) 2017 Mika T. Lindqvist
|
||||
*
|
||||
* Authors:
|
||||
* Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* Jun He <jun.he@arm.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
/* @(#) $Id$ */
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
|
||||
extern ZLIB_INTERNAL int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
|
||||
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
|
||||
/* SIMD version of hash_chain rebase */
|
||||
static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
|
||||
register uint16x8_t v, *p;
|
||||
register size_t n;
|
||||
|
||||
size_t size = entries*sizeof(table[0]);
|
||||
Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
|
||||
|
||||
Assert(sizeof(Pos) == 2, "Wrong Pos size");
|
||||
v = vdupq_n_u16(window_size);
|
||||
|
||||
p = (uint16x8_t *)table;
|
||||
n = size / (sizeof(uint16x8_t) * 8);
|
||||
do {
|
||||
p[0] = vqsubq_u16(p[0], v);
|
||||
p[1] = vqsubq_u16(p[1], v);
|
||||
p[2] = vqsubq_u16(p[2], v);
|
||||
p[3] = vqsubq_u16(p[3], v);
|
||||
p[4] = vqsubq_u16(p[4], v);
|
||||
p[5] = vqsubq_u16(p[5], v);
|
||||
p[6] = vqsubq_u16(p[6], v);
|
||||
p[7] = vqsubq_u16(p[7], v);
|
||||
p += 8;
|
||||
} while (--n);
|
||||
}
|
||||
#else
|
||||
/* generic version for hash rebase */
|
||||
static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
|
||||
unsigned int i;
|
||||
for (i = 0; i < entries; i++) {
|
||||
table[i] = (table[i] >= window_size) ? (table[i] - window_size) : NIL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void fill_window_arm(deflate_state *s) {
|
||||
register unsigned n;
|
||||
unsigned long more; /* Amount of free space at the end of the window. */
|
||||
unsigned int wsize = s->w_size;
|
||||
|
||||
Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
|
||||
|
||||
do {
|
||||
more = s->window_size - s->lookahead - s->strstart;
|
||||
|
||||
/* If the window is almost full and there is insufficient lookahead,
|
||||
* move the upper half to the lower one to make room in the upper half.
|
||||
*/
|
||||
if (s->strstart >= wsize+MAX_DIST(s)) {
|
||||
memcpy(s->window, s->window+wsize, wsize);
|
||||
s->match_start -= wsize;
|
||||
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
|
||||
s->block_start -= wsize;
|
||||
|
||||
/* Slide the hash table (could be avoided with 32 bit values
|
||||
at the expense of memory usage). We slide even when level == 0
|
||||
to keep the hash table consistent if we switch back to level > 0
|
||||
later. (Using level 0 permanently is not an optimal usage of
|
||||
zlib, so we don't care about this pathological case.)
|
||||
*/
|
||||
|
||||
slide_hash_chain(s->head, s->hash_size, wsize);
|
||||
slide_hash_chain(s->prev, wsize, wsize);
|
||||
more += wsize;
|
||||
}
|
||||
if (s->strm->avail_in == 0)
|
||||
break;
|
||||
|
||||
/* If there was no sliding:
|
||||
* strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
|
||||
* more == window_size - lookahead - strstart
|
||||
* => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
|
||||
* => more >= window_size - 2*WSIZE + 2
|
||||
* In the BIG_MEM or MMAP case (not yet supported),
|
||||
* window_size == input_size + MIN_LOOKAHEAD &&
|
||||
* strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
|
||||
* Otherwise, window_size == 2*WSIZE so more >= 2.
|
||||
* If there was sliding, more >= WSIZE. So in all cases, more >= 2.
|
||||
*/
|
||||
Assert(more >= 2, "more < 2");
|
||||
|
||||
n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
|
||||
s->lookahead += n;
|
||||
|
||||
/* Initialize the hash value now that we have some input: */
|
||||
if (s->lookahead + s->insert >= MIN_MATCH) {
|
||||
unsigned int str = s->strstart - s->insert;
|
||||
unsigned int insert_cnt = s->insert;
|
||||
unsigned int slen;
|
||||
|
||||
s->ins_h = s->window[str];
|
||||
|
||||
if (unlikely(s->lookahead < MIN_MATCH))
|
||||
insert_cnt += s->lookahead - MIN_MATCH;
|
||||
slen = insert_cnt;
|
||||
if (str >= (MIN_MATCH - 2))
|
||||
{
|
||||
str += 2 - MIN_MATCH;
|
||||
insert_cnt += MIN_MATCH - 2;
|
||||
}
|
||||
if (insert_cnt > 0)
|
||||
{
|
||||
functable.insert_string(s, str, insert_cnt);
|
||||
s->insert -= slen;
|
||||
}
|
||||
}
|
||||
/* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
|
||||
* but this is not important since only literal bytes will be emitted.
|
||||
*/
|
||||
} while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
|
||||
|
||||
/* If the WIN_INIT bytes after the end of the current data have never been
|
||||
* written, then zero those bytes in order to avoid memory check reports of
|
||||
* the use of uninitialized (or uninitialised as Julian writes) bytes by
|
||||
* the longest match routines. Update the high water mark for the next
|
||||
* time through here. WIN_INIT is set to MAX_MATCH since the longest match
|
||||
* routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
|
||||
*/
|
||||
if (s->high_water < s->window_size) {
|
||||
unsigned long curr = s->strstart + (unsigned long)s->lookahead;
|
||||
unsigned long init;
|
||||
|
||||
if (s->high_water < curr) {
|
||||
/* Previous high water mark below current data -- zero WIN_INIT
|
||||
* bytes or up to end of window, whichever is less.
|
||||
*/
|
||||
init = s->window_size - curr;
|
||||
if (init > WIN_INIT)
|
||||
init = WIN_INIT;
|
||||
memset(s->window + curr, 0, init);
|
||||
s->high_water = curr + init;
|
||||
} else if (s->high_water < curr + WIN_INIT) {
|
||||
/* High water mark at or above current data, but below current data
|
||||
* plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
|
||||
* to end of window, whichever is less.
|
||||
*/
|
||||
init = curr + WIN_INIT;
|
||||
if (init > s->window_size)
|
||||
init = s->window_size;
|
||||
init -= s->high_water;
|
||||
memset(s->window + s->high_water, 0, init);
|
||||
s->high_water += init;
|
||||
}
|
||||
}
|
||||
|
||||
Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
|
||||
}
|
||||
@@ -5,49 +5,18 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#if defined(__ARM_FEATURE_CRC32) && defined(ARM_ACLE_CRC_HASH)
|
||||
#include <arm_acle.h>
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
/* ===========================================================================
|
||||
* Insert string str in the dictionary and set match_head to the previous head
|
||||
* of the hash chain (the most recent string with same hash key). Return
|
||||
* the previous length of the hash chain.
|
||||
* IN assertion: all calls to to INSERT_STRING are made with consecutive
|
||||
* input characters and the first MIN_MATCH bytes of str are valid
|
||||
* (except for the last MIN_MATCH-1 bytes of the input file).
|
||||
*/
|
||||
Pos insert_string_acle(deflate_state *const s, const Pos str, unsigned int count) {
|
||||
Pos p, lp, ret;
|
||||
|
||||
if (unlikely(count == 0)) {
|
||||
return s->prev[str & s->w_mask];
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
lp = str + count - 1; /* last position */
|
||||
|
||||
for (p = str; p <= lp; p++) {
|
||||
uint32_t val, h, hm;
|
||||
memcpy(&val, &s->window[p], sizeof(val));
|
||||
|
||||
if (s->level >= TRIGGER_LEVEL)
|
||||
val &= 0xFFFFFF;
|
||||
|
||||
h = __crc32w(0, val);
|
||||
hm = h & s->hash_mask;
|
||||
|
||||
Pos head = s->head[hm];
|
||||
if (head != p) {
|
||||
s->prev[p & s->w_mask] = head;
|
||||
s->head[hm] = p;
|
||||
if (p == lp)
|
||||
ret = head;
|
||||
} else if (p == lp) {
|
||||
ret = p;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
#ifdef ARM_ACLE_CRC_HASH
|
||||
#ifndef _MSC_VER
|
||||
# include <arm_acle.h>
|
||||
#endif
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#define UPDATE_HASH(s, h, val) \
|
||||
h = __crc32w(0, val)
|
||||
|
||||
#define INSERT_STRING insert_string_acle
|
||||
#define QUICK_INSERT_STRING quick_insert_string_acle
|
||||
|
||||
#include "../../insert_string_tpl.h"
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
/* slide_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
|
||||
* Copyright (C) 2017-2020 Mika T. Lindqvist
|
||||
*
|
||||
* Authors:
|
||||
* Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* Jun He <jun.he@arm.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#if defined(ARM_NEON_SLIDEHASH)
|
||||
#ifdef _M_ARM64
|
||||
# include <arm64_neon.h>
|
||||
#else
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
/* SIMD version of hash_chain rebase */
|
||||
static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
|
||||
Z_REGISTER uint16x8_t v, *p;
|
||||
Z_REGISTER size_t n;
|
||||
|
||||
size_t size = entries*sizeof(table[0]);
|
||||
Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
|
||||
|
||||
Assert(sizeof(Pos) == 2, "Wrong Pos size");
|
||||
v = vdupq_n_u16(window_size);
|
||||
|
||||
p = (uint16x8_t *)table;
|
||||
n = size / (sizeof(uint16x8_t) * 8);
|
||||
do {
|
||||
p[0] = vqsubq_u16(p[0], v);
|
||||
p[1] = vqsubq_u16(p[1], v);
|
||||
p[2] = vqsubq_u16(p[2], v);
|
||||
p[3] = vqsubq_u16(p[3], v);
|
||||
p[4] = vqsubq_u16(p[4], v);
|
||||
p[5] = vqsubq_u16(p[5], v);
|
||||
p[6] = vqsubq_u16(p[6], v);
|
||||
p[7] = vqsubq_u16(p[7], v);
|
||||
p += 8;
|
||||
} while (--n);
|
||||
}
|
||||
|
||||
Z_INTERNAL void slide_hash_neon(deflate_state *s) {
|
||||
unsigned int wsize = s->w_size;
|
||||
|
||||
slide_hash_chain(s->head, HASH_SIZE, wsize);
|
||||
slide_hash_chain(s->prev, wsize, wsize);
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,49 @@
|
||||
# Makefile for POWER-specific files
|
||||
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
# For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
||||
CC=
|
||||
CFLAGS=
|
||||
SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
P8FLAGS=-mcpu=power8
|
||||
|
||||
all: power.o \
|
||||
power.lo \
|
||||
adler32_power8.o \
|
||||
adler32_power8.lo \
|
||||
slide_hash_power8.o \
|
||||
slide_hash_power8.lo
|
||||
|
||||
power.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
|
||||
|
||||
power.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
|
||||
|
||||
adler32_power8.o:
|
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
|
||||
|
||||
adler32_power8.lo:
|
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
|
||||
|
||||
slide_hash_power8.o:
|
||||
$(CC) $(CFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
|
||||
|
||||
slide_hash_power8.lo:
|
||||
$(CC) $(SFLAGS) $(P8FLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
rm -rf objs
|
||||
rm -f *.gcda *.gcno *.gcov
|
||||
|
||||
distclean:
|
||||
rm -f Makefile
|
||||
@@ -0,0 +1,154 @@
|
||||
/* Adler32 for POWER8 using VSX instructions.
|
||||
* Copyright (C) 2020 IBM Corporation
|
||||
* Author: Rogerio Alves <rcardoso@linux.ibm.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
* Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
|
||||
* instructions.
|
||||
*
|
||||
* If adler32 do 1 byte at time on the first iteration s1 is s1_0 (_n means
|
||||
* iteration n) is the initial value of adler - at start _0 is 1 unless
|
||||
* adler initial value is different than 1. So s1_1 = s1_0 + c[0] after
|
||||
* the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on.
|
||||
* Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 on
|
||||
* after iteration N.
|
||||
*
|
||||
* Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] +
|
||||
* N-1*c[1] + ... + c[N]
|
||||
*
|
||||
* In a more general way:
|
||||
*
|
||||
* s1_N = s1_0 + sum(i=1 to N)c[i]
|
||||
* s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i]
|
||||
*
|
||||
* Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
|
||||
* can process N-bit at time we can do this at once.
|
||||
*
|
||||
* Since VSX can support 16-bit vector instructions, we can process
|
||||
* 16-bit at time using N = 16 we have:
|
||||
*
|
||||
* s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
|
||||
* s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i]
|
||||
*
|
||||
* After the first iteration we calculate the adler32 checksum for 16 bytes.
|
||||
*
|
||||
* For more background about adler32 please check the RFC:
|
||||
* https://www.ietf.org/rfc/rfc1950.txt
|
||||
*/
|
||||
|
||||
#ifdef POWER8_VSX_ADLER32
|
||||
|
||||
#include <altivec.h>
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
#include "adler32_p.h"
|
||||
|
||||
/* Vector across sum unsigned int (saturate). */
|
||||
inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
|
||||
__b = vec_sld(__a, __a, 8);
|
||||
__b = vec_add(__b, __a);
|
||||
__a = vec_sld(__b, __b, 4);
|
||||
__a = vec_add(__a, __b);
|
||||
|
||||
return __a;
|
||||
}
|
||||
|
||||
uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len) {
|
||||
uint32_t s1 = adler & 0xffff;
|
||||
uint32_t s2 = (adler >> 16) & 0xffff;
|
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */
|
||||
if (UNLIKELY(len == 1))
|
||||
return adler32_len_1(s1, buf, s2);
|
||||
|
||||
/* If buffer is empty or len=0 we need to return adler initial value. */
|
||||
if (UNLIKELY(buf == NULL))
|
||||
return 1;
|
||||
|
||||
/* This is faster than VSX code for len < 64. */
|
||||
if (len < 64)
|
||||
return adler32_len_64(s1, buf, len, s2);
|
||||
|
||||
/* Use POWER VSX instructions for len >= 64. */
|
||||
const vector unsigned int v_zeros = { 0 };
|
||||
const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
|
||||
6, 5, 4, 3, 2, 1};
|
||||
const vector unsigned char vsh = vec_splat_u8(4);
|
||||
const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
|
||||
vector unsigned int vs1 = { 0 };
|
||||
vector unsigned int vs2 = { 0 };
|
||||
vector unsigned int vs1_save = { 0 };
|
||||
vector unsigned int vsum1, vsum2;
|
||||
vector unsigned char vbuf;
|
||||
int n;
|
||||
|
||||
vs1[0] = s1;
|
||||
vs2[0] = s2;
|
||||
|
||||
/* Do length bigger than NMAX in blocks of NMAX size. */
|
||||
while (len >= NMAX) {
|
||||
len -= NMAX;
|
||||
n = NMAX / 16;
|
||||
do {
|
||||
vbuf = vec_xl(0, (unsigned char *) buf);
|
||||
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
|
||||
/* sum(i=1 to 16) buf[i]*(16-i+1). */
|
||||
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
|
||||
/* Save vs1. */
|
||||
vs1_save = vec_add(vs1_save, vs1);
|
||||
/* Accumulate the sums. */
|
||||
vs1 = vec_add(vsum1, vs1);
|
||||
vs2 = vec_add(vsum2, vs2);
|
||||
|
||||
buf += 16;
|
||||
} while (--n);
|
||||
/* Once each block of NMAX size. */
|
||||
vs1 = vec_sumsu(vs1, vsum1);
|
||||
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
|
||||
vs2 = vec_add(vs1_save, vs2);
|
||||
vs2 = vec_sumsu(vs2, vsum2);
|
||||
|
||||
/* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
|
||||
vs1[0] = vs1[0] % BASE;
|
||||
/* vs2[0] = s2_i + 16*s1_save +
|
||||
sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
|
||||
vs2[0] = vs2[0] % BASE;
|
||||
|
||||
vs1 = vec_and(vs1, vmask);
|
||||
vs2 = vec_and(vs2, vmask);
|
||||
vs1_save = v_zeros;
|
||||
}
|
||||
|
||||
/* len is less than NMAX one modulo is needed. */
|
||||
if (len >= 16) {
|
||||
while (len >= 16) {
|
||||
len -= 16;
|
||||
|
||||
vbuf = vec_xl(0, (unsigned char *) buf);
|
||||
|
||||
vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
|
||||
/* sum(i=1 to 16) buf[i]*(16-i+1). */
|
||||
vsum2 = vec_msum(vbuf, v_mul, v_zeros);
|
||||
/* Save vs1. */
|
||||
vs1_save = vec_add(vs1_save, vs1);
|
||||
/* Accumulate the sums. */
|
||||
vs1 = vec_add(vsum1, vs1);
|
||||
vs2 = vec_add(vsum2, vs2);
|
||||
|
||||
buf += 16;
|
||||
}
|
||||
/* Since the size will be always less than NMAX we do this once. */
|
||||
vs1 = vec_sumsu(vs1, vsum1);
|
||||
vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
|
||||
vs2 = vec_add(vs1_save, vs2);
|
||||
vs2 = vec_sumsu(vs2, vsum2);
|
||||
}
|
||||
/* Copy result back to s1, s2 (mod 65521). */
|
||||
s1 = vs1[0] % BASE;
|
||||
s2 = vs2[0] % BASE;
|
||||
|
||||
/* Process tail (len < 16).and return */
|
||||
return adler32_len_16(s1, buf, len, s2);
|
||||
}
|
||||
|
||||
#endif /* POWER8_VSX_ADLER32 */
|
||||
@@ -0,0 +1,19 @@
|
||||
/* POWER feature check
|
||||
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include <sys/auxv.h>
|
||||
#include "../../zutil.h"
|
||||
|
||||
Z_INTERNAL int power_cpu_has_arch_2_07;
|
||||
|
||||
void Z_INTERNAL power_check_features(void) {
|
||||
unsigned long hwcap2;
|
||||
hwcap2 = getauxval(AT_HWCAP2);
|
||||
|
||||
#ifdef POWER8
|
||||
if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
|
||||
power_cpu_has_arch_2_07 = 1;
|
||||
#endif
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
/* power.h -- check for POWER CPU features
|
||||
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef POWER_H_
|
||||
#define POWER_H_
|
||||
|
||||
extern int power_cpu_has_arch_2_07;
|
||||
|
||||
void Z_INTERNAL power_check_features(void);
|
||||
|
||||
#endif /* POWER_H_ */
|
||||
@@ -0,0 +1,60 @@
|
||||
/* Optimized slide_hash for POWER processors
|
||||
* Copyright (C) 2019-2020 IBM Corporation
|
||||
* Author: Matheus Castanho <msc@linux.ibm.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifdef POWER8_VSX_SLIDEHASH
|
||||
|
||||
#include <altivec.h>
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
static inline void slide_hash_power8_loop(deflate_state *s, unsigned n_elems, Pos *table_end) {
|
||||
vector unsigned short vw, vm, *vp;
|
||||
unsigned chunks;
|
||||
|
||||
/* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
|
||||
* so instead of processing each of the n_elems in the hash table
|
||||
* individually, we can do it in chunks of 8 with vector instructions.
|
||||
*
|
||||
* This function is only called from slide_hash_power8(), and both calls
|
||||
* pass n_elems as a power of 2 higher than 2^7, as defined by
|
||||
* deflateInit2_(), so n_elems will always be a multiple of 8. */
|
||||
chunks = n_elems >> 3;
|
||||
Assert(n_elems % 8 == 0, "Weird hash table size!");
|
||||
|
||||
/* This type casting is safe since s->w_size is always <= 64KB
|
||||
* as defined by deflateInit2_() and Posf == unsigned short */
|
||||
vw[0] = (Pos) s->w_size;
|
||||
vw = vec_splat(vw,0);
|
||||
|
||||
vp = (vector unsigned short *) table_end;
|
||||
|
||||
do {
|
||||
/* Processing 8 elements at a time */
|
||||
vp--;
|
||||
vm = *vp;
|
||||
|
||||
/* This is equivalent to: m >= w_size ? m - w_size : 0
|
||||
* Since we are using a saturated unsigned subtraction, any
|
||||
* values that are > w_size will be set to 0, while the others
|
||||
* will be subtracted by w_size. */
|
||||
*vp = vec_subs(vm,vw);
|
||||
} while (--chunks);
|
||||
}
|
||||
|
||||
void Z_INTERNAL slide_hash_power8(deflate_state *s) {
|
||||
unsigned int n;
|
||||
Pos *p;
|
||||
|
||||
n = HASH_SIZE;
|
||||
p = &s->head[n];
|
||||
slide_hash_power8_loop(s,n,p);
|
||||
|
||||
n = s->w_size;
|
||||
p = &s->prev[n];
|
||||
slide_hash_power8_loop(s,n,p);
|
||||
}
|
||||
|
||||
#endif /* POWER8_VSX_SLIDEHASH */
|
||||
+187
-40
@@ -1,6 +1,7 @@
|
||||
This directory contains IBM Z DEFLATE CONVERSION CALL support for
|
||||
zlib-ng. In order to enable it, the following build commands should be
|
||||
used:
|
||||
# Introduction
|
||||
|
||||
This directory contains SystemZ deflate hardware acceleration support.
|
||||
It can be enabled using the following build commands:
|
||||
|
||||
$ ./configure --with-dfltcc-deflate --with-dfltcc-inflate
|
||||
$ make
|
||||
@@ -10,60 +11,206 @@ or
|
||||
$ cmake -DWITH_DFLTCC_DEFLATE=1 -DWITH_DFLTCC_INFLATE=1 .
|
||||
$ make
|
||||
|
||||
When built like this, zlib-ng would compress in hardware on level 1,
|
||||
and in software on all other levels. Decompression will always happen
|
||||
in hardware. In order to enable DFLTCC compression for levels 1-6 (i.e.
|
||||
to make it used by default) one could add -DDFLTCC_LEVEL_MASK=0x7e to
|
||||
CFLAGS when building zlib-ng.
|
||||
When built like this, zlib-ng would compress using hardware on level 1,
|
||||
and using software on all other levels. Decompression will always happen
|
||||
in hardware. In order to enable hardware compression for levels 1-6
|
||||
(i.e. to make it used by default) one could add
|
||||
`-DDFLTCC_LEVEL_MASK=0x7e` to CFLAGS when building zlib-ng.
|
||||
|
||||
Two DFLTCC compression calls produce the same results only when they
|
||||
both are made on machines of the same generation, and when the
|
||||
respective buffers have the same offset relative to the start of the
|
||||
page. Therefore care should be taken when using hardware compression
|
||||
when reproducible results are desired.
|
||||
SystemZ deflate hardware acceleration is available on [IBM z15](
|
||||
https://www.ibm.com/products/z15) and newer machines under the name [
|
||||
"Integrated Accelerator for zEnterprise Data Compression"](
|
||||
https://www.ibm.com/support/z-content-solutions/compression/). The
|
||||
programming interface to it is a machine instruction called DEFLATE
|
||||
CONVERSION CALL (DFLTCC). It is documented in Chapter 26 of [Principles
|
||||
of Operation](http://publibfp.dhe.ibm.com/epubs/pdf/a227832c.pdf). Both
|
||||
the code and the rest of this document refer to this feature simply as
|
||||
"DFLTCC".
|
||||
|
||||
# Performance
|
||||
|
||||
Performance figures are published [here](
|
||||
https://github.com/iii-i/zlib-ng/wiki/Performance-with-dfltcc-patch-applied-and-dfltcc-support-built-on-dfltcc-enabled-machine
|
||||
). The compression speed-up can be as high as 110x and the decompression
|
||||
speed-up can be as high as 15x.
|
||||
|
||||
# Limitations
|
||||
|
||||
Two DFLTCC compression calls with identical inputs are not guaranteed to
|
||||
produce identical outputs. Therefore care should be taken when using
|
||||
hardware compression when reproducible results are desired. In
|
||||
particular, zlib-ng-specific `zng_deflateSetParams` call allows setting
|
||||
`Z_DEFLATE_REPRODUCIBLE` parameter, which disables DFLTCC support for a
|
||||
particular stream.
|
||||
|
||||
DFLTCC does not support every single zlib-ng feature, in particular:
|
||||
|
||||
* inflate(Z_BLOCK) and inflate(Z_TREES)
|
||||
* inflateMark()
|
||||
* inflatePrime()
|
||||
* deflateParams() after the first deflate() call
|
||||
* `inflate(Z_BLOCK)` and `inflate(Z_TREES)`
|
||||
* `inflateMark()`
|
||||
* `inflatePrime()`
|
||||
* `inflateSyncPoint()`
|
||||
|
||||
When used, these functions will either switch to software, or, in case
|
||||
this is not possible, gracefully fail.
|
||||
|
||||
All SystemZ-specific code lives in a separate file and is integrated
|
||||
with the rest of zlib-ng using hook macros, which are explained below.
|
||||
# Code structure
|
||||
|
||||
All SystemZ-specific code lives in `arch/s390` directory and is
|
||||
integrated with the rest of zlib-ng using hook macros.
|
||||
|
||||
## Hook macros
|
||||
|
||||
DFLTCC takes as arguments a parameter block, an input buffer, an output
|
||||
buffer and a window. ZALLOC_STATE, ZFREE_STATE, ZCOPY_STATE,
|
||||
ZALLOC_WINDOW and TRY_FREE_WINDOW macros encapsulate allocation details
|
||||
for the parameter block (which is allocated alongside zlib-ng state)
|
||||
and the window (which must be page-aligned).
|
||||
buffer and a window. `ZALLOC_STATE()`, `ZFREE_STATE()`, `ZCOPY_STATE()`,
|
||||
`ZALLOC_WINDOW()` and `TRY_FREE_WINDOW()` macros encapsulate allocation
|
||||
details for the parameter block (which is allocated alongside zlib-ng
|
||||
state) and the window (which must be page-aligned).
|
||||
|
||||
While for inflate software and hardware window formats match, this is
|
||||
not the case for deflate. Therefore, deflateSetDictionary and
|
||||
deflateGetDictionary need special handling, which is triggered using
|
||||
the DEFLATE_SET_DICTIONARY_HOOK and DEFLATE_GET_DICTIONARY_HOOK macros.
|
||||
While inflate software and hardware window formats match, this is not
|
||||
the case for deflate. Therefore, `deflateSetDictionary()` and
|
||||
`deflateGetDictionary()` need special handling, which is triggered using
|
||||
`DEFLATE_SET_DICTIONARY_HOOK()` and `DEFLATE_GET_DICTIONARY_HOOK()`
|
||||
macros.
|
||||
|
||||
deflateResetKeep() and inflateResetKeep() update the DFLTCC parameter
|
||||
block using DEFLATE_RESET_KEEP_HOOK and INFLATE_RESET_KEEP_HOOK macros.
|
||||
`deflateResetKeep()` and `inflateResetKeep()` update the DFLTCC
|
||||
parameter block using `DEFLATE_RESET_KEEP_HOOK()` and
|
||||
`INFLATE_RESET_KEEP_HOOK()` macros.
|
||||
|
||||
DEFLATE_PARAMS_HOOK, INFLATE_PRIME_HOOK and INFLATE_MARK_HOOK macros
|
||||
make the unsupported deflateParams(), inflatePrime() and inflateMark()
|
||||
calls fail gracefully.
|
||||
`INFLATE_PRIME_HOOK()`, `INFLATE_MARK_HOOK()` and
|
||||
`INFLATE_SYNC_POINT_HOOK()` macros make the respective unsupported
|
||||
calls gracefully fail.
|
||||
|
||||
`DEFLATE_PARAMS_HOOK()` implements switching between hardware and
|
||||
software compression mid-stream using `deflateParams()`. Switching
|
||||
normally entails flushing the current block, which might not be possible
|
||||
in low memory situations. `deflateParams()` uses `DEFLATE_DONE()` hook
|
||||
in order to detect and gracefully handle such situations.
|
||||
|
||||
The algorithm implemented in hardware has different compression ratio
|
||||
than the one implemented in software. DEFLATE_BOUND_ADJUST_COMPLEN and
|
||||
DEFLATE_NEED_CONSERVATIVE_BOUND macros make deflateBound() return the
|
||||
correct results for the hardware implementation.
|
||||
than the one implemented in software. `DEFLATE_BOUND_ADJUST_COMPLEN()`
|
||||
and `DEFLATE_NEED_CONSERVATIVE_BOUND()` macros make `deflateBound()`
|
||||
return the correct results for the hardware implementation.
|
||||
|
||||
Actual compression and decompression are handled by DEFLATE_HOOK and
|
||||
INFLATE_TYPEDO_HOOK macros. Since inflation with DFLTCC manages the
|
||||
window on its own, calling updatewindow() is suppressed using
|
||||
INFLATE_NEED_UPDATEWINDOW() macro.
|
||||
Actual compression and decompression are handled by `DEFLATE_HOOK()` and
|
||||
`INFLATE_TYPEDO_HOOK()` macros. Since inflation with DFLTCC manages the
|
||||
window on its own, calling `updatewindow()` is suppressed using
|
||||
`INFLATE_NEED_UPDATEWINDOW()` macro.
|
||||
|
||||
In addition to compression, DFLTCC computes CRC-32 and Adler-32
|
||||
checksums, therefore, whenever it's used, software checksumming is
|
||||
suppressed using DEFLATE_NEED_CHECKSUM and INFLATE_NEED_CHECKSUM
|
||||
suppressed using `DEFLATE_NEED_CHECKSUM()` and `INFLATE_NEED_CHECKSUM()`
|
||||
macros.
|
||||
|
||||
While software always produces reproducible compression results, this
|
||||
is not the case for DFLTCC. Therefore, zlib-ng users are given the
|
||||
ability to specify whether or not reproducible compression results
|
||||
are required. While it is always possible to specify this setting
|
||||
before the compression begins, it is not always possible to do so in
|
||||
the middle of a deflate stream - the exact conditions for that are
|
||||
determined by `DEFLATE_CAN_SET_REPRODUCIBLE()` macro.
|
||||
|
||||
## SystemZ-specific code
|
||||
|
||||
When zlib-ng is built with DFLTCC, the hooks described above are
|
||||
converted to calls to functions, which are implemented in
|
||||
`arch/s390/dfltcc_*` files. The functions can be grouped in three broad
|
||||
categories:
|
||||
|
||||
* Base DFLTCC support, e.g. wrapping the machine instruction -
|
||||
`dfltcc()` and allocating aligned memory - `dfltcc_alloc_state()`.
|
||||
* Translating between software and hardware data formats, e.g.
|
||||
`dfltcc_deflate_set_dictionary()`.
|
||||
* Translating between software and hardware state machines, e.g.
|
||||
`dfltcc_deflate()` and `dfltcc_inflate()`.
|
||||
|
||||
The functions from the first two categories are fairly simple, however,
|
||||
various quirks in both software and hardware state machines make the
|
||||
functions from the third category quite complicated.
|
||||
|
||||
### `dfltcc_deflate()` function
|
||||
|
||||
This function is called by `deflate()` and has the following
|
||||
responsibilities:
|
||||
|
||||
* Checking whether DFLTCC can be used with the current stream. If this
|
||||
is not the case, then it returns `0`, making `deflate()` use some
|
||||
other function in order to compress in software. Otherwise it returns
|
||||
`1`.
|
||||
* Block management and Huffman table generation. DFLTCC ends blocks only
|
||||
when explicitly instructed to do so by the software. Furthermore,
|
||||
whether to use fixed or dynamic Huffman tables must also be determined
|
||||
by the software. Since looking at data in order to gather statistics
|
||||
would negate performance benefits, the following approach is used: the
|
||||
first `DFLTCC_FIRST_FHT_BLOCK_SIZE` bytes are placed into a fixed
|
||||
block, and every next `DFLTCC_BLOCK_SIZE` bytes are placed into
|
||||
dynamic blocks.
|
||||
* Writing EOBS. Block Closing Control bit in the parameter block
|
||||
instructs DFLTCC to write EOBS, however, certain conditions need to be
|
||||
met: input data length must be non-zero or Continuation Flag must be
|
||||
set. To put this in simpler terms, DFLTCC will silently refuse to
|
||||
write EOBS if this is the only thing that it is asked to do. Since the
|
||||
code has to be able to emit EOBS in software anyway, in order to avoid
|
||||
tricky corner cases Block Closing Control is never used. Whether to
|
||||
write EOBS is instead controlled by `soft_bcc` variable.
|
||||
* Triggering block post-processing. Depending on flush mode, `deflate()`
|
||||
must perform various additional actions when a block or a stream ends.
|
||||
`dfltcc_deflate()` informs `deflate()` about this using
|
||||
`block_state *result` parameter.
|
||||
* Converting software state fields into hardware parameter block fields,
|
||||
and vice versa. For example, `wrap` and Check Value Type or `bi_valid`
|
||||
and Sub-Byte Boundary. Certain fields cannot be translated and must
|
||||
persist untouched in the parameter block between calls, for example,
|
||||
Continuation Flag or Continuation State Buffer.
|
||||
* Handling flush modes and low-memory situations. These aspects are
|
||||
quite intertwined and pervasive. The general idea here is that the
|
||||
code must not do anything in software - whether explicitly by e.g.
|
||||
calling `send_eobs()`, or implicitly - by returning to `deflate()`
|
||||
with certain return and `*result` values, when Continuation Flag is
|
||||
set.
|
||||
* Ending streams. When a new block is started and flush mode is
|
||||
`Z_FINISH`, Block Header Final parameter block bit is used to mark
|
||||
this block as final. However, sometimes an empty final block is
|
||||
needed, and, unfortunately, just like with EOBS, DFLTCC will silently
|
||||
refuse to do this. The general idea of DFLTCC implementation is to
|
||||
rely as much as possible on the existing code. Here in order to do
|
||||
this, the code pretends that it does not support DFLTCC, which makes
|
||||
`deflate()` call a software compression function, which writes an
|
||||
empty final block. Whether this is required is controlled by
|
||||
`need_empty_block` variable.
|
||||
* Error handling. This is simply converting
|
||||
Operation-Ending-Supplemental Code to string. Errors can only happen
|
||||
due to things like memory corruption, and therefore they don't affect
|
||||
the `deflate()` return code.
|
||||
|
||||
### `dfltcc_inflate()` function
|
||||
|
||||
This function is called by `inflate()` from the `TYPEDO` state (that is,
|
||||
when all the metadata is parsed and the stream is positioned at the type
|
||||
bits of deflate block header) and it's responsible for the following:
|
||||
|
||||
* Falling back to software when flush mode is `Z_BLOCK` or `Z_TREES`.
|
||||
Unfortunately, there is no way to ask DFLTCC to stop decompressing on
|
||||
block or tree boundary.
|
||||
* `inflate()` decompression loop management. This is controlled using
|
||||
the return value, which can be either `DFLTCC_INFLATE_BREAK` or
|
||||
`DFLTCC_INFLATE_CONTINUE`.
|
||||
* Converting software state fields into hardware parameter block fields,
|
||||
and vice versa. For example, `whave` and History Length or `wnext` and
|
||||
History Offset.
|
||||
* Ending streams. This instructs `inflate()` to return `Z_STREAM_END`
|
||||
and is controlled by `last` state field.
|
||||
* Error handling. Like deflate, error handling comprises
|
||||
Operation-Ending-Supplemental Code to string conversion. Unlike
|
||||
deflate, errors may happen due to bad inputs, therefore they are
|
||||
propagated to `inflate()` by setting `mode` field to `MEM` or `BAD`.
|
||||
|
||||
# Testing
|
||||
|
||||
Given complexity of DFLTCC machine instruction, it is not clear whether
|
||||
QEMU TCG will ever support it. At the time of writing, one has to have
|
||||
access to an IBM z15+ VM or LPAR in order to test DFLTCC support. Since
|
||||
DFLTCC is a non-privileged instruction, neither special VM/LPAR
|
||||
configuration nor root are required.
|
||||
|
||||
Still, zlib-ng CI has a few QEMU TCG-based configurations that check
|
||||
whether fallback to software is working.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/* dfltcc_deflate.c - IBM Z DEFLATE CONVERSION CALL general support. */
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "dfltcc_common.h"
|
||||
#include "dfltcc_detail.h"
|
||||
|
||||
@@ -12,20 +12,31 @@
|
||||
`posix_memalign' is not an option. Thus, we overallocate and take the
|
||||
aligned portion of the buffer.
|
||||
*/
|
||||
static inline int is_dfltcc_enabled(void)
|
||||
{
|
||||
static inline int is_dfltcc_enabled(void) {
|
||||
uint64_t facilities[(DFLTCC_FACILITY / 64) + 1];
|
||||
register uint8_t r0 __asm__("r0");
|
||||
Z_REGISTER uint8_t r0 __asm__("r0");
|
||||
|
||||
memset(facilities, 0, sizeof(facilities));
|
||||
r0 = sizeof(facilities) / sizeof(facilities[0]) - 1;
|
||||
__asm__ volatile("stfle %[facilities]\n" : [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
|
||||
/* STFLE is supported since z9-109 and only in z/Architecture mode. When
|
||||
* compiling with -m31, gcc defaults to ESA mode, however, since the kernel
|
||||
* is 64-bit, it's always z/Architecture mode at runtime.
|
||||
*/
|
||||
__asm__ volatile(
|
||||
#ifndef __clang__
|
||||
".machinemode push\n"
|
||||
".machinemode zarch\n"
|
||||
#endif
|
||||
"stfle %[facilities]\n"
|
||||
#ifndef __clang__
|
||||
".machinemode pop\n"
|
||||
#endif
|
||||
: [facilities] "=Q" (facilities), [r0] "+r" (r0) :: "cc");
|
||||
return is_bit_set((const char *)facilities, DFLTCC_FACILITY);
|
||||
}
|
||||
|
||||
void ZLIB_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size)
|
||||
{
|
||||
struct dfltcc_state *dfltcc_state = (struct dfltcc_state *)((char *)strm->state + size);
|
||||
void Z_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size) {
|
||||
struct dfltcc_state *dfltcc_state = (struct dfltcc_state *)((char *)strm->state + ALIGN_UP(size, 8));
|
||||
struct dfltcc_qaf_param *param = (struct dfltcc_qaf_param *)&dfltcc_state->param;
|
||||
|
||||
/* Initialize available functions */
|
||||
@@ -47,24 +58,17 @@ void ZLIB_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size)
|
||||
dfltcc_state->param.ribm = DFLTCC_RIBM;
|
||||
}
|
||||
|
||||
void ZLIB_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size)
|
||||
{
|
||||
Assert((items * size) % 8 == 0,
|
||||
"The size of zlib-ng state must be a multiple of 8");
|
||||
return ZALLOC(strm, items * size + sizeof(struct dfltcc_state), sizeof(unsigned char));
|
||||
void Z_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size) {
|
||||
return ZALLOC(strm, ALIGN_UP(items * size, 8) + sizeof(struct dfltcc_state), sizeof(unsigned char));
|
||||
}
|
||||
|
||||
void ZLIB_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size)
|
||||
{
|
||||
memcpy(dst, src, size + sizeof(struct dfltcc_state));
|
||||
void Z_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size) {
|
||||
memcpy(dst, src, ALIGN_UP(size, 8) + sizeof(struct dfltcc_state));
|
||||
}
|
||||
|
||||
static const int PAGE_ALIGN = 0x1000;
|
||||
|
||||
#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
|
||||
|
||||
void ZLIB_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size)
|
||||
{
|
||||
void Z_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size) {
|
||||
void *p;
|
||||
void *w;
|
||||
|
||||
@@ -79,8 +83,7 @@ void ZLIB_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt
|
||||
return w;
|
||||
}
|
||||
|
||||
void ZLIB_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w)
|
||||
{
|
||||
void Z_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w) {
|
||||
if (w)
|
||||
ZFREE(strm, *(void **)((unsigned char *)w - sizeof(void *)));
|
||||
}
|
||||
|
||||
@@ -2,17 +2,17 @@
|
||||
#define DFLTCC_COMMON_H
|
||||
|
||||
#ifdef ZLIB_COMPAT
|
||||
#include "zlib.h"
|
||||
#include "../../zlib.h"
|
||||
#else
|
||||
#include "zlib-ng.h"
|
||||
#include "../../zlib-ng.h"
|
||||
#endif
|
||||
#include "zutil.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
void ZLIB_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size);
|
||||
void ZLIB_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size);
|
||||
void ZLIB_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size);
|
||||
void ZLIB_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size);
|
||||
void ZLIB_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w);
|
||||
void Z_INTERNAL *dfltcc_alloc_state(PREFIX3(streamp) strm, uInt items, uInt size);
|
||||
void Z_INTERNAL dfltcc_copy_state(void *dst, const void *src, uInt size);
|
||||
void Z_INTERNAL dfltcc_reset(PREFIX3(streamp) strm, uInt size);
|
||||
void Z_INTERNAL *dfltcc_alloc_window(PREFIX3(streamp) strm, uInt items, uInt size);
|
||||
void Z_INTERNAL dfltcc_free_window(PREFIX3(streamp) strm, void *w);
|
||||
|
||||
#define ZALLOC_STATE dfltcc_alloc_state
|
||||
|
||||
|
||||
@@ -13,27 +13,26 @@
|
||||
$ make
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
#include "deflate.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zutil.h"
|
||||
#include "../../deflate.h"
|
||||
#include "../../trees_emit.h"
|
||||
#include "dfltcc_deflate.h"
|
||||
#include "dfltcc_detail.h"
|
||||
|
||||
static inline int dfltcc_are_params_ok(int level, uInt window_bits, int strategy, uint16_t level_mask)
|
||||
{
|
||||
return (level_mask & ((uint16_t)1 << level)) != 0 &&
|
||||
(window_bits == HB_BITS) &&
|
||||
(strategy == Z_FIXED || strategy == Z_DEFAULT_STRATEGY);
|
||||
}
|
||||
|
||||
|
||||
int ZLIB_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm)
|
||||
{
|
||||
static inline int dfltcc_can_deflate_with_params(PREFIX3(streamp) strm, int level, uInt window_bits, int strategy,
|
||||
int reproducible) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
|
||||
|
||||
/* Unsupported compression settings */
|
||||
if (!dfltcc_are_params_ok(state->level, state->w_bits, state->strategy, dfltcc_state->level_mask))
|
||||
if ((dfltcc_state->level_mask & (1 << level)) == 0)
|
||||
return 0;
|
||||
if (window_bits != HB_BITS)
|
||||
return 0;
|
||||
if (strategy != Z_FIXED && strategy != Z_DEFAULT_STRATEGY)
|
||||
return 0;
|
||||
if (reproducible)
|
||||
return 0;
|
||||
|
||||
/* Unsupported hardware */
|
||||
@@ -45,8 +44,13 @@ int ZLIB_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline void dfltcc_gdht(PREFIX3(streamp) strm)
|
||||
{
|
||||
int Z_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
|
||||
return dfltcc_can_deflate_with_params(strm, state->level, state->w_bits, state->strategy, state->reproducible);
|
||||
}
|
||||
|
||||
static inline void dfltcc_gdht(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
|
||||
size_t avail_in = strm->avail_in;
|
||||
@@ -54,8 +58,7 @@ static inline void dfltcc_gdht(PREFIX3(streamp) strm)
|
||||
dfltcc(DFLTCC_GDHT, param, NULL, NULL, &strm->next_in, &avail_in, NULL);
|
||||
}
|
||||
|
||||
static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm)
|
||||
{
|
||||
static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
|
||||
size_t avail_in = strm->avail_in;
|
||||
@@ -72,11 +75,10 @@ static inline dfltcc_cc dfltcc_cmpr(PREFIX3(streamp) strm)
|
||||
return cc;
|
||||
}
|
||||
|
||||
static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param)
|
||||
{
|
||||
static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0 *param) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
|
||||
send_bits(state, bi_reverse(param->eobs >> (15 - param->eobl), param->eobl), param->eobl);
|
||||
send_bits(state, bi_reverse(param->eobs >> (15 - param->eobl), param->eobl), param->eobl, state->bi_buf, state->bi_valid);
|
||||
flush_pending(strm);
|
||||
if (state->pending != 0) {
|
||||
/* The remaining data is located in pending_out[0:pending]. If someone
|
||||
@@ -93,8 +95,7 @@ static inline void send_eobs(PREFIX3(streamp) strm, const struct dfltcc_param_v0
|
||||
#endif
|
||||
}
|
||||
|
||||
int ZLIB_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result)
|
||||
{
|
||||
int Z_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
|
||||
struct dfltcc_param_v0 *param = &dfltcc_state->param;
|
||||
@@ -104,31 +105,38 @@ int ZLIB_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *
|
||||
int soft_bcc;
|
||||
int no_flush;
|
||||
|
||||
if (!dfltcc_can_deflate(strm))
|
||||
if (!dfltcc_can_deflate(strm)) {
|
||||
/* Clear history. */
|
||||
if (flush == Z_FULL_FLUSH)
|
||||
param->hl = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
again:
|
||||
masked_avail_in = 0;
|
||||
soft_bcc = 0;
|
||||
no_flush = flush == Z_NO_FLUSH;
|
||||
|
||||
/* Trailing empty block. Switch to software, except when Continuation Flag
|
||||
* is set, which means that DFLTCC has buffered some output in the
|
||||
* parameter block and needs to be called again in order to flush it.
|
||||
/* No input data. Return, except when Continuation Flag is set, which means
|
||||
* that DFLTCC has buffered some output in the parameter block and needs to
|
||||
* be called again in order to flush it.
|
||||
*/
|
||||
if (flush == Z_FINISH && strm->avail_in == 0 && !param->cf) {
|
||||
if (param->bcf) {
|
||||
/* A block is still open, and the hardware does not support closing
|
||||
* blocks without adding data. Thus, close it manually.
|
||||
*/
|
||||
if (strm->avail_in == 0 && !param->cf) {
|
||||
/* A block is still open, and the hardware does not support closing
|
||||
* blocks without adding data. Thus, close it manually.
|
||||
*/
|
||||
if (!no_flush && param->bcf) {
|
||||
send_eobs(strm, param);
|
||||
param->bcf = 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (strm->avail_in == 0 && !param->cf) {
|
||||
*result = need_more;
|
||||
/* Let one of deflate_* functions write a trailing empty block. */
|
||||
if (flush == Z_FINISH)
|
||||
return 0;
|
||||
/* Clear history. */
|
||||
if (flush == Z_FULL_FLUSH)
|
||||
param->hl = 0;
|
||||
/* Trigger block post-processing if necessary. */
|
||||
*result = no_flush ? need_more : block_done;
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -154,13 +162,18 @@ again:
|
||||
send_eobs(strm, param);
|
||||
param->bcf = 0;
|
||||
dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size;
|
||||
if (strm->avail_out == 0) {
|
||||
*result = need_more;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* No space for compressed data. If we proceed, dfltcc_cmpr() will return
|
||||
* DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still
|
||||
* set BCF=1, which is wrong. Avoid complications and return early.
|
||||
*/
|
||||
if (strm->avail_out == 0) {
|
||||
*result = need_more;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* The caller gave us too much data. Pass only one block worth of
|
||||
* uncompressed data to DFLTCC and mask the rest, so that on the next
|
||||
* iteration we start a new block.
|
||||
@@ -180,7 +193,7 @@ again:
|
||||
param->cvt = state->wrap == 2 ? CVT_CRC32 : CVT_ADLER32;
|
||||
if (!no_flush)
|
||||
/* We need to close a block. Always do this in software - when there is
|
||||
* no input data, the hardware will not nohor BCC. */
|
||||
* no input data, the hardware will not honor BCC. */
|
||||
soft_bcc = 1;
|
||||
if (flush == Z_FINISH && !param->bcf)
|
||||
/* We are about to open a BFINAL block, set Block Header Final bit
|
||||
@@ -195,8 +208,8 @@ again:
|
||||
param->sbb = (unsigned int)state->bi_valid;
|
||||
if (param->sbb > 0)
|
||||
*strm->next_out = (unsigned char)state->bi_buf;
|
||||
if (param->hl)
|
||||
param->nt = 0; /* Honor history */
|
||||
/* Honor history and check value */
|
||||
param->nt = 0;
|
||||
param->cv = state->wrap == 2 ? ZSWAP32(strm->adler) : strm->adler;
|
||||
|
||||
/* When opening a block, choose a Huffman-Table Type */
|
||||
@@ -277,31 +290,60 @@ again:
|
||||
fly with deflateParams, we need to convert between hardware and software
|
||||
window formats.
|
||||
*/
|
||||
int ZLIB_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy)
|
||||
{
|
||||
static int dfltcc_was_deflate_used(PREFIX3(streamp) strm) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
|
||||
|
||||
return strm->total_in > 0 || param->nt == 0 || param->hl > 0;
|
||||
}
|
||||
|
||||
int Z_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy, int *flush) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
|
||||
struct dfltcc_param_v0 *param = &dfltcc_state->param;
|
||||
int could_deflate = dfltcc_can_deflate(strm);
|
||||
int can_deflate = dfltcc_are_params_ok(level, state->w_bits, strategy, dfltcc_state->level_mask);
|
||||
int can_deflate = dfltcc_can_deflate_with_params(strm, level, state->w_bits, strategy, state->reproducible);
|
||||
|
||||
if (can_deflate == could_deflate)
|
||||
/* We continue to work in the same mode - no changes needed */
|
||||
return Z_OK;
|
||||
|
||||
if (strm->total_in == 0 && param->nt == 1 && param->hl == 0)
|
||||
if (!dfltcc_was_deflate_used(strm))
|
||||
/* DFLTCC was not used yet - no changes needed */
|
||||
return Z_OK;
|
||||
|
||||
/* Switching between hardware and software is not implemented */
|
||||
return Z_STREAM_ERROR;
|
||||
/* For now, do not convert between window formats - simply get rid of the old data instead */
|
||||
*flush = Z_FULL_FLUSH;
|
||||
return Z_OK;
|
||||
}
|
||||
|
||||
int Z_INTERNAL dfltcc_deflate_done(PREFIX3(streamp) strm, int flush) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
|
||||
struct dfltcc_param_v0 *param = &dfltcc_state->param;
|
||||
|
||||
/* When deflate(Z_FULL_FLUSH) is called with small avail_out, it might
|
||||
* close the block without resetting the compression state. Detect this
|
||||
* situation and return that deflation is not done.
|
||||
*/
|
||||
if (flush == Z_FULL_FLUSH && strm->avail_out == 0)
|
||||
return 0;
|
||||
|
||||
/* Return that deflation is not done if DFLTCC is used and either it
|
||||
* buffered some data (Continuation Flag is set), or has not written EOBS
|
||||
* yet (Block-Continuation Flag is set).
|
||||
*/
|
||||
return !dfltcc_can_deflate(strm) || (!param->cf && !param->bcf);
|
||||
}
|
||||
|
||||
int Z_INTERNAL dfltcc_can_set_reproducible(PREFIX3(streamp) strm, int reproducible) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
|
||||
return reproducible != state->reproducible && !dfltcc_was_deflate_used(strm);
|
||||
}
|
||||
|
||||
/*
|
||||
Preloading history.
|
||||
*/
|
||||
static void append_history(struct dfltcc_param_v0 *param, unsigned char *history, const unsigned char *buf, uInt count)
|
||||
{
|
||||
static void append_history(struct dfltcc_param_v0 *param, unsigned char *history, const unsigned char *buf, uInt count) {
|
||||
size_t offset;
|
||||
size_t n;
|
||||
|
||||
@@ -331,20 +373,19 @@ static void append_history(struct dfltcc_param_v0 *param, unsigned char *history
|
||||
}
|
||||
}
|
||||
|
||||
int ZLIB_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
|
||||
const unsigned char *dictionary, uInt dict_length)
|
||||
{
|
||||
int Z_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
|
||||
const unsigned char *dictionary, uInt dict_length) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
|
||||
struct dfltcc_param_v0 *param = &dfltcc_state->param;
|
||||
|
||||
append_history(param, state->window, dictionary, dict_length);
|
||||
state->strstart = 1; /* Add FDICT to zlib header */
|
||||
state->block_start = state->strstart; /* Make deflate_stored happy */
|
||||
return Z_OK;
|
||||
}
|
||||
|
||||
int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length)
|
||||
{
|
||||
int Z_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt *dict_length) {
|
||||
deflate_state *state = (deflate_state *)strm->state;
|
||||
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
|
||||
struct dfltcc_param_v0 *param = &dfltcc_state->param;
|
||||
|
||||
@@ -3,12 +3,14 @@
|
||||
|
||||
#include "dfltcc_common.h"
|
||||
|
||||
int ZLIB_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm);
|
||||
int ZLIB_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result);
|
||||
int ZLIB_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy);
|
||||
int ZLIB_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
|
||||
int Z_INTERNAL dfltcc_can_deflate(PREFIX3(streamp) strm);
|
||||
int Z_INTERNAL dfltcc_deflate(PREFIX3(streamp) strm, int flush, block_state *result);
|
||||
int Z_INTERNAL dfltcc_deflate_params(PREFIX3(streamp) strm, int level, int strategy, int *flush);
|
||||
int Z_INTERNAL dfltcc_deflate_done(PREFIX3(streamp) strm, int flush);
|
||||
int Z_INTERNAL dfltcc_can_set_reproducible(PREFIX3(streamp) strm, int reproducible);
|
||||
int Z_INTERNAL dfltcc_deflate_set_dictionary(PREFIX3(streamp) strm,
|
||||
const unsigned char *dictionary, uInt dict_length);
|
||||
int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
|
||||
int Z_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned char *dictionary, uInt* dict_length);
|
||||
|
||||
#define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) \
|
||||
do { \
|
||||
@@ -25,15 +27,17 @@ int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned
|
||||
#define DEFLATE_RESET_KEEP_HOOK(strm) \
|
||||
dfltcc_reset((strm), sizeof(deflate_state))
|
||||
|
||||
#define DEFLATE_PARAMS_HOOK(strm, level, strategy) \
|
||||
#define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) \
|
||||
do { \
|
||||
int err; \
|
||||
\
|
||||
err = dfltcc_deflate_params((strm), (level), (strategy)); \
|
||||
err = dfltcc_deflate_params((strm), (level), (strategy), (hook_flush)); \
|
||||
if (err == Z_STREAM_ERROR) \
|
||||
return err; \
|
||||
} while (0)
|
||||
|
||||
#define DEFLATE_DONE dfltcc_deflate_done
|
||||
|
||||
#define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
|
||||
do { \
|
||||
if (dfltcc_can_deflate((strm))) \
|
||||
@@ -47,4 +51,6 @@ int ZLIB_INTERNAL dfltcc_deflate_get_dictionary(PREFIX3(streamp) strm, unsigned
|
||||
|
||||
#define DEFLATE_NEED_CHECKSUM(strm) (!dfltcc_can_deflate((strm)))
|
||||
|
||||
#define DEFLATE_CAN_SET_REPRODUCIBLE dfltcc_can_set_reproducible
|
||||
|
||||
#endif
|
||||
|
||||
@@ -46,18 +46,17 @@ typedef enum {
|
||||
#define DFLTCC_FACILITY 151
|
||||
|
||||
static inline dfltcc_cc dfltcc(int fn, void *param,
|
||||
unsigned char **op1, size_t *len1, const unsigned char **op2, size_t *len2, void *hist)
|
||||
{
|
||||
unsigned char **op1, size_t *len1, z_const unsigned char **op2, size_t *len2, void *hist) {
|
||||
unsigned char *t2 = op1 ? *op1 : NULL;
|
||||
size_t t3 = len1 ? *len1 : 0;
|
||||
const unsigned char *t4 = op2 ? *op2 : NULL;
|
||||
z_const unsigned char *t4 = op2 ? *op2 : NULL;
|
||||
size_t t5 = len2 ? *len2 : 0;
|
||||
register int r0 __asm__("r0") = fn;
|
||||
register void *r1 __asm__("r1") = param;
|
||||
register unsigned char *r2 __asm__("r2") = t2;
|
||||
register size_t r3 __asm__("r3") = t3;
|
||||
register const unsigned char *r4 __asm__("r4") = t4;
|
||||
register size_t r5 __asm__("r5") = t5;
|
||||
Z_REGISTER int r0 __asm__("r0") = fn;
|
||||
Z_REGISTER void *r1 __asm__("r1") = param;
|
||||
Z_REGISTER unsigned char *r2 __asm__("r2") = t2;
|
||||
Z_REGISTER size_t r3 __asm__("r3") = t3;
|
||||
Z_REGISTER z_const unsigned char *r4 __asm__("r4") = t4;
|
||||
Z_REGISTER size_t r5 __asm__("r5") = t5;
|
||||
int cc;
|
||||
|
||||
__asm__ volatile(
|
||||
@@ -108,13 +107,11 @@ struct dfltcc_qaf_param {
|
||||
|
||||
static_assert(sizeof(struct dfltcc_qaf_param) == 32, sizeof_struct_dfltcc_qaf_param_is_32);
|
||||
|
||||
static inline int is_bit_set(const char *bits, int n)
|
||||
{
|
||||
static inline int is_bit_set(const char *bits, int n) {
|
||||
return bits[n / 8] & (1 << (7 - (n % 8)));
|
||||
}
|
||||
|
||||
static inline void clear_bit(char *bits, int n)
|
||||
{
|
||||
static inline void clear_bit(char *bits, int n) {
|
||||
bits[n / 8] &= ~(1 << (7 - (n % 8)));
|
||||
}
|
||||
|
||||
@@ -175,8 +172,7 @@ struct dfltcc_param_v0 {
|
||||
|
||||
static_assert(sizeof(struct dfltcc_param_v0) == 1536, sizeof_struct_dfltcc_param_v0_is_1536);
|
||||
|
||||
static inline const char *oesc_msg(char *buf, int oesc)
|
||||
{
|
||||
static inline z_const char *oesc_msg(char *buf, int oesc) {
|
||||
if (oesc == 0x00)
|
||||
return NULL; /* Successful completion */
|
||||
else {
|
||||
@@ -198,4 +194,6 @@ struct dfltcc_state {
|
||||
char msg[64]; /* Buffer for strm->msg */
|
||||
};
|
||||
|
||||
#define GET_DFLTCC_STATE(state) ((struct dfltcc_state *)((state) + 1))
|
||||
#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
|
||||
|
||||
#define GET_DFLTCC_STATE(state) ((struct dfltcc_state *)((char *)(state) + ALIGN_UP(sizeof(*state), 8)))
|
||||
|
||||
@@ -13,15 +13,14 @@
|
||||
$ make
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
#include "inftrees.h"
|
||||
#include "inflate.h"
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zutil.h"
|
||||
#include "../../inftrees.h"
|
||||
#include "../../inflate.h"
|
||||
#include "dfltcc_inflate.h"
|
||||
#include "dfltcc_detail.h"
|
||||
|
||||
int ZLIB_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm)
|
||||
{
|
||||
int Z_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm) {
|
||||
struct inflate_state *state = (struct inflate_state *)strm->state;
|
||||
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
|
||||
|
||||
@@ -33,8 +32,7 @@ int ZLIB_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm)
|
||||
return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0);
|
||||
}
|
||||
|
||||
static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm)
|
||||
{
|
||||
static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm) {
|
||||
struct inflate_state *state = (struct inflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
|
||||
size_t avail_in = strm->avail_in;
|
||||
@@ -49,8 +47,7 @@ static inline dfltcc_cc dfltcc_xpnd(PREFIX3(streamp) strm)
|
||||
return cc;
|
||||
}
|
||||
|
||||
dfltcc_inflate_action ZLIB_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret)
|
||||
{
|
||||
dfltcc_inflate_action Z_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret) {
|
||||
struct inflate_state *state = (struct inflate_state *)strm->state;
|
||||
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
|
||||
struct dfltcc_param_v0 *param = &dfltcc_state->param;
|
||||
@@ -115,16 +112,14 @@ dfltcc_inflate_action ZLIB_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int fl
|
||||
DFLTCC_INFLATE_BREAK : DFLTCC_INFLATE_CONTINUE;
|
||||
}
|
||||
|
||||
int ZLIB_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm)
|
||||
{
|
||||
int Z_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm) {
|
||||
struct inflate_state *state = (struct inflate_state *)strm->state;
|
||||
struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param;
|
||||
|
||||
return !param->nt;
|
||||
}
|
||||
|
||||
int ZLIB_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm)
|
||||
{
|
||||
int Z_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm) {
|
||||
struct inflate_state *state = (struct inflate_state *)strm->state;
|
||||
struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state);
|
||||
|
||||
|
||||
@@ -3,15 +3,15 @@
|
||||
|
||||
#include "dfltcc_common.h"
|
||||
|
||||
int ZLIB_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm);
|
||||
int Z_INTERNAL dfltcc_can_inflate(PREFIX3(streamp) strm);
|
||||
typedef enum {
|
||||
DFLTCC_INFLATE_CONTINUE,
|
||||
DFLTCC_INFLATE_BREAK,
|
||||
DFLTCC_INFLATE_SOFTWARE,
|
||||
} dfltcc_inflate_action;
|
||||
dfltcc_inflate_action ZLIB_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret);
|
||||
int ZLIB_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm);
|
||||
int ZLIB_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm);
|
||||
dfltcc_inflate_action Z_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush, int *ret);
|
||||
int Z_INTERNAL dfltcc_was_inflate_used(PREFIX3(streamp) strm);
|
||||
int Z_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm);
|
||||
|
||||
#define INFLATE_RESET_KEEP_HOOK(strm) \
|
||||
dfltcc_reset((strm), sizeof(struct inflate_state))
|
||||
@@ -41,4 +41,9 @@ int ZLIB_INTERNAL dfltcc_inflate_disable(PREFIX3(streamp) strm);
|
||||
if (dfltcc_was_inflate_used((strm))) return -(1L << 16); \
|
||||
} while (0)
|
||||
|
||||
#define INFLATE_SYNC_POINT_HOOK(strm) \
|
||||
do { \
|
||||
if (dfltcc_was_inflate_used((strm))) return Z_STREAM_ERROR; \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
fill_window_sse.c SSE2 optimized fill_window
|
||||
deflate_quick.c SSE4 optimized deflate strategy for use as level 1
|
||||
crc_folding.c SSE4 + PCLMULQDQ optimized CRC folding implementation
|
||||
@@ -0,0 +1,8 @@
|
||||
Contents
|
||||
--------
|
||||
|
||||
|Name|Description|
|
||||
|:-|:-|
|
||||
|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
|
||||
|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
|
||||
|slide_sse2.c|SSE2 optimized slide_hash|
|
||||
@@ -8,7 +8,9 @@ SFLAGS=
|
||||
INCLUDES=
|
||||
SUFFIX=
|
||||
|
||||
AVX2FLAG=-mavx2
|
||||
SSE2FLAG=-msse2
|
||||
SSSE3FLAG=-mssse3
|
||||
SSE4FLAG=-msse4
|
||||
PCLMULFLAG=-mpclmul
|
||||
|
||||
@@ -16,7 +18,18 @@ SRCDIR=.
|
||||
SRCTOP=../..
|
||||
TOPDIR=$(SRCTOP)
|
||||
|
||||
all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo
|
||||
all: \
|
||||
x86.o x86.lo \
|
||||
adler32_avx.o adler32.lo \
|
||||
adler32_ssse3.o adler32_ssse3.lo \
|
||||
chunkset_avx.o chunkset_avx.lo \
|
||||
chunkset_sse.o chunkset_sse.lo \
|
||||
compare258_avx.o compare258_avx.lo \
|
||||
compare258_sse.o compare258_sse.lo \
|
||||
insert_string_sse.o insert_string_sse.lo \
|
||||
crc_folding.o crc_folding.lo \
|
||||
slide_avx.o slide_avx.lo \
|
||||
slide_sse.o slide_sse.lo
|
||||
|
||||
x86.o:
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
|
||||
@@ -24,17 +37,29 @@ x86.o:
|
||||
x86.lo:
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
|
||||
|
||||
fill_window_sse.o:
|
||||
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
|
||||
chunkset_avx.o:
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
|
||||
|
||||
fill_window_sse.lo:
|
||||
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
|
||||
chunkset_avx.lo:
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
|
||||
|
||||
deflate_quick.o:
|
||||
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
|
||||
chunkset_sse.o:
|
||||
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
|
||||
|
||||
deflate_quick.lo:
|
||||
$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
|
||||
chunkset_sse.lo:
|
||||
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
|
||||
|
||||
compare258_avx.o:
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
|
||||
|
||||
compare258_avx.lo:
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
|
||||
|
||||
compare258_sse.o:
|
||||
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
|
||||
|
||||
compare258_sse.lo:
|
||||
$(CC) $(SFLAGS) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
|
||||
|
||||
insert_string_sse.o:
|
||||
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
|
||||
@@ -48,6 +73,30 @@ crc_folding.o:
|
||||
crc_folding.lo:
|
||||
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
|
||||
|
||||
slide_avx.o:
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
|
||||
|
||||
slide_avx.lo:
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
|
||||
|
||||
slide_sse.o:
|
||||
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
|
||||
|
||||
slide_sse.lo:
|
||||
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
|
||||
|
||||
adler32_avx.o: $(SRCDIR)/adler32_avx.c
|
||||
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
|
||||
|
||||
adler32_avx.lo: $(SRCDIR)/adler32_avx.c
|
||||
$(CC) $(SFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
|
||||
|
||||
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
|
||||
$(CC) $(CFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
|
||||
|
||||
adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
|
||||
$(CC) $(SFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
|
||||
|
||||
mostlyclean: clean
|
||||
clean:
|
||||
rm -f *.o *.lo *~
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
/* adler32.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
#include "../../adler32_p.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#ifdef X86_AVX2_ADLER32
|
||||
|
||||
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
|
||||
uint32_t sum2;
|
||||
|
||||
/* split Adler-32 into component sums */
|
||||
sum2 = (adler >> 16) & 0xffff;
|
||||
adler &= 0xffff;
|
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */
|
||||
if (UNLIKELY(len == 1))
|
||||
return adler32_len_1(adler, buf, sum2);
|
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */
|
||||
if (UNLIKELY(buf == NULL))
|
||||
return 1L;
|
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */
|
||||
if (UNLIKELY(len < 16))
|
||||
return adler32_len_16(adler, buf, len, sum2);
|
||||
|
||||
uint32_t ALIGNED_(32) s1[8], s2[8];
|
||||
|
||||
memset(s1, 0, sizeof(s1)); s1[7] = adler; // TODO: would a masked load be faster?
|
||||
memset(s2, 0, sizeof(s2)); s2[7] = sum2;
|
||||
|
||||
char ALIGNED_(32) dot1[32] = \
|
||||
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m256i dot1v = _mm256_load_si256((__m256i*)dot1);
|
||||
char ALIGNED_(32) dot2[32] = \
|
||||
{32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
|
||||
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
|
||||
__m256i dot2v = _mm256_load_si256((__m256i*)dot2);
|
||||
short ALIGNED_(32) dot3[16] = \
|
||||
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m256i dot3v = _mm256_load_si256((__m256i*)dot3);
|
||||
|
||||
// We will need to multiply by
|
||||
char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
__m128i shiftv = _mm_load_si128((__m128i*)shift);
|
||||
|
||||
while (len >= 32) {
|
||||
__m256i vs1 = _mm256_load_si256((__m256i*)s1);
|
||||
__m256i vs2 = _mm256_load_si256((__m256i*)s2);
|
||||
__m256i vs1_0 = vs1;
|
||||
|
||||
int k = (len < NMAX ? (int)len : NMAX);
|
||||
k -= k % 32;
|
||||
len -= k;
|
||||
|
||||
while (k >= 32) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
|
||||
*/
|
||||
__m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
|
||||
buf += 32;
|
||||
k -= 32;
|
||||
|
||||
__m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
|
||||
__m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
|
||||
__m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v);
|
||||
vs1 = _mm256_add_epi32(vsum1, vs1);
|
||||
__m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
|
||||
vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);
|
||||
vsum2 = _mm256_add_epi32(vsum2, vs2);
|
||||
vs2 = _mm256_add_epi32(vsum2, vs1_0);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
// At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
|
||||
// would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
|
||||
uint32_t ALIGNED_(32) s1_unpack[8];
|
||||
uint32_t ALIGNED_(32) s2_unpack[8];
|
||||
|
||||
_mm256_store_si256((__m256i*)s1_unpack, vs1);
|
||||
_mm256_store_si256((__m256i*)s2_unpack, vs2);
|
||||
|
||||
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
|
||||
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
|
||||
adler %= BASE;
|
||||
s1[7] = adler;
|
||||
|
||||
sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) +
|
||||
(s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
|
||||
sum2 %= BASE;
|
||||
s2[7] = sum2;
|
||||
}
|
||||
|
||||
while (len) {
|
||||
len--;
|
||||
adler += *buf++;
|
||||
sum2 += adler;
|
||||
}
|
||||
adler %= BASE;
|
||||
sum2 %= BASE;
|
||||
|
||||
/* return recombined sums */
|
||||
return adler | (sum2 << 16);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,118 @@
|
||||
/* adler32.c -- compute the Adler-32 checksum of a data stream
|
||||
* Copyright (C) 1995-2011 Mark Adler
|
||||
* Authors:
|
||||
* Brian Bockelman <bockelman@gmail.com>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
#include "../../adler32_p.h"
|
||||
|
||||
#ifdef X86_SSSE3_ADLER32
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
|
||||
uint32_t sum2;
|
||||
|
||||
/* split Adler-32 into component sums */
|
||||
sum2 = (adler >> 16) & 0xffff;
|
||||
adler &= 0xffff;
|
||||
|
||||
/* in case user likes doing a byte at a time, keep it fast */
|
||||
if (UNLIKELY(len == 1))
|
||||
return adler32_len_1(adler, buf, sum2);
|
||||
|
||||
/* initial Adler-32 value (deferred check for len == 1 speed) */
|
||||
if (UNLIKELY(buf == NULL))
|
||||
return 1L;
|
||||
|
||||
/* in case short lengths are provided, keep it somewhat fast */
|
||||
if (UNLIKELY(len < 16))
|
||||
return adler32_len_16(adler, buf, len, sum2);
|
||||
|
||||
uint32_t ALIGNED_(16) s1[4], s2[4];
|
||||
|
||||
s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
|
||||
s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
|
||||
|
||||
char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m128i dot1v = _mm_load_si128((__m128i*)dot1);
|
||||
char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
|
||||
__m128i dot2v = _mm_load_si128((__m128i*)dot2);
|
||||
short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m128i dot3v = _mm_load_si128((__m128i*)dot3);
|
||||
|
||||
// We will need to multiply by
|
||||
//char ALIGNED_(16) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
|
||||
|
||||
char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
__m128i shiftv = _mm_load_si128((__m128i*)shift);
|
||||
|
||||
while (len >= 16) {
|
||||
__m128i vs1 = _mm_load_si128((__m128i*)s1);
|
||||
__m128i vs2 = _mm_load_si128((__m128i*)s2);
|
||||
__m128i vs1_0 = vs1;
|
||||
|
||||
int k = (len < NMAX ? (int)len : NMAX);
|
||||
k -= k % 16;
|
||||
len -= k;
|
||||
|
||||
while (k >= 16) {
|
||||
/*
|
||||
vs1 = adler + sum(c[i])
|
||||
vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
|
||||
|
||||
NOTE: 256-bit equivalents are:
|
||||
_mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
|
||||
_mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t.
|
||||
We could rewrite the below to use 256-bit instructions instead of 128-bit.
|
||||
*/
|
||||
__m128i vbuf = _mm_loadu_si128((__m128i*)buf);
|
||||
buf += 16;
|
||||
k -= 16;
|
||||
|
||||
__m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
|
||||
__m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
|
||||
__m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
|
||||
vs1 = _mm_add_epi32(vsum1, vs1);
|
||||
__m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
|
||||
vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
|
||||
vsum2 = _mm_add_epi32(vsum2, vs2);
|
||||
vs2 = _mm_add_epi32(vsum2, vs1_0);
|
||||
vs1_0 = vs1;
|
||||
}
|
||||
|
||||
// At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
|
||||
// would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
|
||||
|
||||
uint32_t ALIGNED_(16) s1_unpack[4];
|
||||
uint32_t ALIGNED_(16) s2_unpack[4];
|
||||
|
||||
_mm_store_si128((__m128i*)s1_unpack, vs1);
|
||||
_mm_store_si128((__m128i*)s2_unpack, vs2);
|
||||
|
||||
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
|
||||
adler %= BASE;
|
||||
s1[3] = adler;
|
||||
|
||||
sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
|
||||
sum2 %= BASE;
|
||||
s2[3] = sum2;
|
||||
}
|
||||
|
||||
while (len) {
|
||||
len--;
|
||||
adler += *buf++;
|
||||
sum2 += adler;
|
||||
}
|
||||
adler %= BASE;
|
||||
sum2 %= BASE;
|
||||
|
||||
/* return recombined sums */
|
||||
return adler | (sum2 << 16);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,50 @@
|
||||
/* chunkset_avx.c -- AVX inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
|
||||
#ifdef X86_AVX_CHUNKSET
|
||||
#include <immintrin.h>
|
||||
|
||||
typedef __m256i chunk_t;
|
||||
|
||||
#define HAVE_CHUNKMEMSET_1
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi8(*(int8_t *)from);
|
||||
}
|
||||
|
||||
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi16(*(int16_t *)from);
|
||||
}
|
||||
|
||||
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi32(*(int32_t *)from);
|
||||
}
|
||||
|
||||
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm256_set1_epi64x(*(int64_t *)from);
|
||||
}
|
||||
|
||||
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
|
||||
*chunk = _mm256_loadu_si256((__m256i *)s);
|
||||
}
|
||||
|
||||
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
|
||||
_mm256_storeu_si256((__m256i *)out, *chunk);
|
||||
}
|
||||
|
||||
#define CHUNKSIZE chunksize_avx
|
||||
#define CHUNKCOPY chunkcopy_avx
|
||||
#define CHUNKCOPY_SAFE chunkcopy_safe_avx
|
||||
#define CHUNKUNROLL chunkunroll_avx
|
||||
#define CHUNKMEMSET chunkmemset_avx
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,51 @@
|
||||
/* chunkset_sse.c -- SSE inline functions to copy small data chunks.
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zutil.h"
|
||||
|
||||
#ifdef X86_SSE2
|
||||
#include <immintrin.h>
|
||||
|
||||
typedef __m128i chunk_t;
|
||||
|
||||
#define HAVE_CHUNKMEMSET_1
|
||||
#define HAVE_CHUNKMEMSET_2
|
||||
#define HAVE_CHUNKMEMSET_4
|
||||
#define HAVE_CHUNKMEMSET_8
|
||||
|
||||
static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm_set1_epi8(*(int8_t *)from);
|
||||
}
|
||||
|
||||
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm_set1_epi16(*(int16_t *)from);
|
||||
}
|
||||
|
||||
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm_set1_epi32(*(int32_t *)from);
|
||||
}
|
||||
|
||||
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
|
||||
*chunk = _mm_set1_epi64x(*(int64_t *)from);
|
||||
}
|
||||
|
||||
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
|
||||
*chunk = _mm_loadu_si128((__m128i *)s);
|
||||
}
|
||||
|
||||
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
|
||||
_mm_storeu_si128((__m128i *)out, *chunk);
|
||||
}
|
||||
|
||||
#define CHUNKSIZE chunksize_sse2
|
||||
#define CHUNKCOPY chunkcopy_sse2
|
||||
#define CHUNKCOPY_SAFE chunkcopy_safe_sse2
|
||||
#define CHUNKUNROLL chunkunroll_sse2
|
||||
#define CHUNKMEMSET chunkmemset_sse2
|
||||
#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
|
||||
|
||||
#include "chunkset_tpl.h"
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,67 @@
|
||||
/* compare258_avx.c -- AVX2 version of compare258
|
||||
* Copyright Mika T. Lindqvist <postmaster@raasu.org>
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
|
||||
|
||||
#include <immintrin.h>
|
||||
#ifdef _MSC_VER
|
||||
# include <nmmintrin.h>
|
||||
#endif
|
||||
|
||||
/* UNALIGNED_OK, AVX2 intrinsic comparison */
|
||||
static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
__m256i ymm_src0, ymm_src1, ymm_cmp;
|
||||
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
|
||||
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
|
||||
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
|
||||
unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
|
||||
if (mask != 0xFFFFFFFF) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
|
||||
return len + match_byte;
|
||||
}
|
||||
|
||||
src0 += 32, src1 += 32, len += 32;
|
||||
|
||||
ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
|
||||
ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
|
||||
ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
|
||||
mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
|
||||
if (mask != 0xFFFFFFFF) {
|
||||
uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
|
||||
return len + match_byte;
|
||||
}
|
||||
|
||||
src0 += 32, src1 += 32, len += 32;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
static inline uint32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
|
||||
if (*(uint16_t *)src0 != *(uint16_t *)src1)
|
||||
return (*src0 == *src1);
|
||||
|
||||
return compare256_unaligned_avx2_static(src0+2, src1+2) + 2;
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) {
|
||||
return compare258_unaligned_avx2_static(src0, src1);
|
||||
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_unaligned_avx2
|
||||
#define COMPARE256 compare256_unaligned_avx2_static
|
||||
#define COMPARE258 compare258_unaligned_avx2_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,74 @@
|
||||
/* compare258_sse.c -- SSE4.2 version of compare258
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* Jim Guilford <james.guilford@intel.com>
|
||||
* Vinodh Gopal <vinodh.gopal@intel.com>
|
||||
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* Portions are Copyright (C) 2016 12Sided Technology, LLC.
|
||||
* Author:
|
||||
* Phil Vachon <pvachon@12sidedtech.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
#ifdef X86_SSE42_CMP_STR
|
||||
|
||||
#include <immintrin.h>
|
||||
#ifdef _MSC_VER
|
||||
# include <nmmintrin.h>
|
||||
#endif
|
||||
|
||||
/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
|
||||
static inline uint32_t compare256_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
#define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
|
||||
__m128i xmm_src0, xmm_src1;
|
||||
uint32_t ret;
|
||||
|
||||
xmm_src0 = _mm_loadu_si128((__m128i *)src0);
|
||||
xmm_src1 = _mm_loadu_si128((__m128i *)src1);
|
||||
ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
|
||||
if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
|
||||
return len + ret;
|
||||
}
|
||||
src0 += 16, src1 += 16, len += 16;
|
||||
|
||||
xmm_src0 = _mm_loadu_si128((__m128i *)src0);
|
||||
xmm_src1 = _mm_loadu_si128((__m128i *)src1);
|
||||
ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
|
||||
if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
|
||||
return len + ret;
|
||||
}
|
||||
src0 += 16, src1 += 16, len += 16;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
static inline uint32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
|
||||
if (*(uint16_t *)src0 != *(uint16_t *)src1)
|
||||
return (*src0 == *src1);
|
||||
|
||||
return compare256_unaligned_sse4_static(src0+2, src1+2) + 2;
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) {
|
||||
return compare258_unaligned_sse4_static(src0, src1);
|
||||
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_unaligned_sse4
|
||||
#define COMPARE256 compare256_unaligned_sse4_static
|
||||
#define COMPARE258 compare258_unaligned_sse4_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* instruction.
|
||||
*
|
||||
* A white paper describing this algorithm can be found at:
|
||||
@@ -18,14 +18,14 @@
|
||||
|
||||
#ifdef X86_PCLMULQDQ_CRC
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "../../zbuild.h"
|
||||
#include <inttypes.h>
|
||||
#include <immintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
|
||||
#include "crc_folding.h"
|
||||
|
||||
ZLIB_INTERNAL void crc_fold_init(deflate_state *const s) {
|
||||
Z_INTERNAL void crc_fold_init(deflate_state *const s) {
|
||||
/* CRC_SAVE */
|
||||
_mm_storeu_si128((__m128i *)s->crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
|
||||
_mm_storeu_si128((__m128i *)s->crc0 + 1, _mm_setzero_si128());
|
||||
@@ -227,9 +227,10 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res);
|
||||
}
|
||||
|
||||
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
|
||||
Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
|
||||
unsigned long algn_diff;
|
||||
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
|
||||
char ALIGNED_(16) partial_buf[16] = { 0 };
|
||||
|
||||
/* CRC_LOAD */
|
||||
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
|
||||
@@ -241,11 +242,14 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
|
||||
if (len < 16) {
|
||||
if (len == 0)
|
||||
return;
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
|
||||
|
||||
memcpy(partial_buf, src, len);
|
||||
xmm_crc_part = _mm_loadu_si128((const __m128i *)partial_buf);
|
||||
memcpy(dst, partial_buf, len);
|
||||
goto partial;
|
||||
}
|
||||
|
||||
algn_diff = (0 - (uintptr_t)src) & 0xF;
|
||||
algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
|
||||
if (algn_diff) {
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
|
||||
@@ -255,6 +259,8 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
|
||||
len -= algn_diff;
|
||||
|
||||
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
|
||||
} else {
|
||||
xmm_crc_part = _mm_setzero_si128();
|
||||
}
|
||||
|
||||
while ((len -= 64) >= 0) {
|
||||
@@ -305,7 +311,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
|
||||
goto done;
|
||||
|
||||
dst += 48;
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
|
||||
memcpy(&xmm_crc_part, (__m128i *)src + 3, len);
|
||||
} else if (len + 32 >= 0) {
|
||||
len += 32;
|
||||
|
||||
@@ -324,7 +330,7 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
|
||||
goto done;
|
||||
|
||||
dst += 32;
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
|
||||
memcpy(&xmm_crc_part, (__m128i *)src + 2, len);
|
||||
} else if (len + 48 >= 0) {
|
||||
len += 48;
|
||||
|
||||
@@ -340,16 +346,18 @@ ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, con
|
||||
goto done;
|
||||
|
||||
dst += 16;
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
|
||||
memcpy(&xmm_crc_part, (__m128i *)src + 1, len);
|
||||
} else {
|
||||
len += 64;
|
||||
if (len == 0)
|
||||
goto done;
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)src);
|
||||
memcpy(&xmm_crc_part, src, len);
|
||||
}
|
||||
|
||||
_mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
|
||||
memcpy(dst, partial_buf, len);
|
||||
|
||||
partial:
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
|
||||
partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
|
||||
done:
|
||||
/* CRC_SAVE */
|
||||
@@ -377,7 +385,7 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
|
||||
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
|
||||
};
|
||||
|
||||
uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
|
||||
uint32_t Z_INTERNAL crc_fold_512to32(deflate_state *const s) {
|
||||
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
|
||||
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
|
||||
|
||||
@@ -447,4 +455,3 @@ uint32_t ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -10,10 +10,10 @@
|
||||
#ifndef CRC_FOLDING_H_
|
||||
#define CRC_FOLDING_H_
|
||||
|
||||
#include "deflate.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
ZLIB_INTERNAL void crc_fold_init(deflate_state *const);
|
||||
ZLIB_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
|
||||
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
|
||||
Z_INTERNAL void crc_fold_init(deflate_state *const);
|
||||
Z_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
|
||||
Z_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
#ifndef X86_CTZL_H
|
||||
#define X86_CTZL_H
|
||||
|
||||
#include <intrin.h>
|
||||
#ifdef X86_CPUID
|
||||
# include "x86.h"
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
/* This is not a general purpose replacement for __builtin_ctzl. The function expects that value is != 0
|
||||
* Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
|
||||
*/
|
||||
static __forceinline unsigned long __builtin_ctzl(unsigned long value)
|
||||
{
|
||||
#ifdef X86_CPUID
|
||||
if (x86_cpu_has_tzcnt)
|
||||
return _tzcnt_u32(value);
|
||||
#endif
|
||||
unsigned long trailing_zero;
|
||||
_BitScanForward(&trailing_zero, value);
|
||||
return trailing_zero;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,175 +0,0 @@
|
||||
/*
|
||||
* Fill Window with SSE2-optimized hash shifting
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#ifdef X86_SSE2
|
||||
|
||||
#include "zbuild.h"
|
||||
#include <immintrin.h>
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
|
||||
extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
|
||||
|
||||
ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
|
||||
const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
|
||||
|
||||
register unsigned n;
|
||||
register Pos *p;
|
||||
unsigned more; /* Amount of free space at the end of the window. */
|
||||
unsigned int wsize = s->w_size;
|
||||
|
||||
Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
|
||||
|
||||
do {
|
||||
more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
|
||||
|
||||
/* Deal with !@#$% 64K limit: */
|
||||
if (sizeof(int) <= 2) {
|
||||
if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
|
||||
more = wsize;
|
||||
|
||||
} else if (more == (unsigned)(-1)) {
|
||||
/* Very unlikely, but possible on 16 bit machine if
|
||||
* strstart == 0 && lookahead == 1 (input done a byte at time)
|
||||
*/
|
||||
more--;
|
||||
}
|
||||
}
|
||||
|
||||
/* If the window is almost full and there is insufficient lookahead,
|
||||
* move the upper half to the lower one to make room in the upper half.
|
||||
*/
|
||||
if (s->strstart >= wsize+MAX_DIST(s)) {
|
||||
memcpy(s->window, s->window+wsize, (unsigned)wsize);
|
||||
s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
|
||||
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
|
||||
s->block_start -= (long) wsize;
|
||||
|
||||
/* Slide the hash table (could be avoided with 32 bit values
|
||||
at the expense of memory usage). We slide even when level == 0
|
||||
to keep the hash table consistent if we switch back to level > 0
|
||||
later. (Using level 0 permanently is not an optimal usage of
|
||||
zlib, so we don't care about this pathological case.)
|
||||
*/
|
||||
n = s->hash_size;
|
||||
p = &s->head[n];
|
||||
p -= 8;
|
||||
do {
|
||||
__m128i value, result;
|
||||
|
||||
value = _mm_loadu_si128((__m128i *)p);
|
||||
result = _mm_subs_epu16(value, xmm_wsize);
|
||||
_mm_storeu_si128((__m128i *)p, result);
|
||||
|
||||
p -= 8;
|
||||
n -= 8;
|
||||
} while (n > 0);
|
||||
|
||||
n = wsize;
|
||||
p = &s->prev[n];
|
||||
p -= 8;
|
||||
do {
|
||||
__m128i value, result;
|
||||
|
||||
value = _mm_loadu_si128((__m128i *)p);
|
||||
result = _mm_subs_epu16(value, xmm_wsize);
|
||||
_mm_storeu_si128((__m128i *)p, result);
|
||||
|
||||
p -= 8;
|
||||
n -= 8;
|
||||
} while (n > 0);
|
||||
more += wsize;
|
||||
}
|
||||
if (s->strm->avail_in == 0) break;
|
||||
|
||||
/* If there was no sliding:
|
||||
* strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
|
||||
* more == window_size - lookahead - strstart
|
||||
* => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
|
||||
* => more >= window_size - 2*WSIZE + 2
|
||||
* In the BIG_MEM or MMAP case (not yet supported),
|
||||
* window_size == input_size + MIN_LOOKAHEAD &&
|
||||
* strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
|
||||
* Otherwise, window_size == 2*WSIZE so more >= 2.
|
||||
* If there was sliding, more >= WSIZE. So in all cases, more >= 2.
|
||||
*/
|
||||
Assert(more >= 2, "more < 2");
|
||||
|
||||
n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
|
||||
s->lookahead += n;
|
||||
|
||||
/* Initialize the hash value now that we have some input: */
|
||||
if (s->lookahead + s->insert >= MIN_MATCH) {
|
||||
unsigned int str = s->strstart - s->insert;
|
||||
s->ins_h = s->window[str];
|
||||
if (str >= 1)
|
||||
functable.insert_string(s, str + 2 - MIN_MATCH, 1);
|
||||
#if MIN_MATCH != 3
|
||||
#error Call insert_string() MIN_MATCH-3 more times
|
||||
while (s->insert) {
|
||||
functable.insert_string(s, str, 1);
|
||||
str++;
|
||||
s->insert--;
|
||||
if (s->lookahead + s->insert < MIN_MATCH)
|
||||
break;
|
||||
}
|
||||
#else
|
||||
unsigned int count;
|
||||
if (unlikely(s->lookahead == 1)){
|
||||
count = s->insert - 1;
|
||||
}else{
|
||||
count = s->insert;
|
||||
}
|
||||
functable.insert_string(s, str, count);
|
||||
s->insert -= count;
|
||||
#endif
|
||||
}
|
||||
/* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
|
||||
* but this is not important since only literal bytes will be emitted.
|
||||
*/
|
||||
} while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
|
||||
|
||||
/* If the WIN_INIT bytes after the end of the current data have never been
|
||||
* written, then zero those bytes in order to avoid memory check reports of
|
||||
* the use of uninitialized (or uninitialised as Julian writes) bytes by
|
||||
* the longest match routines. Update the high water mark for the next
|
||||
* time through here. WIN_INIT is set to MAX_MATCH since the longest match
|
||||
* routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
|
||||
*/
|
||||
if (s->high_water < s->window_size) {
|
||||
unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
|
||||
unsigned long init;
|
||||
|
||||
if (s->high_water < curr) {
|
||||
/* Previous high water mark below current data -- zero WIN_INIT
|
||||
* bytes or up to end of window, whichever is less.
|
||||
*/
|
||||
init = s->window_size - curr;
|
||||
if (init > WIN_INIT)
|
||||
init = WIN_INIT;
|
||||
memset(s->window + curr, 0, (unsigned)init);
|
||||
s->high_water = curr + init;
|
||||
} else if (s->high_water < (unsigned long)curr + WIN_INIT) {
|
||||
/* High water mark at or above current data, but below current data
|
||||
* plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
|
||||
* to end of window, whichever is less.
|
||||
*/
|
||||
init = (unsigned long)curr + WIN_INIT - s->high_water;
|
||||
if (init > s->window_size - s->high_water)
|
||||
init = s->window_size - s->high_water;
|
||||
memset(s->window + s->high_water, 0, (unsigned)init);
|
||||
s->high_water += init;
|
||||
}
|
||||
}
|
||||
|
||||
Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
|
||||
}
|
||||
#endif
|
||||
@@ -5,52 +5,42 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
/* ===========================================================================
|
||||
* Insert string str in the dictionary and set match_head to the previous head
|
||||
* of the hash chain (the most recent string with same hash key). Return
|
||||
* the previous length of the hash chain.
|
||||
* IN assertion: all calls to to INSERT_STRING are made with consecutive
|
||||
* input characters and the first MIN_MATCH bytes of str are valid
|
||||
* (except for the last MIN_MATCH-1 bytes of the input file).
|
||||
*/
|
||||
#ifdef X86_SSE4_2_CRC_HASH
|
||||
ZLIB_INTERNAL Pos insert_string_sse(deflate_state *const s, const Pos str, unsigned int count) {
|
||||
Pos ret = 0;
|
||||
unsigned int idx;
|
||||
unsigned int *ip, val, h;
|
||||
|
||||
for (idx = 0; idx < count; idx++) {
|
||||
ip = (unsigned *)&s->window[str+idx];
|
||||
memcpy(&val, ip, sizeof(val));
|
||||
h = 0;
|
||||
|
||||
if (s->level >= TRIGGER_LEVEL)
|
||||
val &= 0xFFFFFF;
|
||||
|
||||
#include "../../zbuild.h"
|
||||
#include <immintrin.h>
|
||||
#ifdef _MSC_VER
|
||||
h = _mm_crc32_u32(h, val);
|
||||
#elif defined(X86_SSE4_2_CRC_INTRIN)
|
||||
h = __builtin_ia32_crc32si(h, val);
|
||||
# include <nmmintrin.h>
|
||||
#endif
|
||||
#include "../../deflate.h"
|
||||
|
||||
#ifdef X86_SSE42_CRC_INTRIN
|
||||
# ifdef _MSC_VER
|
||||
# define UPDATE_HASH(s, h, val)\
|
||||
h = _mm_crc32_u32(h, val)
|
||||
# else
|
||||
# define UPDATE_HASH(s, h, val)\
|
||||
h = __builtin_ia32_crc32si(h, val)
|
||||
# endif
|
||||
#else
|
||||
__asm__ __volatile__ (
|
||||
"crc32 %1,%0\n\t"
|
||||
: "+r" (h)
|
||||
: "r" (val)
|
||||
);
|
||||
#endif
|
||||
Pos head = s->head[h & s->hash_mask];
|
||||
if (head != str+idx) {
|
||||
s->prev[(str+idx) & s->w_mask] = head;
|
||||
s->head[h & s->hash_mask] = str+idx;
|
||||
if (idx == count-1)
|
||||
ret = head;
|
||||
} else if (idx == count - 1) {
|
||||
ret = str + idx;
|
||||
}
|
||||
# ifdef _MSC_VER
|
||||
# define UPDATE_HASH(s, h, val) {\
|
||||
__asm mov edx, h\
|
||||
__asm mov eax, val\
|
||||
__asm crc32 eax, edx\
|
||||
__asm mov val, eax\
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
# else
|
||||
# define UPDATE_HASH(s, h, val) \
|
||||
__asm__ __volatile__ (\
|
||||
"crc32 %1,%0\n\t"\
|
||||
: "+r" (h)\
|
||||
: "r" (val)\
|
||||
);
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define INSERT_STRING insert_string_sse4
|
||||
#define QUICK_INSERT_STRING quick_insert_string_sse4
|
||||
|
||||
#ifdef X86_SSE42_CRC_HASH
|
||||
# include "../../insert_string_tpl.h"
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* AVX2 optimized hash slide, based on Intel's slide_sse implementation
|
||||
*
|
||||
* Copyright (C) 2017 Intel Corporation
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
* Mika T. Lindqvist <postmaster@raasu.org>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
|
||||
Pos *p;
|
||||
unsigned n;
|
||||
uint16_t wsize = (uint16_t)s->w_size;
|
||||
const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
|
||||
|
||||
n = HASH_SIZE;
|
||||
p = &s->head[n] - 16;
|
||||
do {
|
||||
__m256i value, result;
|
||||
|
||||
value = _mm256_loadu_si256((__m256i *)p);
|
||||
result= _mm256_subs_epu16(value, ymm_wsize);
|
||||
_mm256_storeu_si256((__m256i *)p, result);
|
||||
p -= 16;
|
||||
n -= 16;
|
||||
} while (n > 0);
|
||||
|
||||
n = wsize;
|
||||
p = &s->prev[n] - 16;
|
||||
do {
|
||||
__m256i value, result;
|
||||
|
||||
value = _mm256_loadu_si256((__m256i *)p);
|
||||
result= _mm256_subs_epu16(value, ymm_wsize);
|
||||
_mm256_storeu_si256((__m256i *)p, result);
|
||||
|
||||
p -= 16;
|
||||
n -= 16;
|
||||
} while (n > 0);
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* SSE optimized hash slide
|
||||
*
|
||||
* Copyright (C) 2017 Intel Corporation
|
||||
* Authors:
|
||||
* Arjan van de Ven <arjan@linux.intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "../../zbuild.h"
|
||||
#include "../../deflate.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
|
||||
Pos *p;
|
||||
unsigned n;
|
||||
uint16_t wsize = (uint16_t)s->w_size;
|
||||
const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
|
||||
|
||||
n = HASH_SIZE;
|
||||
p = &s->head[n] - 8;
|
||||
do {
|
||||
__m128i value, result;
|
||||
|
||||
value = _mm_loadu_si128((__m128i *)p);
|
||||
result= _mm_subs_epu16(value, xmm_wsize);
|
||||
_mm_storeu_si128((__m128i *)p, result);
|
||||
p -= 8;
|
||||
n -= 8;
|
||||
} while (n > 0);
|
||||
|
||||
n = wsize;
|
||||
p = &s->prev[n] - 8;
|
||||
do {
|
||||
__m128i value, result;
|
||||
|
||||
value = _mm_loadu_si128((__m128i *)p);
|
||||
result= _mm_subs_epu16(value, xmm_wsize);
|
||||
_mm_storeu_si128((__m128i *)p, result);
|
||||
|
||||
p -= 8;
|
||||
n -= 8;
|
||||
} while (n > 0);
|
||||
}
|
||||
+54
-42
@@ -8,61 +8,73 @@
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zutil.h"
|
||||
#include "../../zutil.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
# include <intrin.h>
|
||||
#else
|
||||
// Newer versions of GCC and clang come with cpuid.h
|
||||
#include <cpuid.h>
|
||||
# include <cpuid.h>
|
||||
#endif
|
||||
|
||||
ZLIB_INTERNAL int x86_cpu_has_sse2;
|
||||
ZLIB_INTERNAL int x86_cpu_has_sse42;
|
||||
ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
|
||||
ZLIB_INTERNAL int x86_cpu_has_tzcnt;
|
||||
Z_INTERNAL int x86_cpu_has_avx2;
|
||||
Z_INTERNAL int x86_cpu_has_sse2;
|
||||
Z_INTERNAL int x86_cpu_has_ssse3;
|
||||
Z_INTERNAL int x86_cpu_has_sse42;
|
||||
Z_INTERNAL int x86_cpu_has_pclmulqdq;
|
||||
Z_INTERNAL int x86_cpu_has_tzcnt;
|
||||
|
||||
static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
|
||||
#ifdef _MSC_VER
|
||||
unsigned int registers[4];
|
||||
__cpuid(registers, info);
|
||||
unsigned int registers[4];
|
||||
__cpuid((int *)registers, info);
|
||||
|
||||
*eax = registers[0];
|
||||
*ebx = registers[1];
|
||||
*ecx = registers[2];
|
||||
*edx = registers[3];
|
||||
*eax = registers[0];
|
||||
*ebx = registers[1];
|
||||
*ecx = registers[2];
|
||||
*edx = registers[3];
|
||||
#else
|
||||
unsigned int _eax;
|
||||
unsigned int _ebx;
|
||||
unsigned int _ecx;
|
||||
unsigned int _edx;
|
||||
__cpuid(info, _eax, _ebx, _ecx, _edx);
|
||||
*eax = _eax;
|
||||
*ebx = _ebx;
|
||||
*ecx = _ecx;
|
||||
*edx = _edx;
|
||||
__cpuid(info, *eax, *ebx, *ecx, *edx);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ZLIB_INTERNAL x86_check_features(void) {
|
||||
unsigned eax, ebx, ecx, edx;
|
||||
unsigned maxbasic;
|
||||
static void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
|
||||
#ifdef _MSC_VER
|
||||
unsigned int registers[4];
|
||||
__cpuidex((int *)registers, info, subinfo);
|
||||
|
||||
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
|
||||
|
||||
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
x86_cpu_has_sse2 = edx & 0x4000000;
|
||||
x86_cpu_has_sse42 = ecx & 0x100000;
|
||||
x86_cpu_has_pclmulqdq = ecx & 0x2;
|
||||
|
||||
if (maxbasic >= 7) {
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
// check BMI1 bit
|
||||
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
|
||||
x86_cpu_has_tzcnt = ebx & 0x8;
|
||||
} else {
|
||||
x86_cpu_has_tzcnt = 0;
|
||||
}
|
||||
*eax = registers[0];
|
||||
*ebx = registers[1];
|
||||
*ecx = registers[2];
|
||||
*edx = registers[3];
|
||||
#else
|
||||
__cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
|
||||
#endif
|
||||
}
|
||||
|
||||
void Z_INTERNAL x86_check_features(void) {
|
||||
unsigned eax, ebx, ecx, edx;
|
||||
unsigned maxbasic;
|
||||
|
||||
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
|
||||
|
||||
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
x86_cpu_has_sse2 = edx & 0x4000000;
|
||||
x86_cpu_has_ssse3 = ecx & 0x200;
|
||||
x86_cpu_has_sse42 = ecx & 0x100000;
|
||||
x86_cpu_has_pclmulqdq = ecx & 0x2;
|
||||
|
||||
if (maxbasic >= 7) {
|
||||
cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
// check BMI1 bit
|
||||
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
|
||||
x86_cpu_has_tzcnt = ebx & 0x8;
|
||||
// check AVX2 bit
|
||||
x86_cpu_has_avx2 = ebx & 0x20;
|
||||
} else {
|
||||
x86_cpu_has_tzcnt = 0;
|
||||
x86_cpu_has_avx2 = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,18 @@
|
||||
/* cpu.h -- check for CPU features
|
||||
* Copyright (C) 2013 Intel Corporation Jim Kukunas
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
/* cpu.h -- check for CPU features
|
||||
* Copyright (C) 2013 Intel Corporation Jim Kukunas
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#ifndef CPU_H_
|
||||
#define CPU_H_
|
||||
|
||||
extern int x86_cpu_has_avx2;
|
||||
extern int x86_cpu_has_sse2;
|
||||
extern int x86_cpu_has_ssse3;
|
||||
extern int x86_cpu_has_sse42;
|
||||
extern int x86_cpu_has_pclmulqdq;
|
||||
extern int x86_cpu_has_tzcnt;
|
||||
|
||||
void ZLIB_INTERNAL x86_check_features(void);
|
||||
void Z_INTERNAL x86_check_features(void);
|
||||
|
||||
#endif /* CPU_H_ */
|
||||
|
||||
Reference in New Issue
Block a user