mirror of
https://github.com/EQEmu/Server.git
synced 2026-06-14 03:38:33 +00:00
Zlib ng
This commit is contained in:
@@ -0,0 +1,512 @@
|
||||
/*
|
||||
* Set match_start to the longest match starting at the given string and
|
||||
* return its length. Matches shorter or equal to prev_length are discarded,
|
||||
* in which case the result is equal to prev_length and match_start is garbage.
|
||||
*
|
||||
* IN assertions: cur_match is the head of the hash chain for the current
|
||||
* string (strstart) and its distance is <= MAX_DIST, and prev_length >=1
|
||||
* OUT assertion: the match length is not greater than s->lookahead
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "deflate.h"
|
||||
|
||||
#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
|
||||
|
||||
/* ARM 32-bit clang/gcc builds perform better, on average, with std2. Both gcc and clang and define __GNUC__. */
|
||||
# if defined(__GNUC__) && defined(__arm__) && !defined(__aarch64__)
|
||||
# define std2_longest_match
|
||||
/* Only use std3_longest_match for little_endian systems, also avoid using it with
|
||||
non-gcc compilers since the __builtin_ctzl() function might not be optimized. */
|
||||
# elif(defined(__GNUC__) && defined(HAVE_BUILTIN_CTZL) && ((__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
|
||||
|| defined(__LITTLE_ENDIAN__)))
|
||||
# define std3_longest_match
|
||||
# elif(defined(_MSC_VER) && defined(_WIN32))
|
||||
# define std3_longest_match
|
||||
# else
|
||||
# define std2_longest_match
|
||||
# endif
|
||||
|
||||
#else
|
||||
# define std1_longest_match
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
# if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64)
|
||||
# include "arch/x86/ctzl.h"
|
||||
# elif defined(_M_ARM)
|
||||
# include "arch/arm/ctzl.h"
|
||||
# endif
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#ifdef std1_longest_match
|
||||
|
||||
/*
|
||||
* Standard longest_match
|
||||
*
|
||||
*/
|
||||
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
|
||||
const unsigned wmask = s->w_mask;
|
||||
const Pos *prev = s->prev;
|
||||
|
||||
unsigned chain_length;
|
||||
IPos limit;
|
||||
unsigned int len, best_len, nice_match;
|
||||
unsigned char *scan, *match, *strend, scan_end, scan_end1;
|
||||
|
||||
/*
|
||||
* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
|
||||
* of 16. It is easy to get rid of this optimization if necessary.
|
||||
*/
|
||||
Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
|
||||
|
||||
/*
|
||||
* Do not waste too much time if we already have a good match
|
||||
*/
|
||||
best_len = s->prev_length;
|
||||
chain_length = s->max_chain_length;
|
||||
if (best_len >= s->good_match)
|
||||
chain_length >>= 2;
|
||||
|
||||
/*
|
||||
* Do not looks for matches beyond the end of the input. This is
|
||||
* necessary to make deflate deterministic
|
||||
*/
|
||||
nice_match = (unsigned int)s->nice_match > s->lookahead ? s->lookahead : (unsigned int)s->nice_match;
|
||||
|
||||
/*
|
||||
* Stop when cur_match becomes <= limit. To simplify the code,
|
||||
* we prevent matches with the string of window index 0
|
||||
*/
|
||||
limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;
|
||||
|
||||
scan = s->window + s->strstart;
|
||||
strend = s->window + s->strstart + MAX_MATCH;
|
||||
scan_end1 = scan[best_len-1];
|
||||
scan_end = scan[best_len];
|
||||
|
||||
Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead");
|
||||
do {
|
||||
if (cur_match >= s->strstart) {
|
||||
break;
|
||||
}
|
||||
match = s->window + cur_match;
|
||||
|
||||
/*
|
||||
* Skip to next match if the match length cannot increase
|
||||
* or if the match length is less than 2. Note that the checks
|
||||
* below for insufficient lookahead only occur occasionally
|
||||
* for performance reasons. Therefore uninitialized memory
|
||||
* will be accessed and conditional jumps will be made that
|
||||
* depend on those values. However the length of the match
|
||||
* is limited to the lookahead, so the output of deflate is not
|
||||
* affected by the uninitialized values.
|
||||
*/
|
||||
if (match[best_len] != scan_end ||
|
||||
match[best_len-1] != scan_end1 ||
|
||||
*match != *scan ||
|
||||
*++match != scan[1])
|
||||
continue;
|
||||
|
||||
/*
|
||||
* The check at best_len-1 can be removed because it will
|
||||
* be made again later. (This heuristic is not always a win.)
|
||||
* It is not necessary to compare scan[2] and match[2] since
|
||||
* they are always equal when the other bytes match, given
|
||||
* that the hash keys are equal and that HASH_BITS >= 8.
|
||||
*/
|
||||
scan += 2;
|
||||
match++;
|
||||
Assert(*scan == *match, "match[2]?");
|
||||
|
||||
/*
|
||||
* We check for insufficient lookahead only every 8th
|
||||
* comparision; the 256th check will be made at strstart + 258.
|
||||
*/
|
||||
do {
|
||||
} while (*++scan == *++match && *++scan == *++match &&
|
||||
*++scan == *++match && *++scan == *++match &&
|
||||
*++scan == *++match && *++scan == *++match &&
|
||||
*++scan == *++match && *++scan == *++match &&
|
||||
scan < strend);
|
||||
|
||||
Assert(scan <= s->window+(unsigned int)(s->window_size-1), "wild scan");
|
||||
|
||||
len = MAX_MATCH - (int)(strend - scan);
|
||||
scan = strend - MAX_MATCH;
|
||||
|
||||
if (len > best_len) {
|
||||
s->match_start = cur_match;
|
||||
best_len = len;
|
||||
if (len >= nice_match)
|
||||
break;
|
||||
scan_end1 = scan[best_len-1];
|
||||
scan_end = scan[best_len];
|
||||
} else {
|
||||
/*
|
||||
* The probability of finding a match later if we here
|
||||
* is pretty low, so for performance it's best to
|
||||
* outright stop here for the lower compression levels
|
||||
*/
|
||||
if (s->level < TRIGGER_LEVEL)
|
||||
break;
|
||||
}
|
||||
} while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length);
|
||||
|
||||
if ((unsigned int)best_len <= s->lookahead)
|
||||
return best_len;
|
||||
return s->lookahead;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef std2_longest_match
|
||||
/*
|
||||
* UNALIGNED_OK longest_match
|
||||
*
|
||||
*/
|
||||
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
|
||||
const unsigned wmask = s->w_mask;
|
||||
const Pos *prev = s->prev;
|
||||
|
||||
uint16_t scan_start, scan_end;
|
||||
unsigned chain_length;
|
||||
IPos limit;
|
||||
unsigned int len, best_len, nice_match;
|
||||
unsigned char *scan, *strend;
|
||||
|
||||
/*
|
||||
* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
|
||||
* of 16. It is easy to get rid of this optimization if necessary.
|
||||
*/
|
||||
Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
|
||||
|
||||
/*
|
||||
* Do not waste too much time if we already have a good match
|
||||
*/
|
||||
best_len = s->prev_length;
|
||||
chain_length = s->max_chain_length;
|
||||
if (best_len >= s->good_match)
|
||||
chain_length >>= 2;
|
||||
|
||||
/*
|
||||
* Do not look for matches beyond the end of the input. This is
|
||||
* necessary to make deflate deterministic
|
||||
*/
|
||||
nice_match = (unsigned int)s->nice_match > s->lookahead ? s->lookahead : s->nice_match;
|
||||
|
||||
/*
|
||||
* Stop when cur_match becomes <= limit. To simplify the code,
|
||||
* we prevent matches with the string of window index 0
|
||||
*/
|
||||
limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;
|
||||
|
||||
scan = s->window + s->strstart;
|
||||
strend = s->window + s->strstart + MAX_MATCH - 1;
|
||||
memcpy(&scan_start, scan, sizeof(scan_start));
|
||||
memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end));
|
||||
|
||||
Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead");
|
||||
do {
|
||||
unsigned char *match;
|
||||
if (cur_match >= s->strstart) {
|
||||
break;
|
||||
}
|
||||
match = s->window + cur_match;
|
||||
|
||||
/*
|
||||
* Skip to next match if the match length cannot increase
|
||||
* or if the match length is less than 2. Note that the checks
|
||||
* below for insufficient lookahead only occur occasionally
|
||||
* for performance reasons. Therefore uninitialized memory
|
||||
* will be accessed and conditional jumps will be made that
|
||||
* depend on those values. However the length of the match
|
||||
* is limited to the lookahead, so the output of deflate is not
|
||||
* affected by the uninitialized values.
|
||||
*/
|
||||
uint16_t val;
|
||||
memcpy(&val, match + best_len - 1, sizeof(val));
|
||||
if (likely(val != scan_end))
|
||||
continue;
|
||||
|
||||
memcpy(&val, match, sizeof(val));
|
||||
if (val != scan_start)
|
||||
continue;
|
||||
|
||||
/* It is not necessary to compare scan[2] and match[2] since
|
||||
* they are always equal when the other bytes match, given that
|
||||
* the hash keys are equal and that HASH_BITS >= 8. Compare 2
|
||||
* bytes at a time at strstart+3, +5, ... up to strstart+257.
|
||||
* We check for insufficient lookahead only every 4th
|
||||
* comparison; the 128th check will be made at strstart+257.
|
||||
* If MAX_MATCH-2 is not a multiple of 8, it is necessary to
|
||||
* put more guard bytes at the end of the window, or to check
|
||||
* more often for insufficient lookahead.
|
||||
*/
|
||||
Assert(scan[2] == match[2], "scan[2]?");
|
||||
scan++;
|
||||
match++;
|
||||
|
||||
do {
|
||||
uint16_t mval, sval;
|
||||
|
||||
memcpy(&mval, match, sizeof(mval));
|
||||
memcpy(&sval, scan, sizeof(sval));
|
||||
if (mval != sval)
|
||||
break;
|
||||
match += sizeof(mval);
|
||||
scan += sizeof(sval);
|
||||
|
||||
memcpy(&mval, match, sizeof(mval));
|
||||
memcpy(&sval, scan, sizeof(sval));
|
||||
if (mval != sval)
|
||||
break;
|
||||
match += sizeof(mval);
|
||||
scan += sizeof(sval);
|
||||
|
||||
memcpy(&mval, match, sizeof(mval));
|
||||
memcpy(&sval, scan, sizeof(sval));
|
||||
if (mval != sval)
|
||||
break;
|
||||
match += sizeof(mval);
|
||||
scan += sizeof(sval);
|
||||
|
||||
memcpy(&mval, match, sizeof(mval));
|
||||
memcpy(&sval, scan, sizeof(sval));
|
||||
if (mval != sval)
|
||||
break;
|
||||
match += sizeof(mval);
|
||||
scan += sizeof(sval);
|
||||
} while (scan < strend);
|
||||
|
||||
/*
|
||||
* Here, scan <= window + strstart + 257
|
||||
*/
|
||||
Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
|
||||
if (*scan == *match)
|
||||
scan++;
|
||||
|
||||
len = (MAX_MATCH -1) - (int)(strend-scan);
|
||||
scan = strend - (MAX_MATCH-1);
|
||||
|
||||
if (len > best_len) {
|
||||
s->match_start = cur_match;
|
||||
best_len = len;
|
||||
if (len >= nice_match)
|
||||
break;
|
||||
memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end));
|
||||
} else {
|
||||
/*
|
||||
* The probability of finding a match later if we here
|
||||
* is pretty low, so for performance it's best to
|
||||
* outright stop here for the lower compression levels
|
||||
*/
|
||||
if (s->level < TRIGGER_LEVEL)
|
||||
break;
|
||||
}
|
||||
} while (--chain_length && (cur_match = prev[cur_match & wmask]) > limit);
|
||||
|
||||
if ((unsigned)best_len <= s->lookahead)
|
||||
return best_len;
|
||||
return s->lookahead;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef std3_longest_match
|
||||
/* longest_match() with minor change to improve performance (in terms of
|
||||
* execution time).
|
||||
*
|
||||
* The pristine longest_match() function is sketched below (strip the
|
||||
* then-clause of the "#ifdef UNALIGNED_OK"-directive)
|
||||
*
|
||||
* ------------------------------------------------------------
|
||||
* unsigned int longest_match(...) {
|
||||
* ...
|
||||
* do {
|
||||
* match = s->window + cur_match; //s0
|
||||
* if (*(ushf*)(match+best_len-1) != scan_end || //s1
|
||||
* *(ushf*)match != scan_start) continue; //s2
|
||||
* ...
|
||||
*
|
||||
* do {
|
||||
* } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
|
||||
* *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
|
||||
* *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
|
||||
* *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
|
||||
* scan < strend); //s3
|
||||
*
|
||||
* ...
|
||||
* } while(cond); //s4
|
||||
*
|
||||
* -------------------------------------------------------------
|
||||
*
|
||||
* The change include:
|
||||
*
|
||||
* 1) The hottest statements of the function is: s0, s1 and s4. Pull them
|
||||
* together to form a new loop. The benefit is two-fold:
|
||||
*
|
||||
* o. Ease the compiler to yield good code layout: the conditional-branch
|
||||
* corresponding to s1 and its biased target s4 become very close (likely,
|
||||
* fit in the same cache-line), hence improving instruction-fetching
|
||||
* efficiency.
|
||||
*
|
||||
* o. Ease the compiler to promote "s->window" into register. "s->window"
|
||||
* is loop-invariant; it is supposed to be promoted into register and keep
|
||||
* the value throughout the entire loop. However, there are many such
|
||||
* loop-invariant, and x86-family has small register file; "s->window" is
|
||||
* likely to be chosen as register-allocation victim such that its value
|
||||
* is reloaded from memory in every single iteration. By forming a new loop,
|
||||
* "s->window" is loop-invariant of that newly created tight loop. It is
|
||||
* lot easier for compiler to promote this quantity to register and keep
|
||||
* its value throughout the entire small loop.
|
||||
*
|
||||
* 2) Transfrom s3 such that it examines sizeof(long)-byte-match at a time.
|
||||
* This is done by:
|
||||
* ------------------------------------------------
|
||||
* v1 = load from "scan" by sizeof(long) bytes
|
||||
* v2 = load from "match" by sizeof(lnog) bytes
|
||||
* v3 = v1 xor v2
|
||||
* match-bit = little-endian-machine(yes-for-x86) ?
|
||||
* count-trailing-zero(v3) :
|
||||
* count-leading-zero(v3);
|
||||
*
|
||||
* match-byte = match-bit/8
|
||||
*
|
||||
* "scan" and "match" advance if necessary
|
||||
* -------------------------------------------------
|
||||
*/
|
||||
|
||||
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
|
||||
unsigned int strstart = s->strstart;
|
||||
unsigned chain_length = s->max_chain_length;/* max hash chain length */
|
||||
unsigned char *window = s->window;
|
||||
register unsigned char *scan = window + strstart; /* current string */
|
||||
register unsigned char *match; /* matched string */
|
||||
register unsigned int len; /* length of current match */
|
||||
unsigned int best_len = s->prev_length; /* best match length so far */
|
||||
unsigned int nice_match = s->nice_match; /* stop if match long enough */
|
||||
IPos limit = strstart > (IPos)MAX_DIST(s) ?
|
||||
strstart - (IPos)MAX_DIST(s) : NIL;
|
||||
/* Stop when cur_match becomes <= limit. To simplify the code,
|
||||
* we prevent matches with the string of window index 0.
|
||||
*/
|
||||
Pos *prev = s->prev;
|
||||
unsigned int wmask = s->w_mask;
|
||||
|
||||
register unsigned char *strend = window + strstart + MAX_MATCH;
|
||||
|
||||
uint16_t scan_start, scan_end;
|
||||
|
||||
memcpy(&scan_start, scan, sizeof(scan_start));
|
||||
memcpy(&scan_end, scan+best_len-1, sizeof(scan_end));
|
||||
|
||||
/* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
|
||||
* It is easy to get rid of this optimization if necessary.
|
||||
*/
|
||||
Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
|
||||
|
||||
/* Do not waste too much time if we already have a good match: */
|
||||
if (s->prev_length >= s->good_match) {
|
||||
chain_length >>= 2;
|
||||
}
|
||||
/* Do not look for matches beyond the end of the input. This is necessary
|
||||
* to make deflate deterministic.
|
||||
*/
|
||||
if ((unsigned int)nice_match > s->lookahead) nice_match = s->lookahead;
|
||||
|
||||
Assert((unsigned long)strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
|
||||
|
||||
do {
|
||||
if (cur_match >= strstart) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* Skip to next match if the match length cannot increase
|
||||
* or if the match length is less than 2. Note that the checks below
|
||||
* for insufficient lookahead only occur occasionally for performance
|
||||
* reasons. Therefore uninitialized memory will be accessed, and
|
||||
* conditional jumps will be made that depend on those values.
|
||||
* However the length of the match is limited to the lookahead, so
|
||||
* the output of deflate is not affected by the uninitialized values.
|
||||
*/
|
||||
int cont = 1;
|
||||
do {
|
||||
match = window + cur_match;
|
||||
if (likely(memcmp(match+best_len-1, &scan_end, sizeof(scan_end)) != 0)) {
|
||||
if ((cur_match = prev[cur_match & wmask]) > limit
|
||||
&& --chain_length != 0) {
|
||||
continue;
|
||||
} else {
|
||||
cont = 0;
|
||||
}
|
||||
}
|
||||
break;
|
||||
} while (1);
|
||||
|
||||
if (!cont)
|
||||
break;
|
||||
|
||||
if (memcmp(match, &scan_start, sizeof(scan_start)) != 0)
|
||||
continue;
|
||||
|
||||
/* It is not necessary to compare scan[2] and match[2] since they are
|
||||
* always equal when the other bytes match, given that the hash keys
|
||||
* are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
|
||||
* strstart+3, +5, ... up to strstart+257. We check for insufficient
|
||||
* lookahead only every 4th comparison; the 128th check will be made
|
||||
* at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
|
||||
* necessary to put more guard bytes at the end of the window, or
|
||||
* to check more often for insufficient lookahead.
|
||||
*/
|
||||
scan += 2, match+=2;
|
||||
Assert(*scan == *match, "match[2]?");
|
||||
do {
|
||||
unsigned long sv, mv, xor;
|
||||
|
||||
memcpy(&sv, scan, sizeof(sv));
|
||||
memcpy(&mv, match, sizeof(mv));
|
||||
|
||||
xor = sv ^ mv;
|
||||
|
||||
if (xor) {
|
||||
int match_byte = __builtin_ctzl(xor) / 8;
|
||||
scan += match_byte;
|
||||
break;
|
||||
} else {
|
||||
scan += sizeof(unsigned long);
|
||||
match += sizeof(unsigned long);
|
||||
}
|
||||
} while (scan < strend);
|
||||
|
||||
if (scan > strend)
|
||||
scan = strend;
|
||||
|
||||
Assert(scan <= window + (unsigned)(s->window_size-1), "wild scan");
|
||||
|
||||
len = MAX_MATCH - (int)(strend - scan);
|
||||
scan = strend - MAX_MATCH;
|
||||
|
||||
if (len > best_len) {
|
||||
s->match_start = cur_match;
|
||||
best_len = len;
|
||||
if (len >= nice_match)
|
||||
break;
|
||||
memcpy(&scan_end, scan+best_len-1, sizeof(scan_end));
|
||||
} else {
|
||||
/*
|
||||
* The probability of finding a match later if we here
|
||||
* is pretty low, so for performance it's best to
|
||||
* outright stop here for the lower compression levels
|
||||
*/
|
||||
if (s->level < TRIGGER_LEVEL)
|
||||
break;
|
||||
}
|
||||
} while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0);
|
||||
|
||||
if ((unsigned int)best_len <= s->lookahead)
|
||||
return (unsigned int)best_len;
|
||||
return s->lookahead;
|
||||
}
|
||||
#endif
|
||||
Reference in New Issue
Block a user