04ffce137b
This commit is the same as [PATCH v6 1/2]. It adds AVX512 support for the
xbzrle_encode_buffer function to accelerate XBZRLE encoding. A runtime check
for AVX512 support and a benchmark for this feature are included. Compared
with the C version of xbzrle_encode_buffer, the AVX512 version achieves a
50%-70% performance improvement in benchmarks; when dirty data is randomly
located within a 4K page, it achieves close to a 140% performance gain.

Signed-off-by: ling xu <ling1.xu@intel.com>
Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
Co-authored-by: Jun Jin <jun.i.jin@intel.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
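The runtime AVX512 check mentioned above lives in the migration code, not in
this file. As a minimal, hypothetical sketch of how a caller might select an
encoder (the encode_page helper name is invented for illustration;
__builtin_cpu_supports is the GCC/Clang runtime CPUID probe):

#include <stdint.h>

int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                         uint8_t *dst, int dlen);
#if defined(CONFIG_AVX512BW_OPT)
int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
                                uint8_t *dst, int dlen);
#endif

/*
 * Hypothetical dispatcher: prefer the AVX512 encoder when the build
 * enabled it and the host CPU actually supports AVX512BW.
 */
static int encode_page(uint8_t *old_buf, uint8_t *new_buf, int slen,
                       uint8_t *dst, int dlen)
{
#if defined(CONFIG_AVX512BW_OPT)
    if (__builtin_cpu_supports("avx512bw")) {
        return xbzrle_encode_buffer_avx512(old_buf, new_buf, slen, dst, dlen);
    }
#endif
    return xbzrle_encode_buffer(old_buf, new_buf, slen, dst, dlen);
}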
/*
 * Xor Based Zero Run Length Encoding
 *
 * Copyright 2013 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Orit Wasserman  <owasserm@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "xbzrle.h"

/*
  page = zrun nzrun
         | zrun nzrun page

  zrun = length

  nzrun = length byte...

  length = uleb128 encoded integer
 */
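/*
 * Worked example (illustrative, not in the original source): for a page
 * where only bytes 10..12 changed, the stream is uleb128(10) for the
 * leading zero run, then uleb128(3) followed by the three new byte values.
 * The trailing zero run is skipped entirely: the encoder returns as soon
 * as it reaches the end of the page, and the decoder simply leaves the
 * rest of the destination buffer untouched.
 */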

int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                         uint8_t *dst, int dlen)
{
    uint32_t zrun_len = 0, nzrun_len = 0;
    int d = 0, i = 0;
    long res;
    uint8_t *nzrun_start = NULL;

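    /*
     * The word-at-a-time fast paths below read both buffers in
     * sizeof(long) chunks, so the buffer addresses and the length
     * must be multiples of sizeof(long).
     */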
    g_assert(!(((uintptr_t)old_buf | (uintptr_t)new_buf | slen) %
               sizeof(long)));

    while (i < slen) {
        /* overflow */
        if (d + 2 > dlen) {
            return -1;
        }

        /* not aligned to sizeof(long) */
        res = (slen - i) % sizeof(long);
        while (res && old_buf[i] == new_buf[i]) {
            zrun_len++;
            i++;
            res--;
        }

        /* word at a time for speed */
        if (!res) {
            while (i < slen &&
                   (*(long *)(old_buf + i)) == (*(long *)(new_buf + i))) {
                i += sizeof(long);
                zrun_len += sizeof(long);
            }

            /* go over the rest */
            while (i < slen && old_buf[i] == new_buf[i]) {
                zrun_len++;
                i++;
            }
        }

        /* buffer unchanged */
        if (zrun_len == slen) {
            return 0;
        }

        /* skip last zero run */
        if (i == slen) {
            return d;
        }

        d += uleb128_encode_small(dst + d, zrun_len);

        zrun_len = 0;
        nzrun_start = new_buf + i;

        /* overflow */
        if (d + 2 > dlen) {
            return -1;
        }
        /* not aligned to sizeof(long) */
        res = (slen - i) % sizeof(long);
        while (res && old_buf[i] != new_buf[i]) {
            i++;
            nzrun_len++;
            res--;
        }

        /* word at a time for speed, use of 32-bit long okay */
        if (!res) {
            /* truncation to 32-bit long okay */
            unsigned long mask = (unsigned long)0x0101010101010101ULL;
            while (i < slen) {
                unsigned long xor;
                xor = *(unsigned long *)(old_buf + i)
                    ^ *(unsigned long *)(new_buf + i);
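                /*
                 * Classic zero-byte detection: with mask = 0x01 repeated
                 * in every byte, (xor - mask) & ~xor & (mask << 7) is
                 * nonzero exactly when some byte of xor is zero, i.e.
                 * old and new agree at some byte within this long.
                 */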
                if ((xor - mask) & ~xor & (mask << 7)) {
                    /* found the end of an nzrun within the current long */
                    while (old_buf[i] != new_buf[i]) {
                        nzrun_len++;
                        i++;
                    }
                    break;
                } else {
                    i += sizeof(long);
                    nzrun_len += sizeof(long);
                }
            }
        }

        d += uleb128_encode_small(dst + d, nzrun_len);
        /* overflow */
        if (d + nzrun_len > dlen) {
            return -1;
        }
        memcpy(dst + d, nzrun_start, nzrun_len);
        d += nzrun_len;
        nzrun_len = 0;
    }

    return d;
}

int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
{
    int i = 0, d = 0;
    int ret;
    uint32_t count = 0;

    while (i < slen) {

        /* zrun */
        if ((slen - i) < 2) {
            return -1;
        }

        ret = uleb128_decode_small(src + i, &count);
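        /*
         * A zero-length zrun is only valid as the very first token, for
         * a page whose first byte already differs; anywhere else it
         * would mean two back-to-back nzruns, which the encoder never
         * emits.
         */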
        if (ret < 0 || (i && !count)) {
            return -1;
        }
        i += ret;
        d += count;

        /* overflow */
        if (d > dlen) {
            return -1;
        }

        /* nzrun */
        if ((slen - i) < 2) {
            return -1;
        }

        ret = uleb128_decode_small(src + i, &count);
        if (ret < 0 || !count) {
            return -1;
        }
        i += ret;

        /* overflow */
        if (d + count > dlen || i + count > slen) {
            return -1;
        }

        memcpy(dst + d, src + i, count);
        d += count;
        i += count;
    }

    return d;
}

#if defined(CONFIG_AVX512BW_OPT)
#pragma GCC push_options
#pragma GCC target("avx512bw")
#include <immintrin.h>
int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
                                uint8_t *dst, int dlen)
{
    uint32_t zrun_len = 0, nzrun_len = 0;
    int d = 0, i = 0, num = 0;
    uint8_t *nzrun_start = NULL;
    /* add 1 to include the residual part in the main loop */
    uint32_t count512s = (slen >> 6) + 1;
    /* count_residual is the tail of the data, i.e. count_residual = slen % 64 */
    uint32_t count_residual = slen & 0b111111;
    bool never_same = true;
    uint64_t mask_residual = 1;
    mask_residual <<= count_residual;
    mask_residual -= 1;
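    /*
     * mask_residual now has its low count_residual bits set; it selects
     * just the tail bytes in the masked loads below.
     */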
    __m512i r = _mm512_set1_epi32(0);

    while (count512s) {
        if (d + 2 > dlen) {
            return -1;
        }

        int bytes_to_check = 64;
        uint64_t mask = 0xffffffffffffffff;
        if (count512s == 1) {
            bytes_to_check = count_residual;
            mask = mask_residual;
        }
        __m512i old_data = _mm512_mask_loadu_epi8(r,
                                                  mask, old_buf + i);
        __m512i new_data = _mm512_mask_loadu_epi8(r,
                                                  mask, new_buf + i);
        uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
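        /*
         * comp holds one bit per byte of the 64-byte chunk; bit k is
         * set when old and new agree at byte k. For the residual chunk
         * the masked loads fill the lanes past the tail with zeros from
         * r, so those lanes compare equal; bytes_to_check clamps the
         * run lengths below so they are never counted.
         */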
        count512s--;

        bool is_same = (comp & 0x1);
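        /*
         * The loop below alternates between consuming a run of equal
         * bytes (is_same) and a run of differing bytes, shifting comp
         * right as bytes are accounted for, until all bytes_to_check
         * of this chunk are classified.
         */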
        while (bytes_to_check) {
            if (is_same) {
                if (nzrun_len) {
                    d += uleb128_encode_small(dst + d, nzrun_len);
                    if (d + nzrun_len > dlen) {
                        return -1;
                    }
                    nzrun_start = new_buf + i - nzrun_len;
                    memcpy(dst + d, nzrun_start, nzrun_len);
                    d += nzrun_len;
                    nzrun_len = 0;
                }
                /* 64 bytes at a time for speed */
                if (count512s && (comp == 0xffffffffffffffff)) {
                    i += 64;
                    zrun_len += 64;
                    break;
                }
                never_same = false;
                num = __builtin_ctzll(~comp);
                num = (num < bytes_to_check) ? num : bytes_to_check;
                zrun_len += num;
                bytes_to_check -= num;
                comp >>= num;
                i += num;
                if (bytes_to_check) {
                    /* still has different data after same data */
                    d += uleb128_encode_small(dst + d, zrun_len);
                    zrun_len = 0;
                } else {
                    break;
                }
            }
            if (never_same || zrun_len) {
                /*
                 * never_same only acts if the data begins with a diff
                 * in the first count512s chunk
                 */
                d += uleb128_encode_small(dst + d, zrun_len);
                zrun_len = 0;
                never_same = false;
            }
            /* has diff, 64 bytes at a time for speed */
            if ((bytes_to_check == 64) && (comp == 0x0)) {
                i += 64;
                nzrun_len += 64;
                break;
            }
            num = __builtin_ctzll(comp);
            num = (num < bytes_to_check) ? num : bytes_to_check;
            nzrun_len += num;
            bytes_to_check -= num;
            comp >>= num;
            i += num;
            if (bytes_to_check) {
                /* comp looked like 111000: the nzrun ends, equal bytes follow */
                d += uleb128_encode_small(dst + d, nzrun_len);
                /* overflow */
                if (d + nzrun_len > dlen) {
                    return -1;
                }
                nzrun_start = new_buf + i - nzrun_len;
                memcpy(dst + d, nzrun_start, nzrun_len);
                d += nzrun_len;
                nzrun_len = 0;
                is_same = true;
            }
        }
    }

    if (nzrun_len != 0) {
        d += uleb128_encode_small(dst + d, nzrun_len);
        /* overflow */
        if (d + nzrun_len > dlen) {
            return -1;
        }
        nzrun_start = new_buf + i - nzrun_len;
        memcpy(dst + d, nzrun_start, nzrun_len);
        d += nzrun_len;
    }
    return d;
}
#pragma GCC pop_options
#endif