Update libwebp to 0.6.1
* lossless performance and compression improvements + a new 'cruncher' mode (-m 6 -q 100) * ARM performance improvements with clang (15-20% w/ndk r15c) * webp-js: emscripten/webassembly based javascript decoder * miscellaneous bug & build fixes
This commit is contained in:
parent
64d104756c
commit
043103fe6a
|
@ -26,9 +26,6 @@ if env['builtin_libwebp']:
|
|||
"dsp/alpha_processing_neon.c",
|
||||
"dsp/alpha_processing_sse2.c",
|
||||
"dsp/alpha_processing_sse41.c",
|
||||
"dsp/argb.c",
|
||||
"dsp/argb_mips_dsp_r2.c",
|
||||
"dsp/argb_sse2.c",
|
||||
"dsp/cost.c",
|
||||
"dsp/cost_mips32.c",
|
||||
"dsp/cost_mips_dsp_r2.c",
|
||||
|
@ -36,6 +33,9 @@ if env['builtin_libwebp']:
|
|||
"dsp/cpu.c",
|
||||
"dsp/dec.c",
|
||||
"dsp/dec_clip_tables.c",
|
||||
"dsp/ssim.c",
|
||||
"dsp/ssim_sse2.c",
|
||||
"dsp/yuv_neon.c",
|
||||
"dsp/dec_mips32.c",
|
||||
"dsp/dec_mips_dsp_r2.c",
|
||||
"dsp/dec_msa.c",
|
||||
|
@ -84,6 +84,7 @@ if env['builtin_libwebp']:
|
|||
"dsp/yuv_sse2.c",
|
||||
"enc/alpha_enc.c",
|
||||
"enc/analysis_enc.c",
|
||||
"enc/backward_references_cost_enc.c",
|
||||
"enc/backward_references_enc.c",
|
||||
"enc/config_enc.c",
|
||||
"enc/cost_enc.c",
|
||||
|
@ -122,10 +123,10 @@ if env['builtin_libwebp']:
|
|||
"utils/thread_utils.c",
|
||||
"utils/utils.c",
|
||||
]
|
||||
thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
|
||||
thirdparty_sources = [thirdparty_dir + "src/" + file for file in thirdparty_sources]
|
||||
|
||||
env_webp.add_source_files(env.modules_sources, thirdparty_sources)
|
||||
env_webp.Append(CPPPATH=[thirdparty_dir])
|
||||
env_webp.Append(CPPPATH=[thirdparty_dir, thirdparty_dir + "src/"])
|
||||
|
||||
# Godot source files
|
||||
env_webp.add_source_files(env.modules_sources, "*.cpp")
|
||||
|
|
|
@ -183,7 +183,7 @@ TODO.
|
|||
## libwebp
|
||||
|
||||
- Upstream: https://chromium.googlesource.com/webm/libwebp/
|
||||
- Version: 0.6.0
|
||||
- Version: 0.6.1
|
||||
- License: BSD-3-Clause
|
||||
|
||||
Files extracted from upstream source:
|
||||
|
|
|
@ -1,68 +0,0 @@
|
|||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// ARGB making functions.
|
||||
//
|
||||
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
|
||||
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
|
||||
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
|
||||
}
|
||||
|
||||
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
|
||||
const uint8_t* b, int len, uint32_t* out) {
|
||||
int i;
|
||||
for (i = 0; i < len; ++i) {
|
||||
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
|
||||
int len, int step, uint32_t* out) {
|
||||
int i, offset = 0;
|
||||
for (i = 0; i < len; ++i) {
|
||||
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
|
||||
offset += step;
|
||||
}
|
||||
}
|
||||
|
||||
void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
|
||||
const uint8_t*, int, uint32_t*);
|
||||
void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
|
||||
int, int, uint32_t*);
|
||||
|
||||
extern void VP8EncDspARGBInitMIPSdspR2(void);
|
||||
extern void VP8EncDspARGBInitSSE2(void);
|
||||
|
||||
static volatile VP8CPUInfo argb_last_cpuinfo_used =
|
||||
(VP8CPUInfo)&argb_last_cpuinfo_used;
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
|
||||
if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
|
||||
|
||||
VP8PackARGB = PackARGB;
|
||||
VP8PackRGB = PackRGB;
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
VP8EncDspARGBInitSSE2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
if (VP8GetCPUInfo(kMIPSdspR2)) {
|
||||
VP8EncDspARGBInitMIPSdspR2();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
argb_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
}
|
|
@ -1,110 +0,0 @@
|
|||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// ARGB making functions (mips version).
|
||||
//
|
||||
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
|
||||
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
|
||||
const uint8_t* b, int len, uint32_t* out) {
|
||||
int temp0, temp1, temp2, temp3, offset;
|
||||
const int rest = len & 1;
|
||||
const uint32_t* const loop_end = out + len - rest;
|
||||
const int step = 4;
|
||||
__asm__ volatile (
|
||||
"xor %[offset], %[offset], %[offset] \n\t"
|
||||
"beq %[loop_end], %[out], 0f \n\t"
|
||||
"2: \n\t"
|
||||
"lbux %[temp0], %[offset](%[a]) \n\t"
|
||||
"lbux %[temp1], %[offset](%[r]) \n\t"
|
||||
"lbux %[temp2], %[offset](%[g]) \n\t"
|
||||
"lbux %[temp3], %[offset](%[b]) \n\t"
|
||||
"ins %[temp1], %[temp0], 16, 16 \n\t"
|
||||
"ins %[temp3], %[temp2], 16, 16 \n\t"
|
||||
"addiu %[out], %[out], 4 \n\t"
|
||||
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
|
||||
"sw %[temp0], -4(%[out]) \n\t"
|
||||
"addu %[offset], %[offset], %[step] \n\t"
|
||||
"bne %[loop_end], %[out], 2b \n\t"
|
||||
"0: \n\t"
|
||||
"beq %[rest], $zero, 1f \n\t"
|
||||
"lbux %[temp0], %[offset](%[a]) \n\t"
|
||||
"lbux %[temp1], %[offset](%[r]) \n\t"
|
||||
"lbux %[temp2], %[offset](%[g]) \n\t"
|
||||
"lbux %[temp3], %[offset](%[b]) \n\t"
|
||||
"ins %[temp1], %[temp0], 16, 16 \n\t"
|
||||
"ins %[temp3], %[temp2], 16, 16 \n\t"
|
||||
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
|
||||
"sw %[temp0], 0(%[out]) \n\t"
|
||||
"1: \n\t"
|
||||
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
|
||||
[temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
|
||||
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
|
||||
[loop_end]"r"(loop_end), [rest]"r"(rest)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
|
||||
int len, int step, uint32_t* out) {
|
||||
int temp0, temp1, temp2, offset;
|
||||
const int rest = len & 1;
|
||||
const int a = 0xff;
|
||||
const uint32_t* const loop_end = out + len - rest;
|
||||
__asm__ volatile (
|
||||
"xor %[offset], %[offset], %[offset] \n\t"
|
||||
"beq %[loop_end], %[out], 0f \n\t"
|
||||
"2: \n\t"
|
||||
"lbux %[temp0], %[offset](%[r]) \n\t"
|
||||
"lbux %[temp1], %[offset](%[g]) \n\t"
|
||||
"lbux %[temp2], %[offset](%[b]) \n\t"
|
||||
"ins %[temp0], %[a], 16, 16 \n\t"
|
||||
"ins %[temp2], %[temp1], 16, 16 \n\t"
|
||||
"addiu %[out], %[out], 4 \n\t"
|
||||
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
|
||||
"sw %[temp0], -4(%[out]) \n\t"
|
||||
"addu %[offset], %[offset], %[step] \n\t"
|
||||
"bne %[loop_end], %[out], 2b \n\t"
|
||||
"0: \n\t"
|
||||
"beq %[rest], $zero, 1f \n\t"
|
||||
"lbux %[temp0], %[offset](%[r]) \n\t"
|
||||
"lbux %[temp1], %[offset](%[g]) \n\t"
|
||||
"lbux %[temp2], %[offset](%[b]) \n\t"
|
||||
"ins %[temp0], %[a], 16, 16 \n\t"
|
||||
"ins %[temp2], %[temp1], 16, 16 \n\t"
|
||||
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
|
||||
"sw %[temp0], 0(%[out]) \n\t"
|
||||
"1: \n\t"
|
||||
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
|
||||
[offset]"=&r"(offset), [out]"+&r"(out)
|
||||
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
|
||||
[loop_end]"r"(loop_end), [rest]"r"(rest)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entry point
|
||||
|
||||
extern void VP8EncDspARGBInitMIPSdspR2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
|
||||
VP8PackARGB = PackARGB;
|
||||
VP8PackRGB = PackRGB;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS_DSP_R2
|
||||
|
||||
WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
|
||||
|
||||
#endif // WEBP_USE_MIPS_DSP_R2
|
|
@ -1,67 +0,0 @@
|
|||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// ARGB making functions (SSE2 version).
|
||||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
||||
#include <assert.h>
|
||||
#include <emmintrin.h>
|
||||
#include <string.h>
|
||||
|
||||
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
|
||||
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
|
||||
}
|
||||
|
||||
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
|
||||
const uint8_t* b, int len, uint32_t* out) {
|
||||
if (g == r + 1) { // RGBA input order. Need to swap R and B.
|
||||
int i = 0;
|
||||
const int len_max = len & ~3; // max length processed in main loop
|
||||
const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
|
||||
assert(b == r + 2);
|
||||
assert(a == r + 3);
|
||||
for (; i < len_max; i += 4) {
|
||||
const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
|
||||
const __m128i B = _mm_and_si128(A, red_blue_mask); // R 0 B 0
|
||||
const __m128i C = _mm_andnot_si128(red_blue_mask, A); // 0 G 0 A
|
||||
const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
const __m128i F = _mm_or_si128(E, C);
|
||||
_mm_storeu_si128((__m128i*)(out + i), F);
|
||||
}
|
||||
for (; i < len; ++i) {
|
||||
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
|
||||
}
|
||||
} else {
|
||||
assert(g == b + 1);
|
||||
assert(r == b + 2);
|
||||
assert(a == b + 3);
|
||||
memcpy(out, b, len * 4);
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entry point
|
||||
|
||||
extern void VP8EncDspARGBInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
|
||||
VP8PackARGB = PackARGB;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE2
|
||||
|
||||
WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
|
||||
|
||||
#endif // WEBP_USE_SSE2
|
File diff suppressed because it is too large
Load Diff
|
@ -12,13 +12,13 @@
|
|||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "./alphai_dec.h"
|
||||
#include "./vp8i_dec.h"
|
||||
#include "./vp8li_dec.h"
|
||||
#include "../dsp/dsp.h"
|
||||
#include "../utils/quant_levels_dec_utils.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "../webp/format_constants.h"
|
||||
#include "src/dec/alphai_dec.h"
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
#include "src/dec/vp8li_dec.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/utils/quant_levels_dec_utils.h"
|
||||
#include "src/utils/utils.h"
|
||||
#include "src/webp/format_constants.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// ALPHDecoder object.
|
|
@ -11,11 +11,11 @@
|
|||
//
|
||||
// Author: Urvang (urvang@google.com)
|
||||
|
||||
#ifndef WEBP_DEC_ALPHAI_H_
|
||||
#define WEBP_DEC_ALPHAI_H_
|
||||
#ifndef WEBP_DEC_ALPHAI_DEC_H_
|
||||
#define WEBP_DEC_ALPHAI_DEC_H_
|
||||
|
||||
#include "./webpi_dec.h"
|
||||
#include "../utils/filters_utils.h"
|
||||
#include "src/dec/webpi_dec.h"
|
||||
#include "src/utils/filters_utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -51,4 +51,4 @@ void WebPDeallocateAlphaMemory(VP8Decoder* const dec);
|
|||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif /* WEBP_DEC_ALPHAI_H_ */
|
||||
#endif /* WEBP_DEC_ALPHAI_DEC_H_ */
|
|
@ -13,15 +13,15 @@
|
|||
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "./vp8i_dec.h"
|
||||
#include "./webpi_dec.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
#include "src/dec/webpi_dec.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// WebPDecBuffer
|
||||
|
||||
// Number of bytes per pixel for the different color-spaces.
|
||||
static const int kModeBpp[MODE_LAST] = {
|
||||
static const uint8_t kModeBpp[MODE_LAST] = {
|
||||
3, 4, 3, 4, 4, 2, 2,
|
||||
4, 4, 4, 2, // pre-multiplied modes
|
||||
1, 1 };
|
||||
|
@ -36,7 +36,7 @@ static int IsValidColorspace(int webp_csp_mode) {
|
|||
// strictly speaking, the very last (or first, if flipped) row
|
||||
// doesn't require padding.
|
||||
#define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE) \
|
||||
(uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH)
|
||||
((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))
|
||||
|
||||
static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
|
||||
int ok = 1;
|
||||
|
@ -98,9 +98,14 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
|
|||
uint64_t uv_size = 0, a_size = 0, total_size;
|
||||
// We need memory and it hasn't been allocated yet.
|
||||
// => initialize output buffer, now that dimensions are known.
|
||||
const int stride = w * kModeBpp[mode];
|
||||
const uint64_t size = (uint64_t)stride * h;
|
||||
int stride;
|
||||
uint64_t size;
|
||||
|
||||
if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
|
||||
return VP8_STATUS_INVALID_PARAM;
|
||||
}
|
||||
stride = w * kModeBpp[mode];
|
||||
size = (uint64_t)stride * h;
|
||||
if (!WebPIsRGBMode(mode)) {
|
||||
uv_stride = (w + 1) / 2;
|
||||
uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
|
||||
|
@ -169,11 +174,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
|
|||
return VP8_STATUS_OK;
|
||||
}
|
||||
|
||||
VP8StatusCode WebPAllocateDecBuffer(int w, int h,
|
||||
VP8StatusCode WebPAllocateDecBuffer(int width, int height,
|
||||
const WebPDecoderOptions* const options,
|
||||
WebPDecBuffer* const out) {
|
||||
WebPDecBuffer* const buffer) {
|
||||
VP8StatusCode status;
|
||||
if (out == NULL || w <= 0 || h <= 0) {
|
||||
if (buffer == NULL || width <= 0 || height <= 0) {
|
||||
return VP8_STATUS_INVALID_PARAM;
|
||||
}
|
||||
if (options != NULL) { // First, apply options if there is any.
|
||||
|
@ -182,33 +187,39 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
|
|||
const int ch = options->crop_height;
|
||||
const int x = options->crop_left & ~1;
|
||||
const int y = options->crop_top & ~1;
|
||||
if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) {
|
||||
if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
|
||||
x + cw > width || y + ch > height) {
|
||||
return VP8_STATUS_INVALID_PARAM; // out of frame boundary.
|
||||
}
|
||||
w = cw;
|
||||
h = ch;
|
||||
width = cw;
|
||||
height = ch;
|
||||
}
|
||||
|
||||
if (options->use_scaling) {
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
int scaled_width = options->scaled_width;
|
||||
int scaled_height = options->scaled_height;
|
||||
if (!WebPRescalerGetScaledDimensions(
|
||||
w, h, &scaled_width, &scaled_height)) {
|
||||
width, height, &scaled_width, &scaled_height)) {
|
||||
return VP8_STATUS_INVALID_PARAM;
|
||||
}
|
||||
w = scaled_width;
|
||||
h = scaled_height;
|
||||
width = scaled_width;
|
||||
height = scaled_height;
|
||||
#else
|
||||
return VP8_STATUS_INVALID_PARAM; // rescaling not supported
|
||||
#endif
|
||||
}
|
||||
}
|
||||
out->width = w;
|
||||
out->height = h;
|
||||
buffer->width = width;
|
||||
buffer->height = height;
|
||||
|
||||
// Then, allocate buffer for real.
|
||||
status = AllocateBuffer(out);
|
||||
status = AllocateBuffer(buffer);
|
||||
if (status != VP8_STATUS_OK) return status;
|
||||
|
||||
// Use the stride trick if vertical flip is needed.
|
||||
if (options != NULL && options->flip) {
|
||||
status = WebPFlipBuffer(out);
|
||||
status = WebPFlipBuffer(buffer);
|
||||
}
|
||||
return status;
|
||||
}
|
|
@ -11,8 +11,8 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#ifndef WEBP_DEC_COMMON_H_
|
||||
#define WEBP_DEC_COMMON_H_
|
||||
#ifndef WEBP_DEC_COMMON_DEC_H_
|
||||
#define WEBP_DEC_COMMON_DEC_H_
|
||||
|
||||
// intra prediction modes
|
||||
enum { B_DC_PRED = 0, // 4x4 modes
|
||||
|
@ -51,4 +51,4 @@ enum { MB_FEATURE_TREE_PROBS = 3,
|
|||
NUM_PROBAS = 11
|
||||
};
|
||||
|
||||
#endif // WEBP_DEC_COMMON_H_
|
||||
#endif // WEBP_DEC_COMMON_DEC_H_
|
|
@ -12,13 +12,13 @@
|
|||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "./vp8i_dec.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Main reconstruction function.
|
||||
|
||||
static const int kScan[16] = {
|
||||
static const uint16_t kScan[16] = {
|
||||
0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
|
||||
0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
|
||||
0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
|
||||
|
@ -320,7 +320,7 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
|
|||
#define MIN_DITHER_AMP 4
|
||||
|
||||
#define DITHER_AMP_TAB_SIZE 12
|
||||
static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
|
||||
static const uint8_t kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
|
||||
// roughly, it's dqm->uv_mat_[1]
|
||||
8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
|
||||
};
|
||||
|
@ -728,7 +728,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
|
|||
}
|
||||
|
||||
mem = (uint8_t*)dec->mem_;
|
||||
dec->intra_t_ = (uint8_t*)mem;
|
||||
dec->intra_t_ = mem;
|
||||
mem += intra_pred_mode_size;
|
||||
|
||||
dec->yuv_t_ = (VP8TopSamples*)mem;
|
||||
|
@ -750,7 +750,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
|
|||
|
||||
mem = (uint8_t*)WEBP_ALIGN(mem);
|
||||
assert((yuv_size & WEBP_ALIGN_CST) == 0);
|
||||
dec->yuv_b_ = (uint8_t*)mem;
|
||||
dec->yuv_b_ = mem;
|
||||
mem += yuv_size;
|
||||
|
||||
dec->mb_data_ = (VP8MBData*)mem;
|
||||
|
@ -766,7 +766,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
|
|||
const int extra_rows = kFilterExtraRows[dec->filter_type_];
|
||||
const int extra_y = extra_rows * dec->cache_y_stride_;
|
||||
const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
|
||||
dec->cache_y_ = ((uint8_t*)mem) + extra_y;
|
||||
dec->cache_y_ = mem + extra_y;
|
||||
dec->cache_u_ = dec->cache_y_
|
||||
+ 16 * num_caches * dec->cache_y_stride_ + extra_uv;
|
||||
dec->cache_v_ = dec->cache_u_
|
||||
|
@ -776,7 +776,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
|
|||
mem += cache_size;
|
||||
|
||||
// alpha plane
|
||||
dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
|
||||
dec->alpha_plane_ = alpha_size ? mem : NULL;
|
||||
mem += alpha_size;
|
||||
assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
|
||||
|
|
@ -15,10 +15,10 @@
|
|||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "./alphai_dec.h"
|
||||
#include "./webpi_dec.h"
|
||||
#include "./vp8i_dec.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/dec/alphai_dec.h"
|
||||
#include "src/dec/webpi_dec.h"
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
// In append mode, buffer allocations increase as multiples of this value.
|
||||
// Needs to be a power of 2.
|
||||
|
@ -673,12 +673,12 @@ void WebPIDelete(WebPIDecoder* idec) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Wrapper toward WebPINewDecoder
|
||||
|
||||
WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
|
||||
WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE csp, uint8_t* output_buffer,
|
||||
size_t output_buffer_size, int output_stride) {
|
||||
const int is_external_memory = (output_buffer != NULL) ? 1 : 0;
|
||||
WebPIDecoder* idec;
|
||||
|
||||
if (mode >= MODE_YUV) return NULL;
|
||||
if (csp >= MODE_YUV) return NULL;
|
||||
if (is_external_memory == 0) { // Overwrite parameters to sane values.
|
||||
output_buffer_size = 0;
|
||||
output_stride = 0;
|
||||
|
@ -689,7 +689,7 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
|
|||
}
|
||||
idec = WebPINewDecoder(NULL);
|
||||
if (idec == NULL) return NULL;
|
||||
idec->output_.colorspace = mode;
|
||||
idec->output_.colorspace = csp;
|
||||
idec->output_.is_external_memory = is_external_memory;
|
||||
idec->output_.u.RGBA.rgba = output_buffer;
|
||||
idec->output_.u.RGBA.stride = output_stride;
|
|
@ -13,11 +13,11 @@
|
|||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include "../dec/vp8i_dec.h"
|
||||
#include "./webpi_dec.h"
|
||||
#include "../dsp/dsp.h"
|
||||
#include "../dsp/yuv.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
#include "src/dec/webpi_dec.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/dsp/yuv.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Main YUV<->RGB conversion functions
|
||||
|
@ -212,7 +212,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
|
|||
int num_rows;
|
||||
const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
|
||||
uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
uint8_t* alpha_dst = base_rgba;
|
||||
#else
|
||||
uint8_t* alpha_dst = base_rgba + 1;
|
||||
|
@ -241,6 +241,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
|
|||
//------------------------------------------------------------------------------
|
||||
// YUV rescaling (no final RGB conversion needed)
|
||||
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
static int Rescale(const uint8_t* src, int src_stride,
|
||||
int new_lines, WebPRescaler* const wrk) {
|
||||
int num_lines_out = 0;
|
||||
|
@ -431,7 +432,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
|
|||
int max_lines_out) {
|
||||
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
|
||||
uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
uint8_t* alpha_dst = base_rgba;
|
||||
#else
|
||||
uint8_t* alpha_dst = base_rgba + 1;
|
||||
|
@ -541,6 +542,8 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
#endif // WEBP_REDUCE_SIZE
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Default custom functions
|
||||
|
||||
|
@ -561,10 +564,14 @@ static int CustomSetup(VP8Io* io) {
|
|||
WebPInitUpsamplers();
|
||||
}
|
||||
if (io->use_scaling) {
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
|
||||
if (!ok) {
|
||||
return 0; // memory error
|
||||
}
|
||||
#else
|
||||
return 0; // rescaling support not compiled
|
||||
#endif
|
||||
} else {
|
||||
if (is_rgb) {
|
||||
WebPInitSamplers();
|
||||
|
@ -598,9 +605,6 @@ static int CustomSetup(VP8Io* io) {
|
|||
}
|
||||
}
|
||||
|
||||
if (is_rgb) {
|
||||
VP8YUVInit();
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./vp8i_dec.h"
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
|
||||
static WEBP_INLINE int clip(int v, int M) {
|
||||
return v < 0 ? 0 : v > M ? M : v;
|
|
@ -11,15 +11,19 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./vp8i_dec.h"
|
||||
#include "../utils/bit_reader_inl_utils.h"
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
#include "src/utils/bit_reader_inl_utils.h"
|
||||
|
||||
#if !defined(USE_GENERIC_TREE)
|
||||
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
|
||||
// using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
|
||||
#define USE_GENERIC_TREE
|
||||
#define USE_GENERIC_TREE 1 // ALTERNATE_CODE
|
||||
#else
|
||||
#define USE_GENERIC_TREE 0
|
||||
#endif
|
||||
#endif // USE_GENERIC_TREE
|
||||
|
||||
#ifdef USE_GENERIC_TREE
|
||||
#if (USE_GENERIC_TREE == 1)
|
||||
static const int8_t kYModesIntra4[18] = {
|
||||
-B_DC_PRED, 1,
|
||||
-B_TM_PRED, 2,
|
||||
|
@ -317,7 +321,7 @@ static void ParseIntraMode(VP8BitReader* const br,
|
|||
int x;
|
||||
for (x = 0; x < 4; ++x) {
|
||||
const uint8_t* const prob = kBModesProba[top[x]][ymode];
|
||||
#ifdef USE_GENERIC_TREE
|
||||
#if (USE_GENERIC_TREE == 1)
|
||||
// Generic tree-parsing
|
||||
int i = kYModesIntra4[VP8GetBit(br, prob[0])];
|
||||
while (i > 0) {
|
||||
|
@ -335,7 +339,7 @@ static void ParseIntraMode(VP8BitReader* const br,
|
|||
(!VP8GetBit(br, prob[6]) ? B_LD_PRED :
|
||||
(!VP8GetBit(br, prob[7]) ? B_VL_PRED :
|
||||
(!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
|
||||
#endif // USE_GENERIC_TREE
|
||||
#endif // USE_GENERIC_TREE
|
||||
top[x] = ymode;
|
||||
}
|
||||
memcpy(modes, top, 4 * sizeof(*top));
|
||||
|
@ -498,7 +502,7 @@ static const uint8_t
|
|||
|
||||
// Paragraph 9.9
|
||||
|
||||
static const int kBands[16 + 1] = {
|
||||
static const uint8_t kBands[16 + 1] = {
|
||||
0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
|
||||
0 // extra entry as sentinel
|
||||
};
|
|
@ -13,12 +13,12 @@
|
|||
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "./alphai_dec.h"
|
||||
#include "./vp8i_dec.h"
|
||||
#include "./vp8li_dec.h"
|
||||
#include "./webpi_dec.h"
|
||||
#include "../utils/bit_reader_inl_utils.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/dec/alphai_dec.h"
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
#include "src/dec/vp8li_dec.h"
|
||||
#include "src/dec/webpi_dec.h"
|
||||
#include "src/utils/bit_reader_inl_utils.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
@ -11,10 +11,10 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#ifndef WEBP_WEBP_DECODE_VP8_H_
|
||||
#define WEBP_WEBP_DECODE_VP8_H_
|
||||
#ifndef WEBP_DEC_VP8_DEC_H_
|
||||
#define WEBP_DEC_VP8_DEC_H_
|
||||
|
||||
#include "../webp/decode.h"
|
||||
#include "src/webp/decode.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -33,7 +33,7 @@ extern "C" {
|
|||
// /* customize io's functions (setup()/put()/teardown()) if needed. */
|
||||
//
|
||||
// VP8Decoder* dec = VP8New();
|
||||
// bool ok = VP8Decode(dec);
|
||||
// int ok = VP8Decode(dec, &io);
|
||||
// if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
|
||||
// VP8Delete(dec);
|
||||
// return ok;
|
||||
|
@ -157,24 +157,24 @@ void VP8Delete(VP8Decoder* const dec);
|
|||
// Miscellaneous VP8/VP8L bitstream probing functions.
|
||||
|
||||
// Returns true if the next 3 bytes in data contain the VP8 signature.
|
||||
WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
|
||||
WEBP_EXTERN int VP8CheckSignature(const uint8_t* const data, size_t data_size);
|
||||
|
||||
// Validates the VP8 data-header and retrieves basic header information viz
|
||||
// width and height. Returns 0 in case of formatting error. *width/*height
|
||||
// can be passed NULL.
|
||||
WEBP_EXTERN(int) VP8GetInfo(
|
||||
WEBP_EXTERN int VP8GetInfo(
|
||||
const uint8_t* data,
|
||||
size_t data_size, // data available so far
|
||||
size_t chunk_size, // total data size expected in the chunk
|
||||
int* const width, int* const height);
|
||||
|
||||
// Returns true if the next byte(s) in data is a VP8L signature.
|
||||
WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
|
||||
WEBP_EXTERN int VP8LCheckSignature(const uint8_t* const data, size_t size);
|
||||
|
||||
// Validates the VP8L data-header and retrieves basic header information viz
|
||||
// width, height and alpha. Returns 0 in case of formatting error.
|
||||
// width/height/has_alpha can be passed NULL.
|
||||
WEBP_EXTERN(int) VP8LGetInfo(
|
||||
WEBP_EXTERN int VP8LGetInfo(
|
||||
const uint8_t* data, size_t data_size, // data available so far
|
||||
int* const width, int* const height, int* const has_alpha);
|
||||
|
||||
|
@ -182,4 +182,4 @@ WEBP_EXTERN(int) VP8LGetInfo(
|
|||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif /* WEBP_WEBP_DECODE_VP8_H_ */
|
||||
#endif /* WEBP_DEC_VP8_DEC_H_ */
|
|
@ -11,16 +11,16 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#ifndef WEBP_DEC_VP8I_H_
|
||||
#define WEBP_DEC_VP8I_H_
|
||||
#ifndef WEBP_DEC_VP8I_DEC_H_
|
||||
#define WEBP_DEC_VP8I_DEC_H_
|
||||
|
||||
#include <string.h> // for memcpy()
|
||||
#include "./common_dec.h"
|
||||
#include "./vp8li_dec.h"
|
||||
#include "../utils/bit_reader_utils.h"
|
||||
#include "../utils/random_utils.h"
|
||||
#include "../utils/thread_utils.h"
|
||||
#include "../dsp/dsp.h"
|
||||
#include "src/dec/common_dec.h"
|
||||
#include "src/dec/vp8li_dec.h"
|
||||
#include "src/utils/bit_reader_utils.h"
|
||||
#include "src/utils/random_utils.h"
|
||||
#include "src/utils/thread_utils.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -32,7 +32,7 @@ extern "C" {
|
|||
// version numbers
|
||||
#define DEC_MAJ_VERSION 0
|
||||
#define DEC_MIN_VERSION 6
|
||||
#define DEC_REV_VERSION 0
|
||||
#define DEC_REV_VERSION 1
|
||||
|
||||
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
|
||||
// Constraints are: We need to store one 16x16 block of luma samples (y),
|
||||
|
@ -57,7 +57,6 @@ extern "C" {
|
|||
// '|' = left sample, '-' = top sample, '+' = top-left sample
|
||||
// 't' = extra top-right sample for 4x4 modes
|
||||
#define YUV_SIZE (BPS * 17 + BPS * 9)
|
||||
#define Y_SIZE (BPS * 17)
|
||||
#define Y_OFF (BPS * 1 + 8)
|
||||
#define U_OFF (Y_OFF + BPS * 16 + BPS)
|
||||
#define V_OFF (U_OFF + 16)
|
||||
|
@ -317,4 +316,4 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
|
|||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif /* WEBP_DEC_VP8I_H_ */
|
||||
#endif /* WEBP_DEC_VP8I_DEC_H_ */
|
|
@ -14,22 +14,22 @@
|
|||
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "./alphai_dec.h"
|
||||
#include "./vp8li_dec.h"
|
||||
#include "../dsp/dsp.h"
|
||||
#include "../dsp/lossless.h"
|
||||
#include "../dsp/lossless_common.h"
|
||||
#include "../dsp/yuv.h"
|
||||
#include "../utils/endian_inl_utils.h"
|
||||
#include "../utils/huffman_utils.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/dec/alphai_dec.h"
|
||||
#include "src/dec/vp8li_dec.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/lossless_common.h"
|
||||
#include "src/dsp/yuv.h"
|
||||
#include "src/utils/endian_inl_utils.h"
|
||||
#include "src/utils/huffman_utils.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
#define NUM_ARGB_CACHE_ROWS 16
|
||||
|
||||
static const int kCodeLengthLiterals = 16;
|
||||
static const int kCodeLengthRepeatCode = 16;
|
||||
static const int kCodeLengthExtraBits[3] = { 2, 3, 7 };
|
||||
static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
|
||||
static const uint8_t kCodeLengthExtraBits[3] = { 2, 3, 7 };
|
||||
static const uint8_t kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Five Huffman codes are used at each meta code:
|
||||
|
@ -86,7 +86,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
|
|||
// All values computed for 8-bit first level lookup with Mark Adler's tool:
|
||||
// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
|
||||
#define FIXED_TABLE_SIZE (630 * 3 + 410)
|
||||
static const int kTableSize[12] = {
|
||||
static const uint16_t kTableSize[12] = {
|
||||
FIXED_TABLE_SIZE + 654,
|
||||
FIXED_TABLE_SIZE + 656,
|
||||
FIXED_TABLE_SIZE + 658,
|
||||
|
@ -485,6 +485,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
|
|||
//------------------------------------------------------------------------------
|
||||
// Scaling.
|
||||
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
|
||||
const int num_channels = 4;
|
||||
const int in_width = io->mb_w;
|
||||
|
@ -516,10 +517,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
|
|||
out_width, out_height, 0, num_channels, work);
|
||||
return 1;
|
||||
}
|
||||
#endif // WEBP_REDUCE_SIZE
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Export to ARGB
|
||||
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
|
||||
// We have special "export" function since we need to convert from BGRA
|
||||
static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
|
||||
int rgba_stride, uint8_t* const rgba) {
|
||||
|
@ -561,6 +565,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
|
|||
return num_lines_out;
|
||||
}
|
||||
|
||||
#endif // WEBP_REDUCE_SIZE
|
||||
|
||||
// Emit rows without any scaling.
|
||||
static int EmitRows(WEBP_CSP_MODE colorspace,
|
||||
const uint8_t* row_in, int in_stride,
|
||||
|
@ -746,9 +752,12 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
|
|||
if (WebPIsRGBMode(output->colorspace)) { // convert to RGBA
|
||||
const WebPRGBABuffer* const buf = &output->u.RGBA;
|
||||
uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
|
||||
const int num_rows_out = io->use_scaling ?
|
||||
const int num_rows_out =
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
io->use_scaling ?
|
||||
EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h,
|
||||
rgba, buf->stride) :
|
||||
#endif // WEBP_REDUCE_SIZE
|
||||
EmitRows(output->colorspace, rows_data, in_stride,
|
||||
io->mb_w, io->mb_h, rgba, buf->stride);
|
||||
// Update 'last_out_row_'.
|
||||
|
@ -1012,12 +1021,13 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
|
|||
ok = 0;
|
||||
goto End;
|
||||
}
|
||||
assert(br->eos_ == VP8LIsEndOfStream(br));
|
||||
br->eos_ = VP8LIsEndOfStream(br);
|
||||
}
|
||||
// Process the remaining rows corresponding to last row-block.
|
||||
ExtractPalettedAlphaRows(dec, row > last_row ? last_row : row);
|
||||
|
||||
End:
|
||||
br->eos_ = VP8LIsEndOfStream(br);
|
||||
if (!ok || (br->eos_ && pos < end)) {
|
||||
ok = 0;
|
||||
dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
|
||||
|
@ -1090,11 +1100,12 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
|
|||
VP8LFillBitWindow(br);
|
||||
if (htree_group->use_packed_table) {
|
||||
code = ReadPackedSymbols(htree_group, br, src);
|
||||
if (VP8LIsEndOfStream(br)) break;
|
||||
if (code == PACKED_NON_LITERAL_CODE) goto AdvanceByOne;
|
||||
} else {
|
||||
code = ReadSymbol(htree_group->htrees[GREEN], br);
|
||||
}
|
||||
if (br->eos_) break; // early out
|
||||
if (VP8LIsEndOfStream(br)) break;
|
||||
if (code < NUM_LITERAL_CODES) { // Literal
|
||||
if (htree_group->is_trivial_literal) {
|
||||
*src = htree_group->literal_arb | (code << 8);
|
||||
|
@ -1104,7 +1115,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
|
|||
VP8LFillBitWindow(br);
|
||||
blue = ReadSymbol(htree_group->htrees[BLUE], br);
|
||||
alpha = ReadSymbol(htree_group->htrees[ALPHA], br);
|
||||
if (br->eos_) break;
|
||||
if (VP8LIsEndOfStream(br)) break;
|
||||
*src = ((uint32_t)alpha << 24) | (red << 16) | (code << 8) | blue;
|
||||
}
|
||||
AdvanceByOne:
|
||||
|
@ -1132,7 +1143,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
|
|||
VP8LFillBitWindow(br);
|
||||
dist_code = GetCopyDistance(dist_symbol, br);
|
||||
dist = PlaneCodeToDistance(width, dist_code);
|
||||
if (br->eos_) break;
|
||||
if (VP8LIsEndOfStream(br)) break;
|
||||
if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
|
||||
goto Error;
|
||||
} else {
|
||||
|
@ -1169,9 +1180,9 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
|
|||
} else { // Not reached
|
||||
goto Error;
|
||||
}
|
||||
assert(br->eos_ == VP8LIsEndOfStream(br));
|
||||
}
|
||||
|
||||
br->eos_ = VP8LIsEndOfStream(br);
|
||||
if (dec->incremental_ && br->eos_ && src < src_end) {
|
||||
RestoreState(dec);
|
||||
} else if (!br->eos_) {
|
||||
|
@ -1630,12 +1641,19 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
|
|||
|
||||
if (!AllocateInternalBuffers32b(dec, io->width)) goto Err;
|
||||
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
|
||||
|
||||
if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
|
||||
// need the alpha-multiply functions for premultiplied output or rescaling
|
||||
WebPInitAlphaProcessing();
|
||||
}
|
||||
#else
|
||||
if (io->use_scaling) {
|
||||
dec->status_ = VP8_STATUS_INVALID_PARAM;
|
||||
goto Err;
|
||||
}
|
||||
#endif
|
||||
if (!WebPIsRGBMode(dec->output_->colorspace)) {
|
||||
WebPInitConvertARGBToYUV();
|
||||
if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing();
|
|
@ -12,14 +12,14 @@
|
|||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
// Vikas Arora(vikaas.arora@gmail.com)
|
||||
|
||||
#ifndef WEBP_DEC_VP8LI_H_
|
||||
#define WEBP_DEC_VP8LI_H_
|
||||
#ifndef WEBP_DEC_VP8LI_DEC_H_
|
||||
#define WEBP_DEC_VP8LI_DEC_H_
|
||||
|
||||
#include <string.h> // for memcpy()
|
||||
#include "./webpi_dec.h"
|
||||
#include "../utils/bit_reader_utils.h"
|
||||
#include "../utils/color_cache_utils.h"
|
||||
#include "../utils/huffman_utils.h"
|
||||
#include "src/dec/webpi_dec.h"
|
||||
#include "src/utils/bit_reader_utils.h"
|
||||
#include "src/utils/color_cache_utils.h"
|
||||
#include "src/utils/huffman_utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -132,4 +132,4 @@ void VP8LDelete(VP8LDecoder* const dec);
|
|||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif /* WEBP_DEC_VP8LI_H_ */
|
||||
#endif /* WEBP_DEC_VP8LI_DEC_H_ */
|
|
@ -13,11 +13,11 @@
|
|||
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "./vp8i_dec.h"
|
||||
#include "./vp8li_dec.h"
|
||||
#include "./webpi_dec.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "../webp/mux_types.h" // ALPHA_FLAG
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
#include "src/dec/vp8li_dec.h"
|
||||
#include "src/dec/webpi_dec.h"
|
||||
#include "src/utils/utils.h"
|
||||
#include "src/webp/mux_types.h" // ALPHA_FLAG
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// RIFF layout is:
|
||||
|
@ -421,7 +421,9 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
|
|||
NULL, NULL, NULL, &has_animation,
|
||||
NULL, headers);
|
||||
if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
|
||||
// TODO(jzern): full support of animation frames will require API additions.
|
||||
// The WebPDemux API + libwebp can be used to decode individual
|
||||
// uncomposited frames or the WebPAnimDecoder can be used to fully
|
||||
// reconstruct them (see webp/demux.h).
|
||||
if (has_animation) {
|
||||
status = VP8_STATUS_UNSUPPORTED_FEATURE;
|
||||
}
|
|
@ -11,15 +11,15 @@
|
|||
//
|
||||
// Author: somnath@google.com (Somnath Banerjee)
|
||||
|
||||
#ifndef WEBP_DEC_WEBPI_H_
|
||||
#define WEBP_DEC_WEBPI_H_
|
||||
#ifndef WEBP_DEC_WEBPI_DEC_H_
|
||||
#define WEBP_DEC_WEBPI_DEC_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "../utils/rescaler_utils.h"
|
||||
#include "./vp8_dec.h"
|
||||
#include "src/utils/rescaler_utils.h"
|
||||
#include "src/dec/vp8_dec.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// WebPDecParams: Decoding output parameters. Transient internal object.
|
||||
|
@ -130,4 +130,4 @@ int WebPAvoidSlowMemory(const WebPDecBuffer* const output,
|
|||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif /* WEBP_DEC_WEBPI_H_ */
|
||||
#endif /* WEBP_DEC_WEBPI_DEC_H_ */
|
|
@ -11,15 +11,15 @@
|
|||
//
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "../webp/config.h"
|
||||
#include "src/webp/config.h"
|
||||
#endif
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "../utils/utils.h"
|
||||
#include "../webp/decode.h"
|
||||
#include "../webp/demux.h"
|
||||
#include "src/utils/utils.h"
|
||||
#include "src/webp/decode.h"
|
||||
#include "src/webp/demux.h"
|
||||
|
||||
#define NUM_CHANNELS 4
|
||||
|
|
@ -11,21 +11,21 @@
|
|||
//
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "../webp/config.h"
|
||||
#include "src/webp/config.h"
|
||||
#endif
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "../utils/utils.h"
|
||||
#include "../webp/decode.h" // WebPGetFeatures
|
||||
#include "../webp/demux.h"
|
||||
#include "../webp/format_constants.h"
|
||||
#include "src/utils/utils.h"
|
||||
#include "src/webp/decode.h" // WebPGetFeatures
|
||||
#include "src/webp/demux.h"
|
||||
#include "src/webp/format_constants.h"
|
||||
|
||||
#define DMUX_MAJ_VERSION 0
|
||||
#define DMUX_MIN_VERSION 3
|
||||
#define DMUX_REV_VERSION 2
|
||||
#define DMUX_REV_VERSION 3
|
||||
|
||||
typedef struct {
|
||||
size_t start_; // start location of the data
|
||||
|
@ -205,12 +205,14 @@ static void SetFrameInfo(size_t start_offset, size_t size,
|
|||
frame->complete_ = complete;
|
||||
}
|
||||
|
||||
// Store image bearing chunks to 'frame'.
|
||||
// Store image bearing chunks to 'frame'. 'min_size' is an optional size
|
||||
// requirement, it may be zero.
|
||||
static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
|
||||
MemBuffer* const mem, Frame* const frame) {
|
||||
int alpha_chunks = 0;
|
||||
int image_chunks = 0;
|
||||
int done = (MemDataSize(mem) < min_size);
|
||||
int done = (MemDataSize(mem) < CHUNK_HEADER_SIZE ||
|
||||
MemDataSize(mem) < min_size);
|
||||
ParseStatus status = PARSE_OK;
|
||||
|
||||
if (done) return PARSE_NEED_MORE_DATA;
|
||||
|
@ -401,9 +403,9 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
|
|||
frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
|
||||
if (frame == NULL) return PARSE_ERROR;
|
||||
|
||||
// For the single image case we allow parsing of a partial frame, but we need
|
||||
// at least CHUNK_HEADER_SIZE for parsing.
|
||||
status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
|
||||
// For the single image case we allow parsing of a partial frame, so no
|
||||
// minimum size is imposed here.
|
||||
status = StoreFrame(1, 0, &dmux->mem_, frame);
|
||||
if (status != PARSE_ERROR) {
|
||||
const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
|
||||
// Clear any alpha when the alpha flag is missing.
|
|
@ -12,10 +12,13 @@
|
|||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include <assert.h>
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
// Tables can be faster on some platform but incur some extra binary size (~2k).
|
||||
// #define USE_TABLES_FOR_ALPHA_MULT
|
||||
#if !defined(USE_TABLES_FOR_ALPHA_MULT)
|
||||
#define USE_TABLES_FOR_ALPHA_MULT 0 // ALTERNATE_CODE
|
||||
#endif
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
|
@ -29,7 +32,7 @@ static uint32_t Mult(uint8_t x, uint32_t mult) {
|
|||
return v;
|
||||
}
|
||||
|
||||
#ifdef USE_TABLES_FOR_ALPHA_MULT
|
||||
#if (USE_TABLES_FOR_ALPHA_MULT == 1)
|
||||
|
||||
static const uint32_t kMultTables[2][256] = {
|
||||
{ // (255u << MFIX) / alpha
|
||||
|
@ -132,9 +135,9 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
|
|||
return inverse ? (255u << MFIX) / a : a * KINV_255;
|
||||
}
|
||||
|
||||
#endif // USE_TABLES_FOR_ALPHA_MULT
|
||||
#endif // USE_TABLES_FOR_ALPHA_MULT
|
||||
|
||||
void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
|
||||
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
const uint32_t argb = ptr[x];
|
||||
|
@ -154,8 +157,8 @@ void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
|
|||
}
|
||||
}
|
||||
|
||||
void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
|
||||
int width, int inverse) {
|
||||
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
|
||||
int width, int inverse) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
const uint32_t a = alpha[x];
|
||||
|
@ -217,8 +220,9 @@ void WebPMultRows(uint8_t* ptr, int stride,
|
|||
#define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
|
||||
#endif
|
||||
|
||||
static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
|
||||
int w, int h, int stride) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
|
||||
int w, int h, int stride) {
|
||||
while (h-- > 0) {
|
||||
uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
|
||||
const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
|
||||
|
@ -235,6 +239,7 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
|
|||
rgba += stride;
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
#undef MULTIPLIER
|
||||
#undef PREMULTIPLY
|
||||
|
||||
|
@ -254,9 +259,9 @@ static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
|
|||
return (x * m) >> 16;
|
||||
}
|
||||
|
||||
static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
|
||||
int w, int h, int stride,
|
||||
int rg_byte_pos /* 0 or 1 */) {
|
||||
static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444,
|
||||
int w, int h, int stride,
|
||||
int rg_byte_pos /* 0 or 1 */) {
|
||||
while (h-- > 0) {
|
||||
int i;
|
||||
for (i = 0; i < w; ++i) {
|
||||
|
@ -275,15 +280,16 @@ static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
|
|||
}
|
||||
#undef MULTIPLIER
|
||||
|
||||
static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
|
||||
int w, int h, int stride) {
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
|
||||
static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
|
||||
int w, int h, int stride) {
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 1);
|
||||
#else
|
||||
ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
|
||||
ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint8_t* dst, int dst_stride) {
|
||||
|
@ -338,6 +344,36 @@ static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
|
|||
int i;
|
||||
for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static int HasAlpha8b_C(const uint8_t* src, int length) {
|
||||
while (length-- > 0) if (*src++ != 0xff) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int HasAlpha32b_C(const uint8_t* src, int length) {
|
||||
int x;
|
||||
for (x = 0; length-- > 0; x += 4) if (src[x] != 0xff) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Simple channel manipulations.
|
||||
|
||||
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
|
||||
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
|
||||
}
|
||||
|
||||
static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
|
||||
int len, int step, uint32_t* out) {
|
||||
int i, offset = 0;
|
||||
for (i = 0; i < len; ++i) {
|
||||
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
|
||||
offset += step;
|
||||
}
|
||||
}
|
||||
|
||||
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
|
||||
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
|
||||
|
@ -345,6 +381,11 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
|
|||
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
|
||||
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
|
||||
void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
|
||||
void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
|
||||
int len, int step, uint32_t* out);
|
||||
|
||||
int (*WebPHasAlpha8b)(const uint8_t* src, int length);
|
||||
int (*WebPHasAlpha32b)(const uint8_t* src, int length);
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Init function
|
||||
|
@ -360,15 +401,21 @@ static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
|
|||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
|
||||
if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
|
||||
|
||||
WebPMultARGBRow = WebPMultARGBRowC;
|
||||
WebPMultRow = WebPMultRowC;
|
||||
WebPApplyAlphaMultiply = ApplyAlphaMultiply;
|
||||
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
|
||||
WebPMultARGBRow = WebPMultARGBRow_C;
|
||||
WebPMultRow = WebPMultRow_C;
|
||||
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
|
||||
|
||||
WebPPackRGB = PackRGB_C;
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
|
||||
WebPDispatchAlpha = DispatchAlpha_C;
|
||||
WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
|
||||
WebPExtractAlpha = ExtractAlpha_C;
|
||||
WebPExtractGreen = ExtractGreen_C;
|
||||
#endif
|
||||
|
||||
WebPHasAlpha8b = HasAlpha8b_C;
|
||||
WebPHasAlpha32b = HasAlpha32b_C;
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
|
@ -382,16 +429,31 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
|
|||
#endif
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (VP8GetCPUInfo(kNEON)) {
|
||||
WebPInitAlphaProcessingNEON();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
if (VP8GetCPUInfo(kMIPSdspR2)) {
|
||||
WebPInitAlphaProcessingMIPSdspR2();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
WebPInitAlphaProcessingNEON();
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(WebPMultARGBRow != NULL);
|
||||
assert(WebPMultRow != NULL);
|
||||
assert(WebPApplyAlphaMultiply != NULL);
|
||||
assert(WebPApplyAlphaMultiply4444 != NULL);
|
||||
assert(WebPDispatchAlpha != NULL);
|
||||
assert(WebPDispatchAlphaToGreen != NULL);
|
||||
assert(WebPExtractAlpha != NULL);
|
||||
assert(WebPExtractGreen != NULL);
|
||||
assert(WebPPackRGB != NULL);
|
||||
assert(WebPHasAlpha8b != NULL);
|
||||
assert(WebPHasAlpha32b != NULL);
|
||||
|
||||
alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
}
|
|
@ -12,13 +12,13 @@
|
|||
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
|
||||
// Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
|
||||
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint8_t* dst, int dst_stride) {
|
||||
static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint8_t* dst, int dst_stride) {
|
||||
uint32_t alpha_mask = 0xffffffff;
|
||||
int i, j, temp0;
|
||||
|
||||
|
@ -79,7 +79,8 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
|
|||
return (alpha_mask != 0xff);
|
||||
}
|
||||
|
||||
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
|
||||
static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
|
||||
int inverse) {
|
||||
int x;
|
||||
const uint32_t c_00ffffff = 0x00ffffffu;
|
||||
const uint32_t c_ff000000 = 0xff000000u;
|
||||
|
@ -124,14 +125,54 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
|
|||
}
|
||||
}
|
||||
|
||||
static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
|
||||
const uint8_t* b, int len, int step,
|
||||
uint32_t* out) {
|
||||
int temp0, temp1, temp2, offset;
|
||||
const int rest = len & 1;
|
||||
const int a = 0xff;
|
||||
const uint32_t* const loop_end = out + len - rest;
|
||||
__asm__ volatile (
|
||||
"xor %[offset], %[offset], %[offset] \n\t"
|
||||
"beq %[loop_end], %[out], 0f \n\t"
|
||||
"2: \n\t"
|
||||
"lbux %[temp0], %[offset](%[r]) \n\t"
|
||||
"lbux %[temp1], %[offset](%[g]) \n\t"
|
||||
"lbux %[temp2], %[offset](%[b]) \n\t"
|
||||
"ins %[temp0], %[a], 16, 16 \n\t"
|
||||
"ins %[temp2], %[temp1], 16, 16 \n\t"
|
||||
"addiu %[out], %[out], 4 \n\t"
|
||||
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
|
||||
"sw %[temp0], -4(%[out]) \n\t"
|
||||
"addu %[offset], %[offset], %[step] \n\t"
|
||||
"bne %[loop_end], %[out], 2b \n\t"
|
||||
"0: \n\t"
|
||||
"beq %[rest], $zero, 1f \n\t"
|
||||
"lbux %[temp0], %[offset](%[r]) \n\t"
|
||||
"lbux %[temp1], %[offset](%[g]) \n\t"
|
||||
"lbux %[temp2], %[offset](%[b]) \n\t"
|
||||
"ins %[temp0], %[a], 16, 16 \n\t"
|
||||
"ins %[temp2], %[temp1], 16, 16 \n\t"
|
||||
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
|
||||
"sw %[temp0], 0(%[out]) \n\t"
|
||||
"1: \n\t"
|
||||
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
|
||||
[offset]"=&r"(offset), [out]"+&r"(out)
|
||||
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
|
||||
[loop_end]"r"(loop_end), [rest]"r"(rest)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entry point
|
||||
|
||||
extern void WebPInitAlphaProcessingMIPSdspR2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
|
||||
WebPDispatchAlpha = DispatchAlpha;
|
||||
WebPMultARGBRow = MultARGBRow;
|
||||
WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
|
||||
WebPMultARGBRow = MultARGBRow_MIPSdspR2;
|
||||
WebPPackRGB = PackRGB_MIPSdspR2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS_DSP_R2
|
|
@ -11,11 +11,11 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
|
||||
#include "./neon.h"
|
||||
#include "src/dsp/neon.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
@ -11,16 +11,16 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#include <emmintrin.h>
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint8_t* dst, int dst_stride) {
|
||||
static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint8_t* dst, int dst_stride) {
|
||||
// alpha_and stores an 'and' operation of all the alpha[] values. The final
|
||||
// value is not 0xff if any of the alpha[] is not equal to 0xff.
|
||||
uint32_t alpha_and = 0xff;
|
||||
|
@ -72,9 +72,9 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
|
|||
return (alpha_and != 0xff);
|
||||
}
|
||||
|
||||
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint32_t* dst, int dst_stride) {
|
||||
static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint32_t* dst, int dst_stride) {
|
||||
int i, j;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const int limit = width & ~15;
|
||||
|
@ -98,9 +98,9 @@ static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
|
|||
}
|
||||
}
|
||||
|
||||
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
|
||||
int width, int height,
|
||||
uint8_t* alpha, int alpha_stride) {
|
||||
static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
|
||||
int width, int height,
|
||||
uint8_t* alpha, int alpha_stride) {
|
||||
// alpha_and stores an 'and' operation of all the alpha[] values. The final
|
||||
// value is not 0xff if any of the alpha[] is not equal to 0xff.
|
||||
uint32_t alpha_and = 0xff;
|
||||
|
@ -210,6 +210,61 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
|
|||
#undef MULTIPLIER
|
||||
#undef PREMULTIPLY
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Alpha detection
|
||||
|
||||
static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
|
||||
const __m128i all_0xff = _mm_set1_epi8(0xff);
|
||||
int i = 0;
|
||||
for (; i + 16 <= length; i += 16) {
|
||||
const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
|
||||
const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);
|
||||
const int mask = _mm_movemask_epi8(bits);
|
||||
if (mask != 0xffff) return 1;
|
||||
}
|
||||
for (; i < length; ++i) if (src[i] != 0xff) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
|
||||
const __m128i alpha_mask = _mm_set1_epi32(0xff);
|
||||
const __m128i all_0xff = _mm_set1_epi8(0xff);
|
||||
int i = 0;
|
||||
// We don't know if we can access the last 3 bytes after the last alpha
|
||||
// value 'src[4 * length - 4]' (because we don't know if alpha is the first
|
||||
// or the last byte of the quadruplet). Hence the '-3' protection below.
|
||||
length = length * 4 - 3; // size in bytes
|
||||
for (; i + 64 <= length; i += 64) {
|
||||
const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
|
||||
const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
|
||||
const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
|
||||
const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
|
||||
const __m128i b0 = _mm_and_si128(a0, alpha_mask);
|
||||
const __m128i b1 = _mm_and_si128(a1, alpha_mask);
|
||||
const __m128i b2 = _mm_and_si128(a2, alpha_mask);
|
||||
const __m128i b3 = _mm_and_si128(a3, alpha_mask);
|
||||
const __m128i c0 = _mm_packs_epi32(b0, b1);
|
||||
const __m128i c1 = _mm_packs_epi32(b2, b3);
|
||||
const __m128i d = _mm_packus_epi16(c0, c1);
|
||||
const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
|
||||
const int mask = _mm_movemask_epi8(bits);
|
||||
if (mask != 0xffff) return 1;
|
||||
}
|
||||
for (; i + 32 <= length; i += 32) {
|
||||
const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
|
||||
const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
|
||||
const __m128i b0 = _mm_and_si128(a0, alpha_mask);
|
||||
const __m128i b1 = _mm_and_si128(a1, alpha_mask);
|
||||
const __m128i c = _mm_packs_epi32(b0, b1);
|
||||
const __m128i d = _mm_packus_epi16(c, c);
|
||||
const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
|
||||
const int mask = _mm_movemask_epi8(bits);
|
||||
if (mask != 0xffff) return 1;
|
||||
}
|
||||
for (; i <= length; i += 4) if (src[i] != 0xff) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Apply alpha value to rows
|
||||
|
||||
|
@ -238,7 +293,7 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
|
|||
}
|
||||
}
|
||||
width -= x;
|
||||
if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
|
||||
if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
|
||||
}
|
||||
|
||||
static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
|
||||
|
@ -261,7 +316,7 @@ static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
|
|||
}
|
||||
}
|
||||
width -= x;
|
||||
if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
|
||||
if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
@ -273,9 +328,12 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
|
|||
WebPMultARGBRow = MultARGBRow_SSE2;
|
||||
WebPMultRow = MultRow_SSE2;
|
||||
WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
|
||||
WebPDispatchAlpha = DispatchAlpha;
|
||||
WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
|
||||
WebPExtractAlpha = ExtractAlpha;
|
||||
WebPDispatchAlpha = DispatchAlpha_SSE2;
|
||||
WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
|
||||
WebPExtractAlpha = ExtractAlpha_SSE2;
|
||||
|
||||
WebPHasAlpha8b = HasAlpha8b_SSE2;
|
||||
WebPHasAlpha32b = HasAlpha32b_SSE2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE2
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
|
||||
|
@ -19,9 +19,9 @@
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
|
||||
int width, int height,
|
||||
uint8_t* alpha, int alpha_stride) {
|
||||
static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
|
||||
int width, int height,
|
||||
uint8_t* alpha, int alpha_stride) {
|
||||
// alpha_and stores an 'and' operation of all the alpha[] values. The final
|
||||
// value is not 0xff if any of the alpha[] is not equal to 0xff.
|
||||
uint32_t alpha_and = 0xff;
|
||||
|
@ -82,7 +82,7 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
|
|||
extern void WebPInitAlphaProcessingSSE41(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
|
||||
WebPExtractAlpha = ExtractAlpha;
|
||||
WebPExtractAlpha = ExtractAlpha_SSE41;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE41
|
|
@ -9,8 +9,8 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "../enc/cost_enc.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/enc/cost_enc.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Boolean-cost cost table
|
||||
|
@ -319,7 +319,7 @@ const uint8_t VP8EncBands[16 + 1] = {
|
|||
//------------------------------------------------------------------------------
|
||||
// Mode costs
|
||||
|
||||
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
|
||||
static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
|
||||
int n = res->first;
|
||||
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
|
||||
const int p0 = res->prob[n][ctx0][0];
|
||||
|
@ -354,8 +354,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
|
|||
return cost;
|
||||
}
|
||||
|
||||
static void SetResidualCoeffs(const int16_t* const coeffs,
|
||||
VP8Residual* const res) {
|
||||
static void SetResidualCoeffs_C(const int16_t* const coeffs,
|
||||
VP8Residual* const res) {
|
||||
int n;
|
||||
res->last = -1;
|
||||
assert(res->first == 0 || coeffs[0] == 0);
|
||||
|
@ -384,8 +384,8 @@ static volatile VP8CPUInfo cost_last_cpuinfo_used =
|
|||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
|
||||
if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
|
||||
|
||||
VP8GetResidualCost = GetResidualCost;
|
||||
VP8SetResidualCoeffs = SetResidualCoeffs;
|
||||
VP8GetResidualCost = GetResidualCost_C;
|
||||
VP8SetResidualCoeffs = SetResidualCoeffs_C;
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
|
@ -9,13 +9,13 @@
|
|||
//
|
||||
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS32)
|
||||
|
||||
#include "../enc/cost_enc.h"
|
||||
#include "src/enc/cost_enc.h"
|
||||
|
||||
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
|
||||
static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
|
||||
int temp0, temp1;
|
||||
int v_reg, ctx_reg;
|
||||
int n = res->first;
|
||||
|
@ -96,8 +96,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
|
|||
return cost;
|
||||
}
|
||||
|
||||
static void SetResidualCoeffs(const int16_t* const coeffs,
|
||||
VP8Residual* const res) {
|
||||
static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
|
||||
VP8Residual* const res) {
|
||||
const int16_t* p_coeffs = (int16_t*)coeffs;
|
||||
int temp0, temp1, temp2, n, n1;
|
||||
assert(res->first == 0 || coeffs[0] == 0);
|
||||
|
@ -143,8 +143,8 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
|
|||
extern void VP8EncDspCostInitMIPS32(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
|
||||
VP8GetResidualCost = GetResidualCost;
|
||||
VP8SetResidualCoeffs = SetResidualCoeffs;
|
||||
VP8GetResidualCost = GetResidualCost_MIPS32;
|
||||
VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS32
|
|
@ -9,13 +9,13 @@
|
|||
//
|
||||
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
|
||||
#include "../enc/cost_enc.h"
|
||||
#include "src/enc/cost_enc.h"
|
||||
|
||||
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
|
||||
static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
|
||||
int temp0, temp1;
|
||||
int v_reg, ctx_reg;
|
||||
int n = res->first;
|
||||
|
@ -97,7 +97,7 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
|
|||
extern void VP8EncDspCostInitMIPSdspR2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
|
||||
VP8GetResidualCost = GetResidualCost;
|
||||
VP8GetResidualCost = GetResidualCost_MIPSdspR2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS_DSP_R2
|
|
@ -11,19 +11,19 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#include <emmintrin.h>
|
||||
|
||||
#include "../enc/cost_enc.h"
|
||||
#include "../enc/vp8i_enc.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/enc/cost_enc.h"
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
|
||||
VP8Residual* const res) {
|
||||
static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
|
||||
VP8Residual* const res) {
|
||||
const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
|
||||
const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
|
||||
// Use SSE2 to compare 16 values with a single instruction.
|
||||
|
@ -42,7 +42,7 @@ static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
|
|||
res->coeffs = coeffs;
|
||||
}
|
||||
|
||||
static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
|
||||
static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
|
||||
uint8_t levels[16], ctxs[16];
|
||||
uint16_t abs_levels[16];
|
||||
int n = res->first;
|
||||
|
@ -108,8 +108,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
|
|||
extern void VP8EncDspCostInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
|
||||
VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
|
||||
VP8GetResidualCost = GetResidualCostSSE2;
|
||||
VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
|
||||
VP8GetResidualCost = GetResidualCost_SSE2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE2
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// Author: Christian Duvivier (cduvivier@google.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_HAVE_NEON_RTCD)
|
||||
#include <stdio.h>
|
||||
|
@ -143,7 +143,7 @@ static int x86CPUInfo(CPUFeature feature) {
|
|||
return !!(cpu_info[2] & (1 << 0));
|
||||
}
|
||||
if (feature == kSlowSSSE3) {
|
||||
if (is_intel && (cpu_info[2] & (1 << 0))) { // SSSE3?
|
||||
if (is_intel && (cpu_info[2] & (1 << 9))) { // SSSE3?
|
||||
return CheckSlowModel(cpu_info[0]);
|
||||
}
|
||||
return 0;
|
|
@ -11,9 +11,11 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "../dec/vp8i_dec.h"
|
||||
#include "../utils/utils.h"
|
||||
#include <assert.h>
|
||||
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
|
@ -25,7 +27,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
|
|||
// Transforms (Paragraph 14.4)
|
||||
|
||||
#define STORE(x, y, v) \
|
||||
dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
|
||||
dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))
|
||||
|
||||
#define STORE2(y, dc, d, c) do { \
|
||||
const int DC = (dc); \
|
||||
|
@ -38,7 +40,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
|
|||
#define MUL1(a) ((((a) * 20091) >> 16) + (a))
|
||||
#define MUL2(a) (((a) * 35468) >> 16)
|
||||
|
||||
static void TransformOne(const int16_t* in, uint8_t* dst) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void TransformOne_C(const int16_t* in, uint8_t* dst) {
|
||||
int C[4 * 4], *tmp;
|
||||
int i;
|
||||
tmp = C;
|
||||
|
@ -78,7 +81,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
|
|||
}
|
||||
|
||||
// Simplified transform when only in[0], in[1] and in[4] are non-zero
|
||||
static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
|
||||
const int a = in[0] + 4;
|
||||
const int c4 = MUL2(in[4]);
|
||||
const int d4 = MUL1(in[4]);
|
||||
|
@ -93,19 +96,21 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
|||
#undef MUL2
|
||||
#undef STORE2
|
||||
|
||||
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
TransformOne(in, dst);
|
||||
static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
TransformOne_C(in, dst);
|
||||
if (do_two) {
|
||||
TransformOne(in + 16, dst + 4);
|
||||
TransformOne_C(in + 16, dst + 4);
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void TransformUV(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformUV_C(const int16_t* in, uint8_t* dst) {
|
||||
VP8Transform(in + 0 * 16, dst, 1);
|
||||
VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
|
||||
}
|
||||
|
||||
static void TransformDC(const int16_t* in, uint8_t* dst) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void TransformDC_C(const int16_t* in, uint8_t* dst) {
|
||||
const int DC = in[0] + 4;
|
||||
int i, j;
|
||||
for (j = 0; j < 4; ++j) {
|
||||
|
@ -114,8 +119,9 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
|
|||
}
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void TransformDCUV(const int16_t* in, uint8_t* dst) {
|
||||
static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
|
||||
if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
|
||||
if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
|
||||
if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
|
||||
|
@ -127,7 +133,8 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Paragraph 14.3
|
||||
|
||||
static void TransformWHT(const int16_t* in, int16_t* out) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void TransformWHT_C(const int16_t* in, int16_t* out) {
|
||||
int tmp[16];
|
||||
int i;
|
||||
for (i = 0; i < 4; ++i) {
|
||||
|
@ -153,6 +160,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
|
|||
out += 64;
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
|
||||
|
||||
|
@ -161,6 +169,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
|
|||
|
||||
#define DST(x, y) dst[(x) + (y) * BPS]
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
|
||||
const uint8_t* top = dst - BPS;
|
||||
const uint8_t* const clip0 = VP8kclip1 - top[-1];
|
||||
|
@ -174,21 +183,21 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
|
|||
dst += BPS;
|
||||
}
|
||||
}
|
||||
static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
|
||||
static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
|
||||
static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
|
||||
static void TM4_C(uint8_t* dst) { TrueMotion(dst, 4); }
|
||||
static void TM8uv_C(uint8_t* dst) { TrueMotion(dst, 8); }
|
||||
static void TM16_C(uint8_t* dst) { TrueMotion(dst, 16); }
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// 16x16
|
||||
|
||||
static void VE16(uint8_t* dst) { // vertical
|
||||
static void VE16_C(uint8_t* dst) { // vertical
|
||||
int j;
|
||||
for (j = 0; j < 16; ++j) {
|
||||
memcpy(dst + j * BPS, dst - BPS, 16);
|
||||
}
|
||||
}
|
||||
|
||||
static void HE16(uint8_t* dst) { // horizontal
|
||||
static void HE16_C(uint8_t* dst) { // horizontal
|
||||
int j;
|
||||
for (j = 16; j > 0; --j) {
|
||||
memset(dst, dst[-1], 16);
|
||||
|
@ -203,7 +212,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
|
|||
}
|
||||
}
|
||||
|
||||
static void DC16(uint8_t* dst) { // DC
|
||||
static void DC16_C(uint8_t* dst) { // DC
|
||||
int DC = 16;
|
||||
int j;
|
||||
for (j = 0; j < 16; ++j) {
|
||||
|
@ -212,7 +221,7 @@ static void DC16(uint8_t* dst) { // DC
|
|||
Put16(DC >> 5, dst);
|
||||
}
|
||||
|
||||
static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
|
||||
static void DC16NoTop_C(uint8_t* dst) { // DC with top samples not available
|
||||
int DC = 8;
|
||||
int j;
|
||||
for (j = 0; j < 16; ++j) {
|
||||
|
@ -221,7 +230,7 @@ static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
|
|||
Put16(DC >> 4, dst);
|
||||
}
|
||||
|
||||
static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
|
||||
static void DC16NoLeft_C(uint8_t* dst) { // DC with left samples not available
|
||||
int DC = 8;
|
||||
int i;
|
||||
for (i = 0; i < 16; ++i) {
|
||||
|
@ -230,9 +239,10 @@ static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
|
|||
Put16(DC >> 4, dst);
|
||||
}
|
||||
|
||||
static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
|
||||
static void DC16NoTopLeft_C(uint8_t* dst) { // DC with no top and left samples
|
||||
Put16(0x80, dst);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
|
||||
|
||||
|
@ -242,7 +252,8 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
|
|||
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
|
||||
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
|
||||
|
||||
static void VE4(uint8_t* dst) { // vertical
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void VE4_C(uint8_t* dst) { // vertical
|
||||
const uint8_t* top = dst - BPS;
|
||||
const uint8_t vals[4] = {
|
||||
AVG3(top[-1], top[0], top[1]),
|
||||
|
@ -255,8 +266,9 @@ static void VE4(uint8_t* dst) { // vertical
|
|||
memcpy(dst + i * BPS, vals, sizeof(vals));
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void HE4(uint8_t* dst) { // horizontal
|
||||
static void HE4_C(uint8_t* dst) { // horizontal
|
||||
const int A = dst[-1 - BPS];
|
||||
const int B = dst[-1];
|
||||
const int C = dst[-1 + BPS];
|
||||
|
@ -268,7 +280,8 @@ static void HE4(uint8_t* dst) { // horizontal
|
|||
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
|
||||
}
|
||||
|
||||
static void DC4(uint8_t* dst) { // DC
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void DC4_C(uint8_t* dst) { // DC
|
||||
uint32_t dc = 4;
|
||||
int i;
|
||||
for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
|
||||
|
@ -276,7 +289,7 @@ static void DC4(uint8_t* dst) { // DC
|
|||
for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
|
||||
}
|
||||
|
||||
static void RD4(uint8_t* dst) { // Down-right
|
||||
static void RD4_C(uint8_t* dst) { // Down-right
|
||||
const int I = dst[-1 + 0 * BPS];
|
||||
const int J = dst[-1 + 1 * BPS];
|
||||
const int K = dst[-1 + 2 * BPS];
|
||||
|
@ -295,7 +308,7 @@ static void RD4(uint8_t* dst) { // Down-right
|
|||
DST(3, 0) = AVG3(D, C, B);
|
||||
}
|
||||
|
||||
static void LD4(uint8_t* dst) { // Down-Left
|
||||
static void LD4_C(uint8_t* dst) { // Down-Left
|
||||
const int A = dst[0 - BPS];
|
||||
const int B = dst[1 - BPS];
|
||||
const int C = dst[2 - BPS];
|
||||
|
@ -312,8 +325,9 @@ static void LD4(uint8_t* dst) { // Down-Left
|
|||
DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
|
||||
DST(3, 3) = AVG3(G, H, H);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void VR4(uint8_t* dst) { // Vertical-Right
|
||||
static void VR4_C(uint8_t* dst) { // Vertical-Right
|
||||
const int I = dst[-1 + 0 * BPS];
|
||||
const int J = dst[-1 + 1 * BPS];
|
||||
const int K = dst[-1 + 2 * BPS];
|
||||
|
@ -335,7 +349,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right
|
|||
DST(3, 1) = AVG3(B, C, D);
|
||||
}
|
||||
|
||||
static void VL4(uint8_t* dst) { // Vertical-Left
|
||||
static void VL4_C(uint8_t* dst) { // Vertical-Left
|
||||
const int A = dst[0 - BPS];
|
||||
const int B = dst[1 - BPS];
|
||||
const int C = dst[2 - BPS];
|
||||
|
@ -357,7 +371,7 @@ static void VL4(uint8_t* dst) { // Vertical-Left
|
|||
DST(3, 3) = AVG3(F, G, H);
|
||||
}
|
||||
|
||||
static void HU4(uint8_t* dst) { // Horizontal-Up
|
||||
static void HU4_C(uint8_t* dst) { // Horizontal-Up
|
||||
const int I = dst[-1 + 0 * BPS];
|
||||
const int J = dst[-1 + 1 * BPS];
|
||||
const int K = dst[-1 + 2 * BPS];
|
||||
|
@ -372,7 +386,7 @@ static void HU4(uint8_t* dst) { // Horizontal-Up
|
|||
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
|
||||
}
|
||||
|
||||
static void HD4(uint8_t* dst) { // Horizontal-Down
|
||||
static void HD4_C(uint8_t* dst) { // Horizontal-Down
|
||||
const int I = dst[-1 + 0 * BPS];
|
||||
const int J = dst[-1 + 1 * BPS];
|
||||
const int K = dst[-1 + 2 * BPS];
|
||||
|
@ -404,14 +418,15 @@ VP8PredFunc VP8PredLuma4[NUM_BMODES];
|
|||
//------------------------------------------------------------------------------
|
||||
// Chroma
|
||||
|
||||
static void VE8uv(uint8_t* dst) { // vertical
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void VE8uv_C(uint8_t* dst) { // vertical
|
||||
int j;
|
||||
for (j = 0; j < 8; ++j) {
|
||||
memcpy(dst + j * BPS, dst - BPS, 8);
|
||||
}
|
||||
}
|
||||
|
||||
static void HE8uv(uint8_t* dst) { // horizontal
|
||||
static void HE8uv_C(uint8_t* dst) { // horizontal
|
||||
int j;
|
||||
for (j = 0; j < 8; ++j) {
|
||||
memset(dst, dst[-1], 8);
|
||||
|
@ -427,7 +442,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
|
|||
}
|
||||
}
|
||||
|
||||
static void DC8uv(uint8_t* dst) { // DC
|
||||
static void DC8uv_C(uint8_t* dst) { // DC
|
||||
int dc0 = 8;
|
||||
int i;
|
||||
for (i = 0; i < 8; ++i) {
|
||||
|
@ -436,7 +451,7 @@ static void DC8uv(uint8_t* dst) { // DC
|
|||
Put8x8uv(dc0 >> 4, dst);
|
||||
}
|
||||
|
||||
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
|
||||
static void DC8uvNoLeft_C(uint8_t* dst) { // DC with no left samples
|
||||
int dc0 = 4;
|
||||
int i;
|
||||
for (i = 0; i < 8; ++i) {
|
||||
|
@ -445,7 +460,7 @@ static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
|
|||
Put8x8uv(dc0 >> 3, dst);
|
||||
}
|
||||
|
||||
static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
|
||||
static void DC8uvNoTop_C(uint8_t* dst) { // DC with no top samples
|
||||
int dc0 = 4;
|
||||
int i;
|
||||
for (i = 0; i < 8; ++i) {
|
||||
|
@ -454,17 +469,19 @@ static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
|
|||
Put8x8uv(dc0 >> 3, dst);
|
||||
}
|
||||
|
||||
static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
|
||||
static void DC8uvNoTopLeft_C(uint8_t* dst) { // DC with nothing
|
||||
Put8x8uv(0x80, dst);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Edge filtering functions
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
// 4 pixels in, 2 pixels out
|
||||
static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
|
||||
static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) {
|
||||
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
|
||||
const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1]; // in [-893,892]
|
||||
const int a1 = VP8ksclip2[(a + 4) >> 3]; // in [-16,15]
|
||||
|
@ -474,7 +491,7 @@ static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
|
|||
}
|
||||
|
||||
// 4 pixels in, 4 pixels out
|
||||
static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
|
||||
static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) {
|
||||
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
|
||||
const int a = 3 * (q0 - p0);
|
||||
const int a1 = VP8ksclip2[(a + 4) >> 3];
|
||||
|
@ -487,7 +504,7 @@ static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
|
|||
}
|
||||
|
||||
// 6 pixels in, 6 pixels out
|
||||
static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
|
||||
static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) {
|
||||
const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
|
||||
const int q0 = p[0], q1 = p[step], q2 = p[2*step];
|
||||
const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
|
||||
|
@ -503,18 +520,22 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
|
|||
p[ 2*step] = VP8kclip1[q2 - a3];
|
||||
}
|
||||
|
||||
static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
|
||||
static WEBP_INLINE int Hev(const uint8_t* p, int step, int thresh) {
|
||||
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
|
||||
return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
|
||||
static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static WEBP_INLINE int NeedsFilter_C(const uint8_t* p, int step, int t) {
|
||||
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
|
||||
return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static WEBP_INLINE int needs_filter2(const uint8_t* p,
|
||||
int step, int t, int it) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p,
|
||||
int step, int t, int it) {
|
||||
const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
|
||||
const int p0 = p[-step], q0 = p[0];
|
||||
const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
|
||||
|
@ -523,140 +544,159 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
|
|||
VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
|
||||
VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Simple In-loop filtering (Paragraph 15.2)
|
||||
|
||||
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) {
|
||||
int i;
|
||||
const int thresh2 = 2 * thresh + 1;
|
||||
for (i = 0; i < 16; ++i) {
|
||||
if (needs_filter(p + i, stride, thresh2)) {
|
||||
do_filter2(p + i, stride);
|
||||
if (NeedsFilter_C(p + i, stride, thresh2)) {
|
||||
DoFilter2_C(p + i, stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
|
||||
static void SimpleHFilter16_C(uint8_t* p, int stride, int thresh) {
|
||||
int i;
|
||||
const int thresh2 = 2 * thresh + 1;
|
||||
for (i = 0; i < 16; ++i) {
|
||||
if (needs_filter(p + i * stride, 1, thresh2)) {
|
||||
do_filter2(p + i * stride, 1);
|
||||
if (NeedsFilter_C(p + i * stride, 1, thresh2)) {
|
||||
DoFilter2_C(p + i * stride, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
|
||||
static void SimpleVFilter16i_C(uint8_t* p, int stride, int thresh) {
|
||||
int k;
|
||||
for (k = 3; k > 0; --k) {
|
||||
p += 4 * stride;
|
||||
SimpleVFilter16(p, stride, thresh);
|
||||
SimpleVFilter16_C(p, stride, thresh);
|
||||
}
|
||||
}
|
||||
|
||||
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
|
||||
static void SimpleHFilter16i_C(uint8_t* p, int stride, int thresh) {
|
||||
int k;
|
||||
for (k = 3; k > 0; --k) {
|
||||
p += 4;
|
||||
SimpleHFilter16(p, stride, thresh);
|
||||
SimpleHFilter16_C(p, stride, thresh);
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Complex In-loop filtering (Paragraph 15.3)
|
||||
|
||||
static WEBP_INLINE void FilterLoop26(uint8_t* p,
|
||||
int hstride, int vstride, int size,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
static WEBP_INLINE void FilterLoop26_C(uint8_t* p,
|
||||
int hstride, int vstride, int size,
|
||||
int thresh, int ithresh,
|
||||
int hev_thresh) {
|
||||
const int thresh2 = 2 * thresh + 1;
|
||||
while (size-- > 0) {
|
||||
if (needs_filter2(p, hstride, thresh2, ithresh)) {
|
||||
if (hev(p, hstride, hev_thresh)) {
|
||||
do_filter2(p, hstride);
|
||||
if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
|
||||
if (Hev(p, hstride, hev_thresh)) {
|
||||
DoFilter2_C(p, hstride);
|
||||
} else {
|
||||
do_filter6(p, hstride);
|
||||
DoFilter6_C(p, hstride);
|
||||
}
|
||||
}
|
||||
p += vstride;
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void FilterLoop24(uint8_t* p,
|
||||
int hstride, int vstride, int size,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static WEBP_INLINE void FilterLoop24_C(uint8_t* p,
|
||||
int hstride, int vstride, int size,
|
||||
int thresh, int ithresh,
|
||||
int hev_thresh) {
|
||||
const int thresh2 = 2 * thresh + 1;
|
||||
while (size-- > 0) {
|
||||
if (needs_filter2(p, hstride, thresh2, ithresh)) {
|
||||
if (hev(p, hstride, hev_thresh)) {
|
||||
do_filter2(p, hstride);
|
||||
if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
|
||||
if (Hev(p, hstride, hev_thresh)) {
|
||||
DoFilter2_C(p, hstride);
|
||||
} else {
|
||||
do_filter4(p, hstride);
|
||||
DoFilter4_C(p, hstride);
|
||||
}
|
||||
}
|
||||
p += vstride;
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
// on macroblock edges
|
||||
static void VFilter16(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
|
||||
static void VFilter16_C(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
|
||||
static void HFilter16(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
|
||||
static void HFilter16_C(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
|
||||
// on three inner edges
|
||||
static void VFilter16i(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter16i_C(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
int k;
|
||||
for (k = 3; k > 0; --k) {
|
||||
p += 4 * stride;
|
||||
FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void HFilter16i(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
static void HFilter16i_C(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
int k;
|
||||
for (k = 3; k > 0; --k) {
|
||||
p += 4;
|
||||
FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
// 8-pixels wide variant, for chroma filtering
|
||||
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
|
||||
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst,
|
||||
int dst_stride) {
|
||||
static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
|
||||
int dst_stride) {
|
||||
int i, j;
|
||||
for (j = 0; j < 8; ++j) {
|
||||
for (i = 0; i < 8; ++i) {
|
||||
|
@ -709,54 +749,66 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
|
|||
|
||||
VP8InitClipTables();
|
||||
|
||||
VP8TransformWHT = TransformWHT;
|
||||
VP8Transform = TransformTwo;
|
||||
VP8TransformUV = TransformUV;
|
||||
VP8TransformDC = TransformDC;
|
||||
VP8TransformDCUV = TransformDCUV;
|
||||
VP8TransformAC3 = TransformAC3;
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
VP8TransformWHT = TransformWHT_C;
|
||||
VP8Transform = TransformTwo_C;
|
||||
VP8TransformDC = TransformDC_C;
|
||||
VP8TransformAC3 = TransformAC3_C;
|
||||
#endif
|
||||
VP8TransformUV = TransformUV_C;
|
||||
VP8TransformDCUV = TransformDCUV_C;
|
||||
|
||||
VP8VFilter16 = VFilter16;
|
||||
VP8HFilter16 = HFilter16;
|
||||
VP8VFilter8 = VFilter8;
|
||||
VP8HFilter8 = HFilter8;
|
||||
VP8VFilter16i = VFilter16i;
|
||||
VP8HFilter16i = HFilter16i;
|
||||
VP8VFilter8i = VFilter8i;
|
||||
VP8HFilter8i = HFilter8i;
|
||||
VP8SimpleVFilter16 = SimpleVFilter16;
|
||||
VP8SimpleHFilter16 = SimpleHFilter16;
|
||||
VP8SimpleVFilter16i = SimpleVFilter16i;
|
||||
VP8SimpleHFilter16i = SimpleHFilter16i;
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
VP8VFilter16 = VFilter16_C;
|
||||
VP8VFilter16i = VFilter16i_C;
|
||||
VP8HFilter16 = HFilter16_C;
|
||||
VP8VFilter8 = VFilter8_C;
|
||||
VP8VFilter8i = VFilter8i_C;
|
||||
VP8SimpleVFilter16 = SimpleVFilter16_C;
|
||||
VP8SimpleHFilter16 = SimpleHFilter16_C;
|
||||
VP8SimpleVFilter16i = SimpleVFilter16i_C;
|
||||
VP8SimpleHFilter16i = SimpleHFilter16i_C;
|
||||
#endif
|
||||
|
||||
VP8PredLuma4[0] = DC4;
|
||||
VP8PredLuma4[1] = TM4;
|
||||
VP8PredLuma4[2] = VE4;
|
||||
VP8PredLuma4[3] = HE4;
|
||||
VP8PredLuma4[4] = RD4;
|
||||
VP8PredLuma4[5] = VR4;
|
||||
VP8PredLuma4[6] = LD4;
|
||||
VP8PredLuma4[7] = VL4;
|
||||
VP8PredLuma4[8] = HD4;
|
||||
VP8PredLuma4[9] = HU4;
|
||||
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
VP8HFilter16i = HFilter16i_C;
|
||||
VP8HFilter8 = HFilter8_C;
|
||||
VP8HFilter8i = HFilter8i_C;
|
||||
#endif
|
||||
|
||||
VP8PredLuma16[0] = DC16;
|
||||
VP8PredLuma16[1] = TM16;
|
||||
VP8PredLuma16[2] = VE16;
|
||||
VP8PredLuma16[3] = HE16;
|
||||
VP8PredLuma16[4] = DC16NoTop;
|
||||
VP8PredLuma16[5] = DC16NoLeft;
|
||||
VP8PredLuma16[6] = DC16NoTopLeft;
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
VP8PredLuma4[0] = DC4_C;
|
||||
VP8PredLuma4[1] = TM4_C;
|
||||
VP8PredLuma4[2] = VE4_C;
|
||||
VP8PredLuma4[4] = RD4_C;
|
||||
VP8PredLuma4[6] = LD4_C;
|
||||
#endif
|
||||
|
||||
VP8PredChroma8[0] = DC8uv;
|
||||
VP8PredChroma8[1] = TM8uv;
|
||||
VP8PredChroma8[2] = VE8uv;
|
||||
VP8PredChroma8[3] = HE8uv;
|
||||
VP8PredChroma8[4] = DC8uvNoTop;
|
||||
VP8PredChroma8[5] = DC8uvNoLeft;
|
||||
VP8PredChroma8[6] = DC8uvNoTopLeft;
|
||||
VP8PredLuma4[3] = HE4_C;
|
||||
VP8PredLuma4[5] = VR4_C;
|
||||
VP8PredLuma4[7] = VL4_C;
|
||||
VP8PredLuma4[8] = HD4_C;
|
||||
VP8PredLuma4[9] = HU4_C;
|
||||
|
||||
VP8DitherCombine8x8 = DitherCombine8x8;
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
VP8PredLuma16[0] = DC16_C;
|
||||
VP8PredLuma16[1] = TM16_C;
|
||||
VP8PredLuma16[2] = VE16_C;
|
||||
VP8PredLuma16[3] = HE16_C;
|
||||
VP8PredLuma16[4] = DC16NoTop_C;
|
||||
VP8PredLuma16[5] = DC16NoLeft_C;
|
||||
VP8PredLuma16[6] = DC16NoTopLeft_C;
|
||||
|
||||
VP8PredChroma8[0] = DC8uv_C;
|
||||
VP8PredChroma8[1] = TM8uv_C;
|
||||
VP8PredChroma8[2] = VE8uv_C;
|
||||
VP8PredChroma8[3] = HE8uv_C;
|
||||
VP8PredChroma8[4] = DC8uvNoTop_C;
|
||||
VP8PredChroma8[5] = DC8uvNoLeft_C;
|
||||
VP8PredChroma8[6] = DC8uvNoTopLeft_C;
|
||||
#endif
|
||||
|
||||
VP8DitherCombine8x8 = DitherCombine8x8_C;
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
|
@ -770,11 +822,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
|
|||
#endif
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (VP8GetCPUInfo(kNEON)) {
|
||||
VP8DspInitNEON();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MIPS32)
|
||||
if (VP8GetCPUInfo(kMIPS32)) {
|
||||
VP8DspInitMIPS32();
|
||||
|
@ -791,5 +838,57 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
|
|||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
VP8DspInitNEON();
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(VP8TransformWHT != NULL);
|
||||
assert(VP8Transform != NULL);
|
||||
assert(VP8TransformDC != NULL);
|
||||
assert(VP8TransformAC3 != NULL);
|
||||
assert(VP8TransformUV != NULL);
|
||||
assert(VP8TransformDCUV != NULL);
|
||||
assert(VP8VFilter16 != NULL);
|
||||
assert(VP8HFilter16 != NULL);
|
||||
assert(VP8VFilter8 != NULL);
|
||||
assert(VP8HFilter8 != NULL);
|
||||
assert(VP8VFilter16i != NULL);
|
||||
assert(VP8HFilter16i != NULL);
|
||||
assert(VP8VFilter8i != NULL);
|
||||
assert(VP8HFilter8i != NULL);
|
||||
assert(VP8SimpleVFilter16 != NULL);
|
||||
assert(VP8SimpleHFilter16 != NULL);
|
||||
assert(VP8SimpleVFilter16i != NULL);
|
||||
assert(VP8SimpleHFilter16i != NULL);
|
||||
assert(VP8PredLuma4[0] != NULL);
|
||||
assert(VP8PredLuma4[1] != NULL);
|
||||
assert(VP8PredLuma4[2] != NULL);
|
||||
assert(VP8PredLuma4[3] != NULL);
|
||||
assert(VP8PredLuma4[4] != NULL);
|
||||
assert(VP8PredLuma4[5] != NULL);
|
||||
assert(VP8PredLuma4[6] != NULL);
|
||||
assert(VP8PredLuma4[7] != NULL);
|
||||
assert(VP8PredLuma4[8] != NULL);
|
||||
assert(VP8PredLuma4[9] != NULL);
|
||||
assert(VP8PredLuma16[0] != NULL);
|
||||
assert(VP8PredLuma16[1] != NULL);
|
||||
assert(VP8PredLuma16[2] != NULL);
|
||||
assert(VP8PredLuma16[3] != NULL);
|
||||
assert(VP8PredLuma16[4] != NULL);
|
||||
assert(VP8PredLuma16[5] != NULL);
|
||||
assert(VP8PredLuma16[6] != NULL);
|
||||
assert(VP8PredChroma8[0] != NULL);
|
||||
assert(VP8PredChroma8[1] != NULL);
|
||||
assert(VP8PredChroma8[2] != NULL);
|
||||
assert(VP8PredChroma8[3] != NULL);
|
||||
assert(VP8PredChroma8[4] != NULL);
|
||||
assert(VP8PredChroma8[5] != NULL);
|
||||
assert(VP8PredChroma8[6] != NULL);
|
||||
assert(VP8DitherCombine8x8 != NULL);
|
||||
|
||||
dec_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
}
|
|
@ -11,11 +11,14 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#define USE_STATIC_TABLES // undefine to have run-time table initialization
|
||||
// define to 0 to have run-time table initialization
|
||||
#if !defined(USE_STATIC_TABLES)
|
||||
#define USE_STATIC_TABLES 1 // ALTERNATE_CODE
|
||||
#endif
|
||||
|
||||
#ifdef USE_STATIC_TABLES
|
||||
#if (USE_STATIC_TABLES == 1)
|
||||
|
||||
static const uint8_t abs0[255 + 255 + 1] = {
|
||||
0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
|
||||
|
@ -337,7 +340,7 @@ static uint8_t clip1[255 + 511 + 1];
|
|||
// and make sure it's set to true _last_ (so as to be thread-safe)
|
||||
static volatile int tables_ok = 0;
|
||||
|
||||
#endif
|
||||
#endif // USE_STATIC_TABLES
|
||||
|
||||
const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
|
||||
const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
|
||||
|
@ -345,7 +348,7 @@ const uint8_t* const VP8kclip1 = &clip1[255];
|
|||
const uint8_t* const VP8kabs0 = &abs0[255];
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
|
||||
#if !defined(USE_STATIC_TABLES)
|
||||
#if (USE_STATIC_TABLES == 0)
|
||||
int i;
|
||||
if (!tables_ok) {
|
||||
for (i = -255; i <= 255; ++i) {
|
|
@ -12,11 +12,11 @@
|
|||
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS32)
|
||||
|
||||
#include "./mips_macro.h"
|
||||
#include "src/dsp/mips_macro.h"
|
||||
|
||||
static const int kC1 = 20091 + (1 << 16);
|
||||
static const int kC2 = 35468;
|
|
@ -12,11 +12,11 @@
|
|||
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
|
||||
#include "./mips_macro.h"
|
||||
#include "src/dsp/mips_macro.h"
|
||||
|
||||
static const int kC1 = 20091 + (1 << 16);
|
||||
static const int kC2 = 35468;
|
|
@ -12,11 +12,11 @@
|
|||
// Author(s): Prashant Patil (prashant.patil@imgtec.com)
|
||||
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MSA)
|
||||
|
||||
#include "./msa_macro.h"
|
||||
#include "src/dsp/msa_macro.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Transforms
|
||||
|
@ -222,6 +222,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
|||
const v16i8 cnst4b = __msa_ldi_b(4); \
|
||||
const v16i8 cnst3b = __msa_ldi_b(3); \
|
||||
const v8i16 cnst9h = __msa_ldi_h(9); \
|
||||
const v8i16 cnst63h = __msa_ldi_h(63); \
|
||||
\
|
||||
FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \
|
||||
filt = __msa_subs_s_b(p1_m, q1_m); \
|
||||
|
@ -241,9 +242,9 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
|||
ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
|
||||
/* update q2/p2 */ \
|
||||
temp0 = filt_r * cnst9h; \
|
||||
temp1 = ADDVI_H(temp0, 63); \
|
||||
temp1 = temp0 + cnst63h; \
|
||||
temp2 = filt_l * cnst9h; \
|
||||
temp3 = ADDVI_H(temp2, 63); \
|
||||
temp3 = temp2 + cnst63h; \
|
||||
FILT2(q2_m, p2_m, q2, p2); \
|
||||
/* update q1/p1 */ \
|
||||
temp1 = temp1 + temp0; \
|
||||
|
@ -708,7 +709,7 @@ static void VE4(uint8_t* dst) { // vertical
|
|||
const uint32_t val0 = LW(ptop + 0);
|
||||
const uint32_t val1 = LW(ptop + 4);
|
||||
uint32_t out;
|
||||
v16u8 A, B, C, AC, B2, R;
|
||||
v16u8 A = { 0 }, B, C, AC, B2, R;
|
||||
|
||||
INSERT_W2_UB(val0, val1, A);
|
||||
B = SLDI_UB(A, A, 1);
|
||||
|
@ -725,7 +726,7 @@ static void RD4(uint8_t* dst) { // Down-right
|
|||
uint32_t val0 = LW(ptop + 0);
|
||||
uint32_t val1 = LW(ptop + 4);
|
||||
uint32_t val2, val3;
|
||||
v16u8 A, B, C, AC, B2, R, A1;
|
||||
v16u8 A, B, C, AC, B2, R, A1 = { 0 };
|
||||
|
||||
INSERT_W2_UB(val0, val1, A1);
|
||||
A = SLDI_UB(A1, A1, 12);
|
||||
|
@ -753,7 +754,7 @@ static void LD4(uint8_t* dst) { // Down-Left
|
|||
uint32_t val0 = LW(ptop + 0);
|
||||
uint32_t val1 = LW(ptop + 4);
|
||||
uint32_t val2, val3;
|
||||
v16u8 A, B, C, AC, B2, R;
|
||||
v16u8 A = { 0 }, B, C, AC, B2, R;
|
||||
|
||||
INSERT_W2_UB(val0, val1, A);
|
||||
B = SLDI_UB(A, A, 1);
|
File diff suppressed because it is too large
Load Diff
|
@ -12,23 +12,25 @@
|
|||
// Author: somnath@google.com (Somnath Banerjee)
|
||||
// cduvivier@google.com (Christian Duvivier)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
||||
// The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
|
||||
// one it seems => disable it by default. Uncomment the following to enable:
|
||||
// #define USE_TRANSFORM_AC3
|
||||
#if !defined(USE_TRANSFORM_AC3)
|
||||
#define USE_TRANSFORM_AC3 0 // ALTERNATE_CODE
|
||||
#endif
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include "./common_sse2.h"
|
||||
#include "../dec/vp8i_dec.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/dsp/common_sse2.h"
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Transforms (Paragraph 14.4)
|
||||
|
||||
static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
|
||||
// This implementation makes use of 16-bit fixed point versions of two
|
||||
// multiply constants:
|
||||
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
|
||||
|
@ -193,7 +195,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
|
|||
}
|
||||
}
|
||||
|
||||
#if defined(USE_TRANSFORM_AC3)
|
||||
#if (USE_TRANSFORM_AC3 == 1)
|
||||
#define MUL(a, b) (((a) * (b)) >> 16)
|
||||
static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
||||
static const int kC1 = 20091 + (1 << 16);
|
||||
|
@ -248,7 +250,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
|
|||
_mm_subs_epu8((p), (q)))
|
||||
|
||||
// Shift each byte of "x" by 3 bits while preserving by the sign bit.
|
||||
static WEBP_INLINE void SignedShift8b(__m128i* const x) {
|
||||
static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
|
||||
const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
|
||||
|
@ -258,8 +260,8 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
|
|||
}
|
||||
|
||||
#define FLIP_SIGN_BIT2(a, b) { \
|
||||
a = _mm_xor_si128(a, sign_bit); \
|
||||
b = _mm_xor_si128(b, sign_bit); \
|
||||
(a) = _mm_xor_si128(a, sign_bit); \
|
||||
(b) = _mm_xor_si128(b, sign_bit); \
|
||||
}
|
||||
|
||||
#define FLIP_SIGN_BIT4(a, b, c, d) { \
|
||||
|
@ -268,11 +270,11 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
|
|||
}
|
||||
|
||||
// input/output is uint8_t
|
||||
static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
|
||||
const __m128i* const p0,
|
||||
const __m128i* const q0,
|
||||
const __m128i* const q1,
|
||||
int hev_thresh, __m128i* const not_hev) {
|
||||
static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
|
||||
const __m128i* const p0,
|
||||
const __m128i* const q0,
|
||||
const __m128i* const q1,
|
||||
int hev_thresh, __m128i* const not_hev) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i t_1 = MM_ABS(*p1, *p0);
|
||||
const __m128i t_2 = MM_ABS(*q1, *q0);
|
||||
|
@ -285,11 +287,11 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
|
|||
}
|
||||
|
||||
// input pixels are int8_t
|
||||
static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
|
||||
const __m128i* const p0,
|
||||
const __m128i* const q0,
|
||||
const __m128i* const q1,
|
||||
__m128i* const delta) {
|
||||
static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1,
|
||||
const __m128i* const p0,
|
||||
const __m128i* const q0,
|
||||
const __m128i* const q1,
|
||||
__m128i* const delta) {
|
||||
// beware of addition order, for saturation!
|
||||
const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1); // p1 - q1
|
||||
const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0); // q0 - p0
|
||||
|
@ -300,15 +302,16 @@ static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
|
|||
}
|
||||
|
||||
// input and output are int8_t
|
||||
static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
|
||||
const __m128i* const fl) {
|
||||
static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0,
|
||||
__m128i* const q0,
|
||||
const __m128i* const fl) {
|
||||
const __m128i k3 = _mm_set1_epi8(3);
|
||||
const __m128i k4 = _mm_set1_epi8(4);
|
||||
__m128i v3 = _mm_adds_epi8(*fl, k3);
|
||||
__m128i v4 = _mm_adds_epi8(*fl, k4);
|
||||
|
||||
SignedShift8b(&v4); // v4 >> 3
|
||||
SignedShift8b(&v3); // v3 >> 3
|
||||
SignedShift8b_SSE2(&v4); // v4 >> 3
|
||||
SignedShift8b_SSE2(&v3); // v3 >> 3
|
||||
*q0 = _mm_subs_epi8(*q0, v4); // q0 -= v4
|
||||
*p0 = _mm_adds_epi8(*p0, v3); // p0 += v3
|
||||
}
|
||||
|
@ -317,9 +320,9 @@ static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
|
|||
// Update operations:
|
||||
// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
|
||||
// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
|
||||
static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
|
||||
const __m128i* const a0_lo,
|
||||
const __m128i* const a0_hi) {
|
||||
static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
|
||||
const __m128i* const a0_lo,
|
||||
const __m128i* const a0_hi) {
|
||||
const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
|
||||
const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
|
||||
const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
|
||||
|
@ -330,11 +333,11 @@ static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
|
|||
}
|
||||
|
||||
// input pixels are uint8_t
|
||||
static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
|
||||
const __m128i* const p0,
|
||||
const __m128i* const q0,
|
||||
const __m128i* const q1,
|
||||
int thresh, __m128i* const mask) {
|
||||
static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
|
||||
const __m128i* const p0,
|
||||
const __m128i* const q0,
|
||||
const __m128i* const q1,
|
||||
int thresh, __m128i* const mask) {
|
||||
const __m128i m_thresh = _mm_set1_epi8(thresh);
|
||||
const __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
|
||||
const __m128i kFE = _mm_set1_epi8(0xFE);
|
||||
|
@ -353,28 +356,29 @@ static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
|
|||
// Edge filtering functions
|
||||
|
||||
// Applies filter on 2 pixels (p0 and q0)
|
||||
static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
|
||||
__m128i* const q0, __m128i* const q1,
|
||||
int thresh) {
|
||||
static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
|
||||
__m128i* const q0, __m128i* const q1,
|
||||
int thresh) {
|
||||
__m128i a, mask;
|
||||
const __m128i sign_bit = _mm_set1_epi8(0x80);
|
||||
// convert p1/q1 to int8_t (for GetBaseDelta)
|
||||
// convert p1/q1 to int8_t (for GetBaseDelta_SSE2)
|
||||
const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
|
||||
const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
|
||||
|
||||
NeedsFilter(p1, p0, q0, q1, thresh, &mask);
|
||||
NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &mask);
|
||||
|
||||
FLIP_SIGN_BIT2(*p0, *q0);
|
||||
GetBaseDelta(&p1s, p0, q0, &q1s, &a);
|
||||
GetBaseDelta_SSE2(&p1s, p0, q0, &q1s, &a);
|
||||
a = _mm_and_si128(a, mask); // mask filter values we don't care about
|
||||
DoSimpleFilter(p0, q0, &a);
|
||||
DoSimpleFilter_SSE2(p0, q0, &a);
|
||||
FLIP_SIGN_BIT2(*p0, *q0);
|
||||
}
|
||||
|
||||
// Applies filter on 4 pixels (p1, p0, q0 and q1)
|
||||
static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
|
||||
__m128i* const q0, __m128i* const q1,
|
||||
const __m128i* const mask, int hev_thresh) {
|
||||
static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
|
||||
__m128i* const q0, __m128i* const q1,
|
||||
const __m128i* const mask,
|
||||
int hev_thresh) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i sign_bit = _mm_set1_epi8(0x80);
|
||||
const __m128i k64 = _mm_set1_epi8(64);
|
||||
|
@ -384,7 +388,7 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
|
|||
__m128i t1, t2, t3;
|
||||
|
||||
// compute hev mask
|
||||
GetNotHEV(p1, p0, q0, q1, hev_thresh, ¬_hev);
|
||||
GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, ¬_hev);
|
||||
|
||||
// convert to signed values
|
||||
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
|
||||
|
@ -399,8 +403,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
|
|||
|
||||
t2 = _mm_adds_epi8(t1, k3); // 3 * (q0 - p0) + hev(p1 - q1) + 3
|
||||
t3 = _mm_adds_epi8(t1, k4); // 3 * (q0 - p0) + hev(p1 - q1) + 4
|
||||
SignedShift8b(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
|
||||
SignedShift8b(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
|
||||
SignedShift8b_SSE2(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
|
||||
SignedShift8b_SSE2(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
|
||||
*p0 = _mm_adds_epi8(*p0, t2); // p0 += t2
|
||||
*q0 = _mm_subs_epi8(*q0, t3); // q0 -= t3
|
||||
FLIP_SIGN_BIT2(*p0, *q0);
|
||||
|
@ -417,25 +421,26 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
|
|||
}
|
||||
|
||||
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
|
||||
static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
|
||||
__m128i* const p0, __m128i* const q0,
|
||||
__m128i* const q1, __m128i* const q2,
|
||||
const __m128i* const mask, int hev_thresh) {
|
||||
static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
|
||||
__m128i* const p0, __m128i* const q0,
|
||||
__m128i* const q1, __m128i* const q2,
|
||||
const __m128i* const mask,
|
||||
int hev_thresh) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i sign_bit = _mm_set1_epi8(0x80);
|
||||
__m128i a, not_hev;
|
||||
|
||||
// compute hev mask
|
||||
GetNotHEV(p1, p0, q0, q1, hev_thresh, ¬_hev);
|
||||
GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, ¬_hev);
|
||||
|
||||
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
|
||||
FLIP_SIGN_BIT2(*p2, *q2);
|
||||
GetBaseDelta(p1, p0, q0, q1, &a);
|
||||
GetBaseDelta_SSE2(p1, p0, q0, q1, &a);
|
||||
|
||||
{ // do simple filter on pixels with hev
|
||||
const __m128i m = _mm_andnot_si128(not_hev, *mask);
|
||||
const __m128i f = _mm_and_si128(a, m);
|
||||
DoSimpleFilter(p0, q0, &f);
|
||||
DoSimpleFilter_SSE2(p0, q0, &f);
|
||||
}
|
||||
|
||||
{ // do strong filter on pixels with not hev
|
||||
|
@ -460,15 +465,15 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
|
|||
const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo); // Filter * 27 + 63
|
||||
const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi); // Filter * 27 + 63
|
||||
|
||||
Update2Pixels(p2, q2, &a2_lo, &a2_hi);
|
||||
Update2Pixels(p1, q1, &a1_lo, &a1_hi);
|
||||
Update2Pixels(p0, q0, &a0_lo, &a0_hi);
|
||||
Update2Pixels_SSE2(p2, q2, &a2_lo, &a2_hi);
|
||||
Update2Pixels_SSE2(p1, q1, &a1_lo, &a1_hi);
|
||||
Update2Pixels_SSE2(p0, q0, &a0_lo, &a0_hi);
|
||||
}
|
||||
}
|
||||
|
||||
// reads 8 rows across a vertical edge.
|
||||
static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
|
||||
__m128i* const p, __m128i* const q) {
|
||||
static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
|
||||
__m128i* const p, __m128i* const q) {
|
||||
// A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
|
||||
// A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
|
||||
const __m128i A0 = _mm_set_epi32(
|
||||
|
@ -494,11 +499,11 @@ static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
|
|||
*q = _mm_unpackhi_epi32(C0, C1);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void Load16x4(const uint8_t* const r0,
|
||||
const uint8_t* const r8,
|
||||
int stride,
|
||||
__m128i* const p1, __m128i* const p0,
|
||||
__m128i* const q0, __m128i* const q1) {
|
||||
static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0,
|
||||
const uint8_t* const r8,
|
||||
int stride,
|
||||
__m128i* const p1, __m128i* const p0,
|
||||
__m128i* const q0, __m128i* const q1) {
|
||||
// Assume the pixels around the edge (|) are numbered as follows
|
||||
// 00 01 | 02 03
|
||||
// 10 11 | 12 13
|
||||
|
@ -514,8 +519,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
|
|||
// q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
|
||||
// p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
|
||||
// q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
|
||||
Load8x4(r0, stride, p1, q0);
|
||||
Load8x4(r8, stride, p0, q1);
|
||||
Load8x4_SSE2(r0, stride, p1, q0);
|
||||
Load8x4_SSE2(r8, stride, p0, q1);
|
||||
|
||||
{
|
||||
// p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
|
||||
|
@ -531,7 +536,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
|
|||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
|
||||
static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
|
||||
uint8_t* dst, int stride) {
|
||||
int i;
|
||||
for (i = 0; i < 4; ++i, dst += stride) {
|
||||
WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
|
||||
|
@ -540,12 +546,12 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
|
|||
}
|
||||
|
||||
// Transpose back and store
|
||||
static WEBP_INLINE void Store16x4(const __m128i* const p1,
|
||||
const __m128i* const p0,
|
||||
const __m128i* const q0,
|
||||
const __m128i* const q1,
|
||||
uint8_t* r0, uint8_t* r8,
|
||||
int stride) {
|
||||
static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1,
|
||||
const __m128i* const p0,
|
||||
const __m128i* const q0,
|
||||
const __m128i* const q1,
|
||||
uint8_t* r0, uint8_t* r8,
|
||||
int stride) {
|
||||
__m128i t1, p1_s, p0_s, q0_s, q1_s;
|
||||
|
||||
// p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
|
||||
|
@ -572,55 +578,55 @@ static WEBP_INLINE void Store16x4(const __m128i* const p1,
|
|||
p1_s = _mm_unpacklo_epi16(t1, q1_s);
|
||||
q1_s = _mm_unpackhi_epi16(t1, q1_s);
|
||||
|
||||
Store4x4(&p0_s, r0, stride);
|
||||
Store4x4_SSE2(&p0_s, r0, stride);
|
||||
r0 += 4 * stride;
|
||||
Store4x4(&q0_s, r0, stride);
|
||||
Store4x4_SSE2(&q0_s, r0, stride);
|
||||
|
||||
Store4x4(&p1_s, r8, stride);
|
||||
Store4x4_SSE2(&p1_s, r8, stride);
|
||||
r8 += 4 * stride;
|
||||
Store4x4(&q1_s, r8, stride);
|
||||
Store4x4_SSE2(&q1_s, r8, stride);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Simple In-loop filtering (Paragraph 15.2)
|
||||
|
||||
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
|
||||
static void SimpleVFilter16_SSE2(uint8_t* p, int stride, int thresh) {
|
||||
// Load
|
||||
__m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
|
||||
__m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
|
||||
__m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
|
||||
__m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);
|
||||
|
||||
DoFilter2(&p1, &p0, &q0, &q1, thresh);
|
||||
DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
|
||||
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i*)&p[-stride], p0);
|
||||
_mm_storeu_si128((__m128i*)&p[0], q0);
|
||||
}
|
||||
|
||||
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
|
||||
static void SimpleHFilter16_SSE2(uint8_t* p, int stride, int thresh) {
|
||||
__m128i p1, p0, q0, q1;
|
||||
|
||||
p -= 2; // beginning of p1
|
||||
|
||||
Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
|
||||
DoFilter2(&p1, &p0, &q0, &q1, thresh);
|
||||
Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
|
||||
Load16x4_SSE2(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
|
||||
DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
|
||||
Store16x4_SSE2(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
|
||||
}
|
||||
|
||||
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
|
||||
static void SimpleVFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
|
||||
int k;
|
||||
for (k = 3; k > 0; --k) {
|
||||
p += 4 * stride;
|
||||
SimpleVFilter16(p, stride, thresh);
|
||||
SimpleVFilter16_SSE2(p, stride, thresh);
|
||||
}
|
||||
}
|
||||
|
||||
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
|
||||
static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
|
||||
int k;
|
||||
for (k = 3; k > 0; --k) {
|
||||
p += 4;
|
||||
SimpleHFilter16(p, stride, thresh);
|
||||
SimpleHFilter16_SSE2(p, stride, thresh);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -628,60 +634,60 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
|
|||
// Complex In-loop filtering (Paragraph 15.3)
|
||||
|
||||
#define MAX_DIFF1(p3, p2, p1, p0, m) do { \
|
||||
m = MM_ABS(p1, p0); \
|
||||
m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
|
||||
m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
|
||||
(m) = MM_ABS(p1, p0); \
|
||||
(m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \
|
||||
(m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \
|
||||
} while (0)
|
||||
|
||||
#define MAX_DIFF2(p3, p2, p1, p0, m) do { \
|
||||
m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
|
||||
m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
|
||||
m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
|
||||
(m) = _mm_max_epu8(m, MM_ABS(p1, p0)); \
|
||||
(m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \
|
||||
(m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \
|
||||
} while (0)
|
||||
|
||||
#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) { \
|
||||
e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]); \
|
||||
e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]); \
|
||||
e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]); \
|
||||
e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]); \
|
||||
(e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]); \
|
||||
(e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]); \
|
||||
(e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]); \
|
||||
(e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]); \
|
||||
}
|
||||
|
||||
#define LOADUV_H_EDGE(p, u, v, stride) do { \
|
||||
const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
|
||||
const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]); \
|
||||
p = _mm_unpacklo_epi64(U, V); \
|
||||
(p) = _mm_unpacklo_epi64(U, V); \
|
||||
} while (0)
|
||||
|
||||
#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) { \
|
||||
LOADUV_H_EDGE(e1, u, v, 0 * stride); \
|
||||
LOADUV_H_EDGE(e2, u, v, 1 * stride); \
|
||||
LOADUV_H_EDGE(e3, u, v, 2 * stride); \
|
||||
LOADUV_H_EDGE(e4, u, v, 3 * stride); \
|
||||
LOADUV_H_EDGE(e1, u, v, 0 * (stride)); \
|
||||
LOADUV_H_EDGE(e2, u, v, 1 * (stride)); \
|
||||
LOADUV_H_EDGE(e3, u, v, 2 * (stride)); \
|
||||
LOADUV_H_EDGE(e4, u, v, 3 * (stride)); \
|
||||
}
|
||||
|
||||
#define STOREUV(p, u, v, stride) { \
|
||||
_mm_storel_epi64((__m128i*)&u[(stride)], p); \
|
||||
p = _mm_srli_si128(p, 8); \
|
||||
_mm_storel_epi64((__m128i*)&v[(stride)], p); \
|
||||
_mm_storel_epi64((__m128i*)&(u)[(stride)], p); \
|
||||
(p) = _mm_srli_si128(p, 8); \
|
||||
_mm_storel_epi64((__m128i*)&(v)[(stride)], p); \
|
||||
}
|
||||
|
||||
static WEBP_INLINE void ComplexMask(const __m128i* const p1,
|
||||
const __m128i* const p0,
|
||||
const __m128i* const q0,
|
||||
const __m128i* const q1,
|
||||
int thresh, int ithresh,
|
||||
__m128i* const mask) {
|
||||
static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1,
|
||||
const __m128i* const p0,
|
||||
const __m128i* const q0,
|
||||
const __m128i* const q1,
|
||||
int thresh, int ithresh,
|
||||
__m128i* const mask) {
|
||||
const __m128i it = _mm_set1_epi8(ithresh);
|
||||
const __m128i diff = _mm_subs_epu8(*mask, it);
|
||||
const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
|
||||
__m128i filter_mask;
|
||||
NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
|
||||
NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &filter_mask);
|
||||
*mask = _mm_and_si128(thresh_mask, filter_mask);
|
||||
}
|
||||
|
||||
// on macroblock edges
|
||||
static void VFilter16(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter16_SSE2(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
__m128i t1;
|
||||
__m128i mask;
|
||||
__m128i p2, p1, p0, q0, q1, q2;
|
||||
|
@ -694,8 +700,8 @@ static void VFilter16(uint8_t* p, int stride,
|
|||
LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
|
||||
MAX_DIFF2(t1, q2, q1, q0, mask);
|
||||
|
||||
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
|
||||
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
|
||||
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
|
||||
|
@ -706,28 +712,28 @@ static void VFilter16(uint8_t* p, int stride,
|
|||
_mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
|
||||
}
|
||||
|
||||
static void HFilter16(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter16_SSE2(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
__m128i mask;
|
||||
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
|
||||
|
||||
uint8_t* const b = p - 4;
|
||||
Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0
|
||||
Load16x4_SSE2(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
|
||||
MAX_DIFF1(p3, p2, p1, p0, mask);
|
||||
|
||||
Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
|
||||
Load16x4_SSE2(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
|
||||
MAX_DIFF2(q3, q2, q1, q0, mask);
|
||||
|
||||
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
|
||||
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
|
||||
|
||||
Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
|
||||
Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
|
||||
Store16x4_SSE2(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
|
||||
Store16x4_SSE2(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
|
||||
}
|
||||
|
||||
// on three inner edges
|
||||
static void VFilter16i(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter16i_SSE2(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
int k;
|
||||
__m128i p3, p2, p1, p0; // loop invariants
|
||||
|
||||
|
@ -744,8 +750,8 @@ static void VFilter16i(uint8_t* p, int stride,
|
|||
|
||||
// p3 and p2 are not just temporary variables here: they will be
|
||||
// re-used for next span. And q2/q3 will become p1/p0 accordingly.
|
||||
ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
|
||||
DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
|
||||
ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
|
||||
DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
|
||||
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i*)&b[0 * stride], p1);
|
||||
|
@ -759,12 +765,12 @@ static void VFilter16i(uint8_t* p, int stride,
|
|||
}
|
||||
}
|
||||
|
||||
static void HFilter16i(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter16i_SSE2(uint8_t* p, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
int k;
|
||||
__m128i p3, p2, p1, p0; // loop invariants
|
||||
|
||||
Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue
|
||||
Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue
|
||||
|
||||
for (k = 3; k > 0; --k) {
|
||||
__m128i mask, tmp1, tmp2;
|
||||
|
@ -773,13 +779,13 @@ static void HFilter16i(uint8_t* p, int stride,
|
|||
p += 4; // beginning of q0 (and next span)
|
||||
|
||||
MAX_DIFF1(p3, p2, p1, p0, mask); // compute partial mask
|
||||
Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
|
||||
Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
|
||||
MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
|
||||
|
||||
ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
|
||||
DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
|
||||
ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
|
||||
DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
|
||||
|
||||
Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
|
||||
Store16x4_SSE2(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
|
||||
|
||||
// rotate samples
|
||||
p1 = tmp1;
|
||||
|
@ -788,8 +794,8 @@ static void HFilter16i(uint8_t* p, int stride,
|
|||
}
|
||||
|
||||
// 8-pixels wide variant, for chroma filtering
|
||||
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
__m128i mask;
|
||||
__m128i t1, p2, p1, p0, q0, q1, q2;
|
||||
|
||||
|
@ -801,8 +807,8 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
|
|||
LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
|
||||
MAX_DIFF2(t1, q2, q1, q0, mask);
|
||||
|
||||
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
|
||||
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
|
||||
|
||||
// Store
|
||||
STOREUV(p2, u, v, -3 * stride);
|
||||
|
@ -813,28 +819,28 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
|
|||
STOREUV(q2, u, v, 2 * stride);
|
||||
}
|
||||
|
||||
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
__m128i mask;
|
||||
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
|
||||
|
||||
uint8_t* const tu = u - 4;
|
||||
uint8_t* const tv = v - 4;
|
||||
Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0
|
||||
Load16x4_SSE2(tu, tv, stride, &p3, &p2, &p1, &p0);
|
||||
MAX_DIFF1(p3, p2, p1, p0, mask);
|
||||
|
||||
Load16x4(u, v, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
|
||||
Load16x4_SSE2(u, v, stride, &q0, &q1, &q2, &q3);
|
||||
MAX_DIFF2(q3, q2, q1, q0, mask);
|
||||
|
||||
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
|
||||
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
|
||||
|
||||
Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
|
||||
Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
|
||||
Store16x4_SSE2(&p3, &p2, &p1, &p0, tu, tv, stride);
|
||||
Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
|
||||
}
|
||||
|
||||
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
__m128i mask;
|
||||
__m128i t1, t2, p1, p0, q0, q1;
|
||||
|
||||
|
@ -849,8 +855,8 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
|
|||
LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
|
||||
MAX_DIFF2(t2, t1, q1, q0, mask);
|
||||
|
||||
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
|
||||
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
|
||||
|
||||
// Store
|
||||
STOREUV(p1, u, v, -2 * stride);
|
||||
|
@ -859,24 +865,24 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
|
|||
STOREUV(q1, u, v, 1 * stride);
|
||||
}
|
||||
|
||||
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
|
||||
int thresh, int ithresh, int hev_thresh) {
|
||||
__m128i mask;
|
||||
__m128i t1, t2, p1, p0, q0, q1;
|
||||
Load16x4(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
|
||||
Load16x4_SSE2(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
|
||||
MAX_DIFF1(t2, t1, p1, p0, mask);
|
||||
|
||||
u += 4; // beginning of q0
|
||||
v += 4;
|
||||
Load16x4(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
|
||||
Load16x4_SSE2(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
|
||||
MAX_DIFF2(t2, t1, q1, q0, mask);
|
||||
|
||||
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
|
||||
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
|
||||
DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
|
||||
|
||||
u -= 2; // beginning of p1
|
||||
v -= 2;
|
||||
Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
|
||||
Store16x4_SSE2(&p1, &p0, &q0, &q1, u, v, stride);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
@ -893,7 +899,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
|
|||
// where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
|
||||
// and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
|
||||
|
||||
static void VE4(uint8_t* dst) { // vertical
|
||||
static void VE4_SSE2(uint8_t* dst) { // vertical
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
|
||||
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
|
||||
|
@ -909,7 +915,7 @@ static void VE4(uint8_t* dst) { // vertical
|
|||
}
|
||||
}
|
||||
|
||||
static void LD4(uint8_t* dst) { // Down-Left
|
||||
static void LD4_SSE2(uint8_t* dst) { // Down-Left
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
|
||||
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
|
||||
|
@ -925,7 +931,7 @@ static void LD4(uint8_t* dst) { // Down-Left
|
|||
WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
|
||||
}
|
||||
|
||||
static void VR4(uint8_t* dst) { // Vertical-Right
|
||||
static void VR4_SSE2(uint8_t* dst) { // Vertical-Right
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const int I = dst[-1 + 0 * BPS];
|
||||
const int J = dst[-1 + 1 * BPS];
|
||||
|
@ -950,7 +956,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right
|
|||
DST(0, 3) = AVG3(K, J, I);
|
||||
}
|
||||
|
||||
static void VL4(uint8_t* dst) { // Vertical-Left
|
||||
static void VL4_SSE2(uint8_t* dst) { // Vertical-Left
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
|
||||
const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
|
||||
|
@ -975,7 +981,7 @@ static void VL4(uint8_t* dst) { // Vertical-Left
|
|||
DST(3, 3) = (extra_out >> 8) & 0xff;
|
||||
}
|
||||
|
||||
static void RD4(uint8_t* dst) { // Down-right
|
||||
static void RD4_SSE2(uint8_t* dst) { // Down-right
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
|
||||
const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
|
||||
|
@ -1004,7 +1010,7 @@ static void RD4(uint8_t* dst) { // Down-right
|
|||
//------------------------------------------------------------------------------
|
||||
// Luma 16x16
|
||||
|
||||
static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
|
||||
static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
|
||||
const uint8_t* top = dst - BPS;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
int y;
|
||||
|
@ -1041,11 +1047,11 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
|
|||
}
|
||||
}
|
||||
|
||||
static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
|
||||
static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
|
||||
static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
|
||||
static void TM4_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 4); }
|
||||
static void TM8uv_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 8); }
|
||||
static void TM16_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 16); }
|
||||
|
||||
static void VE16(uint8_t* dst) {
|
||||
static void VE16_SSE2(uint8_t* dst) {
|
||||
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
|
||||
int j;
|
||||
for (j = 0; j < 16; ++j) {
|
||||
|
@ -1053,7 +1059,7 @@ static void VE16(uint8_t* dst) {
|
|||
}
|
||||
}
|
||||
|
||||
static void HE16(uint8_t* dst) { // horizontal
|
||||
static void HE16_SSE2(uint8_t* dst) { // horizontal
|
||||
int j;
|
||||
for (j = 16; j > 0; --j) {
|
||||
const __m128i values = _mm_set1_epi8(dst[-1]);
|
||||
|
@ -1062,7 +1068,7 @@ static void HE16(uint8_t* dst) { // horizontal
|
|||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
|
||||
static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
|
||||
int j;
|
||||
const __m128i values = _mm_set1_epi8(v);
|
||||
for (j = 0; j < 16; ++j) {
|
||||
|
@ -1070,7 +1076,7 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
|
|||
}
|
||||
}
|
||||
|
||||
static void DC16(uint8_t* dst) { // DC
|
||||
static void DC16_SSE2(uint8_t* dst) { // DC
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
|
||||
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
|
||||
|
@ -1083,37 +1089,37 @@ static void DC16(uint8_t* dst) { // DC
|
|||
}
|
||||
{
|
||||
const int DC = _mm_cvtsi128_si32(sum) + left + 16;
|
||||
Put16(DC >> 5, dst);
|
||||
Put16_SSE2(DC >> 5, dst);
|
||||
}
|
||||
}
|
||||
|
||||
static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
|
||||
static void DC16NoTop_SSE2(uint8_t* dst) { // DC with top samples unavailable
|
||||
int DC = 8;
|
||||
int j;
|
||||
for (j = 0; j < 16; ++j) {
|
||||
DC += dst[-1 + j * BPS];
|
||||
}
|
||||
Put16(DC >> 4, dst);
|
||||
Put16_SSE2(DC >> 4, dst);
|
||||
}
|
||||
|
||||
static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
|
||||
static void DC16NoLeft_SSE2(uint8_t* dst) { // DC with left samples unavailable
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
|
||||
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
|
||||
// sum the two sads: sad8x2[0:1] + sad8x2[8:9]
|
||||
const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
|
||||
const int DC = _mm_cvtsi128_si32(sum) + 8;
|
||||
Put16(DC >> 4, dst);
|
||||
Put16_SSE2(DC >> 4, dst);
|
||||
}
|
||||
|
||||
static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
|
||||
Put16(0x80, dst);
|
||||
static void DC16NoTopLeft_SSE2(uint8_t* dst) { // DC with no top & left samples
|
||||
Put16_SSE2(0x80, dst);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Chroma
|
||||
|
||||
static void VE8uv(uint8_t* dst) { // vertical
|
||||
static void VE8uv_SSE2(uint8_t* dst) { // vertical
|
||||
int j;
|
||||
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
|
||||
for (j = 0; j < 8; ++j) {
|
||||
|
@ -1121,17 +1127,8 @@ static void VE8uv(uint8_t* dst) { // vertical
|
|||
}
|
||||
}
|
||||
|
||||
static void HE8uv(uint8_t* dst) { // horizontal
|
||||
int j;
|
||||
for (j = 0; j < 8; ++j) {
|
||||
const __m128i values = _mm_set1_epi8(dst[-1]);
|
||||
_mm_storel_epi64((__m128i*)dst, values);
|
||||
dst += BPS;
|
||||
}
|
||||
}
|
||||
|
||||
// helper for chroma-DC predictions
|
||||
static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
|
||||
static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
|
||||
int j;
|
||||
const __m128i values = _mm_set1_epi8(v);
|
||||
for (j = 0; j < 8; ++j) {
|
||||
|
@ -1139,7 +1136,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
|
|||
}
|
||||
}
|
||||
|
||||
static void DC8uv(uint8_t* dst) { // DC
|
||||
static void DC8uv_SSE2(uint8_t* dst) { // DC
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
|
||||
const __m128i sum = _mm_sad_epu8(top, zero);
|
||||
|
@ -1150,29 +1147,29 @@ static void DC8uv(uint8_t* dst) { // DC
|
|||
}
|
||||
{
|
||||
const int DC = _mm_cvtsi128_si32(sum) + left + 8;
|
||||
Put8x8uv(DC >> 4, dst);
|
||||
Put8x8uv_SSE2(DC >> 4, dst);
|
||||
}
|
||||
}
|
||||
|
||||
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
|
||||
static void DC8uvNoLeft_SSE2(uint8_t* dst) { // DC with no left samples
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
|
||||
const __m128i sum = _mm_sad_epu8(top, zero);
|
||||
const int DC = _mm_cvtsi128_si32(sum) + 4;
|
||||
Put8x8uv(DC >> 3, dst);
|
||||
Put8x8uv_SSE2(DC >> 3, dst);
|
||||
}
|
||||
|
||||
static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
|
||||
static void DC8uvNoTop_SSE2(uint8_t* dst) { // DC with no top samples
|
||||
int dc0 = 4;
|
||||
int i;
|
||||
for (i = 0; i < 8; ++i) {
|
||||
dc0 += dst[-1 + i * BPS];
|
||||
}
|
||||
Put8x8uv(dc0 >> 3, dst);
|
||||
Put8x8uv_SSE2(dc0 >> 3, dst);
|
||||
}
|
||||
|
||||
static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
|
||||
Put8x8uv(0x80, dst);
|
||||
static void DC8uvNoTopLeft_SSE2(uint8_t* dst) { // DC with nothing
|
||||
Put8x8uv_SSE2(0x80, dst);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
@ -1181,47 +1178,46 @@ static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
|
|||
extern void VP8DspInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
|
||||
VP8Transform = Transform;
|
||||
#if defined(USE_TRANSFORM_AC3)
|
||||
VP8TransformAC3 = TransformAC3;
|
||||
VP8Transform = Transform_SSE2;
|
||||
#if (USE_TRANSFORM_AC3 == 1)
|
||||
VP8TransformAC3 = TransformAC3_SSE2;
|
||||
#endif
|
||||
|
||||
VP8VFilter16 = VFilter16;
|
||||
VP8HFilter16 = HFilter16;
|
||||
VP8VFilter8 = VFilter8;
|
||||
VP8HFilter8 = HFilter8;
|
||||
VP8VFilter16i = VFilter16i;
|
||||
VP8HFilter16i = HFilter16i;
|
||||
VP8VFilter8i = VFilter8i;
|
||||
VP8HFilter8i = HFilter8i;
|
||||
VP8VFilter16 = VFilter16_SSE2;
|
||||
VP8HFilter16 = HFilter16_SSE2;
|
||||
VP8VFilter8 = VFilter8_SSE2;
|
||||
VP8HFilter8 = HFilter8_SSE2;
|
||||
VP8VFilter16i = VFilter16i_SSE2;
|
||||
VP8HFilter16i = HFilter16i_SSE2;
|
||||
VP8VFilter8i = VFilter8i_SSE2;
|
||||
VP8HFilter8i = HFilter8i_SSE2;
|
||||
|
||||
VP8SimpleVFilter16 = SimpleVFilter16;
|
||||
VP8SimpleHFilter16 = SimpleHFilter16;
|
||||
VP8SimpleVFilter16i = SimpleVFilter16i;
|
||||
VP8SimpleHFilter16i = SimpleHFilter16i;
|
||||
VP8SimpleVFilter16 = SimpleVFilter16_SSE2;
|
||||
VP8SimpleHFilter16 = SimpleHFilter16_SSE2;
|
||||
VP8SimpleVFilter16i = SimpleVFilter16i_SSE2;
|
||||
VP8SimpleHFilter16i = SimpleHFilter16i_SSE2;
|
||||
|
||||
VP8PredLuma4[1] = TM4;
|
||||
VP8PredLuma4[2] = VE4;
|
||||
VP8PredLuma4[4] = RD4;
|
||||
VP8PredLuma4[5] = VR4;
|
||||
VP8PredLuma4[6] = LD4;
|
||||
VP8PredLuma4[7] = VL4;
|
||||
VP8PredLuma4[1] = TM4_SSE2;
|
||||
VP8PredLuma4[2] = VE4_SSE2;
|
||||
VP8PredLuma4[4] = RD4_SSE2;
|
||||
VP8PredLuma4[5] = VR4_SSE2;
|
||||
VP8PredLuma4[6] = LD4_SSE2;
|
||||
VP8PredLuma4[7] = VL4_SSE2;
|
||||
|
||||
VP8PredLuma16[0] = DC16;
|
||||
VP8PredLuma16[1] = TM16;
|
||||
VP8PredLuma16[2] = VE16;
|
||||
VP8PredLuma16[3] = HE16;
|
||||
VP8PredLuma16[4] = DC16NoTop;
|
||||
VP8PredLuma16[5] = DC16NoLeft;
|
||||
VP8PredLuma16[6] = DC16NoTopLeft;
|
||||
VP8PredLuma16[0] = DC16_SSE2;
|
||||
VP8PredLuma16[1] = TM16_SSE2;
|
||||
VP8PredLuma16[2] = VE16_SSE2;
|
||||
VP8PredLuma16[3] = HE16_SSE2;
|
||||
VP8PredLuma16[4] = DC16NoTop_SSE2;
|
||||
VP8PredLuma16[5] = DC16NoLeft_SSE2;
|
||||
VP8PredLuma16[6] = DC16NoTopLeft_SSE2;
|
||||
|
||||
VP8PredChroma8[0] = DC8uv;
|
||||
VP8PredChroma8[1] = TM8uv;
|
||||
VP8PredChroma8[2] = VE8uv;
|
||||
VP8PredChroma8[3] = HE8uv;
|
||||
VP8PredChroma8[4] = DC8uvNoTop;
|
||||
VP8PredChroma8[5] = DC8uvNoLeft;
|
||||
VP8PredChroma8[6] = DC8uvNoTopLeft;
|
||||
VP8PredChroma8[0] = DC8uv_SSE2;
|
||||
VP8PredChroma8[1] = TM8uv_SSE2;
|
||||
VP8PredChroma8[2] = VE8uv_SSE2;
|
||||
VP8PredChroma8[4] = DC8uvNoTop_SSE2;
|
||||
VP8PredChroma8[5] = DC8uvNoLeft_SSE2;
|
||||
VP8PredChroma8[6] = DC8uvNoTopLeft_SSE2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE2
|
|
@ -11,15 +11,15 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include "../dec/vp8i_dec.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/dec/vp8i_dec.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
static void HE16(uint8_t* dst) { // horizontal
|
||||
static void HE16_SSE41(uint8_t* dst) { // horizontal
|
||||
int j;
|
||||
const __m128i kShuffle3 = _mm_set1_epi8(3);
|
||||
for (j = 16; j > 0; --j) {
|
||||
|
@ -36,7 +36,7 @@ static void HE16(uint8_t* dst) { // horizontal
|
|||
extern void VP8DspInitSSE41(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
|
||||
VP8PredLuma16[3] = HE16;
|
||||
VP8PredLuma16[3] = HE16_SSE41;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE41
|
|
@ -15,10 +15,10 @@
|
|||
#define WEBP_DSP_DSP_H_
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "../webp/config.h"
|
||||
#include "src/webp/config.h"
|
||||
#endif
|
||||
|
||||
#include "../webp/types.h"
|
||||
#include "src/webp/types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -38,10 +38,22 @@ extern "C" {
|
|||
# define LOCAL_GCC_PREREQ(maj, min) 0
|
||||
#endif
|
||||
|
||||
#if defined(__clang__)
|
||||
# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
|
||||
# define LOCAL_CLANG_PREREQ(maj, min) \
|
||||
(LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
|
||||
#else
|
||||
# define LOCAL_CLANG_VERSION 0
|
||||
# define LOCAL_CLANG_PREREQ(maj, min) 0
|
||||
#endif
|
||||
|
||||
#ifndef __has_builtin
|
||||
# define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
// for now, none of the optimizations below are available in emscripten
|
||||
#if !defined(EMSCRIPTEN)
|
||||
|
||||
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
|
||||
(defined(_M_X64) || defined(_M_IX86))
|
||||
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
|
||||
|
@ -68,18 +80,20 @@ extern "C" {
|
|||
#define WEBP_USE_AVX2
|
||||
#endif
|
||||
|
||||
#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
|
||||
#define WEBP_ANDROID_NEON // Android targets that might support NEON
|
||||
#endif
|
||||
|
||||
// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
|
||||
// inline assembly would need to be modified for use with Native Client.
|
||||
#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
|
||||
#if (defined(__ARM_NEON__) || \
|
||||
defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
|
||||
!defined(__native_client__)
|
||||
#define WEBP_USE_NEON
|
||||
#endif
|
||||
|
||||
#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
|
||||
defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
|
||||
#define WEBP_ANDROID_NEON // Android targets that may have NEON
|
||||
#define WEBP_USE_NEON
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
|
||||
#define WEBP_USE_NEON
|
||||
#define WEBP_USE_INTRINSICS
|
||||
|
@ -90,7 +104,7 @@ extern "C" {
|
|||
#define WEBP_USE_MIPS32
|
||||
#if (__mips_isa_rev >= 2)
|
||||
#define WEBP_USE_MIPS32_R2
|
||||
#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
|
||||
#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
|
||||
#define WEBP_USE_MIPS_DSP_R2
|
||||
#endif
|
||||
#endif
|
||||
|
@ -100,6 +114,24 @@ extern "C" {
|
|||
#define WEBP_USE_MSA
|
||||
#endif
|
||||
|
||||
#endif /* EMSCRIPTEN */
|
||||
|
||||
#ifndef WEBP_DSP_OMIT_C_CODE
|
||||
#define WEBP_DSP_OMIT_C_CODE 1
|
||||
#endif
|
||||
|
||||
#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
|
||||
#define WEBP_NEON_OMIT_C_CODE 1
|
||||
#else
|
||||
#define WEBP_NEON_OMIT_C_CODE 0
|
||||
#endif
|
||||
|
||||
#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
|
||||
#define WEBP_NEON_WORK_AROUND_GCC 1
|
||||
#else
|
||||
#define WEBP_NEON_WORK_AROUND_GCC 0
|
||||
#endif
|
||||
|
||||
// This macro prevents thread_sanitizer from reporting known concurrent writes.
|
||||
#define WEBP_TSAN_IGNORE_FUNCTION
|
||||
#if defined(__has_feature)
|
||||
|
@ -129,6 +161,11 @@ extern "C" {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
|
||||
#if !defined(WEBP_SWAP_16BIT_CSP)
|
||||
#define WEBP_SWAP_16BIT_CSP 0
|
||||
#endif
|
||||
|
||||
typedef enum {
|
||||
kSSE2,
|
||||
kSSE3,
|
||||
|
@ -143,7 +180,7 @@ typedef enum {
|
|||
} CPUFeature;
|
||||
// returns true if the CPU supports the feature.
|
||||
typedef int (*VP8CPUInfo)(CPUFeature feature);
|
||||
WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
|
||||
WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Init stub generator
|
||||
|
@ -271,6 +308,7 @@ typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
|
|||
int xo, int yo, // center position
|
||||
int W, int H); // plane dimension
|
||||
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
// This version is called with the guarantee that you can load 8 bytes and
|
||||
// 8 rows at offset src1 and src2
|
||||
typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
|
||||
|
@ -278,10 +316,13 @@ typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
|
|||
|
||||
extern VP8SSIMGetFunc VP8SSIMGet; // unclipped / unchecked
|
||||
extern VP8SSIMGetClippedFunc VP8SSIMGetClipped; // with clipping
|
||||
#endif
|
||||
|
||||
#if !defined(WEBP_DISABLE_STATS)
|
||||
typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
|
||||
const uint8_t* src2, int len);
|
||||
extern VP8AccumulateSSEFunc VP8AccumulateSSE;
|
||||
#endif
|
||||
|
||||
// must be called before using any of the above directly
|
||||
void VP8SSIMDspInit(void);
|
||||
|
@ -462,12 +503,12 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
|
|||
extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
|
||||
|
||||
// Plain-C implementation, as fall-back.
|
||||
extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
|
||||
const uint8_t* src);
|
||||
extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
|
||||
const uint8_t* src);
|
||||
extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
|
||||
extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);
|
||||
extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
|
||||
const uint8_t* src);
|
||||
extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
|
||||
const uint8_t* src);
|
||||
extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
|
||||
extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
|
||||
|
||||
// Main entry calls:
|
||||
extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
|
||||
|
@ -533,25 +574,22 @@ void WebPMultRows(uint8_t* ptr, int stride,
|
|||
int width, int num_rows, int inverse);
|
||||
|
||||
// Plain-C versions, used as fallback by some implementations.
|
||||
void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
|
||||
int width, int inverse);
|
||||
void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
|
||||
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
|
||||
int width, int inverse);
|
||||
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
|
||||
|
||||
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
|
||||
extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
|
||||
int len, int step, uint32_t* out);
|
||||
|
||||
// This function returns true if src[i] contains a value different from 0xff.
|
||||
extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
|
||||
// This function returns true if src[4*i] contains a value different from 0xff.
|
||||
extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
|
||||
|
||||
// To be called first before using the above.
|
||||
void WebPInitAlphaProcessing(void);
|
||||
|
||||
// ARGB packing function: a/r/g/b input is rgba or bgra order.
|
||||
extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
|
||||
const uint8_t* g, const uint8_t* b, int len,
|
||||
uint32_t* out);
|
||||
|
||||
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
|
||||
extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
|
||||
int len, int step, uint32_t* out);
|
||||
|
||||
// To be called first before using the above.
|
||||
void VP8EncDspARGBInit(void);
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Filter functions
|
||||
|
|
@ -14,16 +14,18 @@
|
|||
#include <assert.h>
|
||||
#include <stdlib.h> // for abs()
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "../enc/vp8i_enc.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
|
||||
static WEBP_INLINE uint8_t clip_8b(int v) {
|
||||
return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
|
||||
}
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static WEBP_INLINE int clip_max(int v, int max) {
|
||||
return (v > max) ? max : v;
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Compute susceptibility based on DCT-coeff histograms:
|
||||
|
@ -56,9 +58,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
|
|||
histo->last_non_zero = last_non_zero;
|
||||
}
|
||||
|
||||
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
int j;
|
||||
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
|
||||
for (j = start_block; j < end_block; ++j) {
|
||||
|
@ -76,6 +79,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
|||
}
|
||||
VP8SetHistogramData(distribution, histo);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// run-time tables (~4k)
|
||||
|
@ -100,6 +104,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Transforms (Paragraph 14.4)
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
#define STORE(x, y, v) \
|
||||
dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
|
||||
|
||||
|
@ -140,15 +146,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
|
|||
}
|
||||
}
|
||||
|
||||
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
||||
int do_two) {
|
||||
static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
||||
int do_two) {
|
||||
ITransformOne(ref, in, dst);
|
||||
if (do_two) {
|
||||
ITransformOne(ref + 4, in + 16, dst + 4);
|
||||
}
|
||||
}
|
||||
|
||||
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
int i;
|
||||
int tmp[16];
|
||||
for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
|
||||
|
@ -176,13 +182,16 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
|||
out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
|
||||
int16_t* out) {
|
||||
VP8FTransform(src, ref, out);
|
||||
VP8FTransform(src + 4, ref + 4, out + 16);
|
||||
}
|
||||
|
||||
static void FTransformWHT(const int16_t* in, int16_t* out) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void FTransformWHT_C(const int16_t* in, int16_t* out) {
|
||||
// input is 12b signed
|
||||
int32_t tmp[16];
|
||||
int i;
|
||||
|
@ -211,6 +220,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
|
|||
out[12 + i] = b3 >> 1;
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
#undef MUL
|
||||
#undef STORE
|
||||
|
@ -303,8 +313,8 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
|
|||
//------------------------------------------------------------------------------
|
||||
// Chroma 8x8 prediction (paragraph 12.2)
|
||||
|
||||
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
// U block
|
||||
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
|
||||
VerticalPred(C8VE8 + dst, top, 8);
|
||||
|
@ -323,8 +333,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
|
|||
//------------------------------------------------------------------------------
|
||||
// luma 16x16 prediction (paragraph 12.3)
|
||||
|
||||
static void Intra16Preds(uint8_t* dst,
|
||||
const uint8_t* left, const uint8_t* top) {
|
||||
static void Intra16Preds_C(uint8_t* dst,
|
||||
const uint8_t* left, const uint8_t* top) {
|
||||
DCMode(I16DC16 + dst, left, top, 16, 16, 5);
|
||||
VerticalPred(I16VE16 + dst, top, 16);
|
||||
HorizontalPred(I16HE16 + dst, left, 16);
|
||||
|
@ -507,7 +517,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
|
|||
|
||||
// Left samples are top[-5 .. -2], top_left is top[-1], top are
|
||||
// located at top[0..3], and top right is top[4..7]
|
||||
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
|
||||
static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
|
||||
DC4(I4DC4 + dst, top);
|
||||
TM4(I4TM4 + dst, top);
|
||||
VE4(I4VE4 + dst, top);
|
||||
|
@ -523,6 +533,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Metric
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
|
||||
int w, int h) {
|
||||
int count = 0;
|
||||
|
@ -538,20 +549,21 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
|
|||
return count;
|
||||
}
|
||||
|
||||
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
|
||||
return GetSSE(a, b, 16, 16);
|
||||
}
|
||||
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
|
||||
return GetSSE(a, b, 16, 8);
|
||||
}
|
||||
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
|
||||
return GetSSE(a, b, 8, 8);
|
||||
}
|
||||
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
|
||||
return GetSSE(a, b, 4, 4);
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
|
||||
static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
|
||||
int k, x, y;
|
||||
for (k = 0; k < 4; ++k) {
|
||||
uint32_t avg = 0;
|
||||
|
@ -571,6 +583,7 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
|
|||
// We try to match the spectral content (weighted) between source and
|
||||
// reconstructed samples.
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
// Hadamard transform
|
||||
// Returns the weighted sum of the absolute value of transformed coefficients.
|
||||
// w[] contains a row-major 4 by 4 symmetric matrix.
|
||||
|
@ -608,24 +621,25 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
|
|||
return sum;
|
||||
}
|
||||
|
||||
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
const int sum1 = TTransform(a, w);
|
||||
const int sum2 = TTransform(b, w);
|
||||
return abs(sum2 - sum1) >> 5;
|
||||
}
|
||||
|
||||
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
int D = 0;
|
||||
int x, y;
|
||||
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
|
||||
for (x = 0; x < 16; x += 4) {
|
||||
D += Disto4x4(a + x + y, b + x + y, w);
|
||||
D += Disto4x4_C(a + x + y, b + x + y, w);
|
||||
}
|
||||
}
|
||||
return D;
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Quantization
|
||||
|
@ -636,8 +650,8 @@ static const uint8_t kZigzag[16] = {
|
|||
};
|
||||
|
||||
// Simple quantization
|
||||
static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
int last = -1;
|
||||
int n;
|
||||
for (n = 0; n < 16; ++n) {
|
||||
|
@ -662,13 +676,15 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
|||
return (last >= 0);
|
||||
}
|
||||
|
||||
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
int nz;
|
||||
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
|
||||
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
|
||||
return nz;
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Block copy
|
||||
|
@ -682,148 +698,14 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
|
|||
}
|
||||
}
|
||||
|
||||
static void Copy4x4(const uint8_t* src, uint8_t* dst) {
|
||||
static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
|
||||
Copy(src, dst, 4, 4);
|
||||
}
|
||||
|
||||
static void Copy16x8(const uint8_t* src, uint8_t* dst) {
|
||||
static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
|
||||
Copy(src, dst, 16, 8);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// SSIM / PSNR
|
||||
|
||||
// hat-shaped filter. Sum of coefficients is equal to 16.
|
||||
static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
|
||||
1, 2, 3, 4, 3, 2, 1
|
||||
};
|
||||
static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
|
||||
|
||||
static WEBP_INLINE double SSIMCalculation(
|
||||
const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
|
||||
const uint32_t w2 = N * N;
|
||||
const uint32_t C1 = 20 * w2;
|
||||
const uint32_t C2 = 60 * w2;
|
||||
const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
|
||||
const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
|
||||
const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
|
||||
if (xmxm + ymym >= C3) {
|
||||
const int64_t xmym = (int64_t)stats->xm * stats->ym;
|
||||
const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
|
||||
const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
|
||||
const uint64_t syy = (uint64_t)stats->yym * N - ymym;
|
||||
// we descale by 8 to prevent overflow during the fnum/fden multiply.
|
||||
const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
|
||||
const uint64_t den_S = (sxx + syy + C2) >> 8;
|
||||
const uint64_t fnum = (2 * xmym + C1) * num_S;
|
||||
const uint64_t fden = (xmxm + ymym + C1) * den_S;
|
||||
const double r = (double)fnum / fden;
|
||||
assert(r >= 0. && r <= 1.0);
|
||||
return r;
|
||||
}
|
||||
return 1.; // area is too dark to contribute meaningfully
|
||||
}
|
||||
|
||||
double VP8SSIMFromStats(const VP8DistoStats* const stats) {
|
||||
return SSIMCalculation(stats, kWeightSum);
|
||||
}
|
||||
|
||||
double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
|
||||
return SSIMCalculation(stats, stats->w);
|
||||
}
|
||||
|
||||
static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
|
||||
const uint8_t* src2, int stride2,
|
||||
int xo, int yo, int W, int H) {
|
||||
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
|
||||
const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
|
||||
const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
|
||||
: yo + VP8_SSIM_KERNEL;
|
||||
const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
|
||||
const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
|
||||
: xo + VP8_SSIM_KERNEL;
|
||||
int x, y;
|
||||
src1 += ymin * stride1;
|
||||
src2 += ymin * stride2;
|
||||
for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
|
||||
for (x = xmin; x <= xmax; ++x) {
|
||||
const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
|
||||
* kWeight[VP8_SSIM_KERNEL + y - yo];
|
||||
const uint32_t s1 = src1[x];
|
||||
const uint32_t s2 = src2[x];
|
||||
stats.w += w;
|
||||
stats.xm += w * s1;
|
||||
stats.ym += w * s2;
|
||||
stats.xxm += w * s1 * s1;
|
||||
stats.xym += w * s1 * s2;
|
||||
stats.yym += w * s2 * s2;
|
||||
}
|
||||
}
|
||||
return VP8SSIMFromStatsClipped(&stats);
|
||||
}
|
||||
|
||||
static double SSIMGet_C(const uint8_t* src1, int stride1,
|
||||
const uint8_t* src2, int stride2) {
|
||||
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
|
||||
int x, y;
|
||||
for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
|
||||
for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
|
||||
const uint32_t w = kWeight[x] * kWeight[y];
|
||||
const uint32_t s1 = src1[x];
|
||||
const uint32_t s2 = src2[x];
|
||||
stats.xm += w * s1;
|
||||
stats.ym += w * s2;
|
||||
stats.xxm += w * s1 * s1;
|
||||
stats.xym += w * s1 * s2;
|
||||
stats.yym += w * s2 * s2;
|
||||
}
|
||||
}
|
||||
return VP8SSIMFromStats(&stats);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static uint32_t AccumulateSSE(const uint8_t* src1,
|
||||
const uint8_t* src2, int len) {
|
||||
int i;
|
||||
uint32_t sse2 = 0;
|
||||
assert(len <= 65535); // to ensure that accumulation fits within uint32_t
|
||||
for (i = 0; i < len; ++i) {
|
||||
const int32_t diff = src1[i] - src2[i];
|
||||
sse2 += diff * diff;
|
||||
}
|
||||
return sse2;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
VP8SSIMGetFunc VP8SSIMGet;
|
||||
VP8SSIMGetClippedFunc VP8SSIMGetClipped;
|
||||
VP8AccumulateSSEFunc VP8AccumulateSSE;
|
||||
|
||||
extern void VP8SSIMDspInitSSE2(void);
|
||||
|
||||
static volatile VP8CPUInfo ssim_last_cpuinfo_used =
|
||||
(VP8CPUInfo)&ssim_last_cpuinfo_used;
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
|
||||
if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
|
||||
|
||||
VP8SSIMGetClipped = SSIMGetClipped_C;
|
||||
VP8SSIMGet = SSIMGet_C;
|
||||
|
||||
VP8AccumulateSSE = AccumulateSSE;
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
VP8SSIMDspInitSSE2();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ssim_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Initialization
|
||||
|
||||
|
@ -868,26 +750,32 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
|
|||
InitTables();
|
||||
|
||||
// default C implementations
|
||||
VP8CollectHistogram = CollectHistogram;
|
||||
VP8ITransform = ITransform;
|
||||
VP8FTransform = FTransform;
|
||||
VP8FTransform2 = FTransform2;
|
||||
VP8FTransformWHT = FTransformWHT;
|
||||
VP8EncPredLuma4 = Intra4Preds;
|
||||
VP8EncPredLuma16 = Intra16Preds;
|
||||
VP8EncPredChroma8 = IntraChromaPreds;
|
||||
VP8SSE16x16 = SSE16x16;
|
||||
VP8SSE8x8 = SSE8x8;
|
||||
VP8SSE16x8 = SSE16x8;
|
||||
VP8SSE4x4 = SSE4x4;
|
||||
VP8TDisto4x4 = Disto4x4;
|
||||
VP8TDisto16x16 = Disto16x16;
|
||||
VP8Mean16x4 = Mean16x4;
|
||||
VP8EncQuantizeBlock = QuantizeBlock;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks;
|
||||
VP8EncQuantizeBlockWHT = QuantizeBlock;
|
||||
VP8Copy4x4 = Copy4x4;
|
||||
VP8Copy16x8 = Copy16x8;
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
VP8ITransform = ITransform_C;
|
||||
VP8FTransform = FTransform_C;
|
||||
VP8FTransformWHT = FTransformWHT_C;
|
||||
VP8TDisto4x4 = Disto4x4_C;
|
||||
VP8TDisto16x16 = Disto16x16_C;
|
||||
VP8CollectHistogram = CollectHistogram_C;
|
||||
VP8SSE16x16 = SSE16x16_C;
|
||||
VP8SSE16x8 = SSE16x8_C;
|
||||
VP8SSE8x8 = SSE8x8_C;
|
||||
VP8SSE4x4 = SSE4x4_C;
|
||||
#endif
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
|
||||
VP8EncQuantizeBlock = QuantizeBlock_C;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks_C;
|
||||
#endif
|
||||
|
||||
VP8FTransform2 = FTransform2_C;
|
||||
VP8EncPredLuma4 = Intra4Preds_C;
|
||||
VP8EncPredLuma16 = Intra16Preds_C;
|
||||
VP8EncPredChroma8 = IntraChromaPreds_C;
|
||||
VP8Mean16x4 = Mean16x4_C;
|
||||
VP8EncQuantizeBlockWHT = QuantizeBlock_C;
|
||||
VP8Copy4x4 = Copy4x4_C;
|
||||
VP8Copy16x8 = Copy16x8_C;
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
|
@ -906,11 +794,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
|
|||
VP8EncDspInitAVX2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (VP8GetCPUInfo(kNEON)) {
|
||||
VP8EncDspInitNEON();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MIPS32)
|
||||
if (VP8GetCPUInfo(kMIPS32)) {
|
||||
VP8EncDspInitMIPS32();
|
||||
|
@ -927,5 +810,34 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
|
|||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
VP8EncDspInitNEON();
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(VP8ITransform != NULL);
|
||||
assert(VP8FTransform != NULL);
|
||||
assert(VP8FTransformWHT != NULL);
|
||||
assert(VP8TDisto4x4 != NULL);
|
||||
assert(VP8TDisto16x16 != NULL);
|
||||
assert(VP8CollectHistogram != NULL);
|
||||
assert(VP8SSE16x16 != NULL);
|
||||
assert(VP8SSE16x8 != NULL);
|
||||
assert(VP8SSE8x8 != NULL);
|
||||
assert(VP8SSE4x4 != NULL);
|
||||
assert(VP8EncQuantizeBlock != NULL);
|
||||
assert(VP8EncQuantize2Blocks != NULL);
|
||||
assert(VP8FTransform2 != NULL);
|
||||
assert(VP8EncPredLuma4 != NULL);
|
||||
assert(VP8EncPredLuma16 != NULL);
|
||||
assert(VP8EncPredChroma8 != NULL);
|
||||
assert(VP8Mean16x4 != NULL);
|
||||
assert(VP8EncQuantizeBlockWHT != NULL);
|
||||
assert(VP8Copy4x4 != NULL);
|
||||
assert(VP8Copy16x8 != NULL);
|
||||
|
||||
enc_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
}
|
|
@ -9,7 +9,7 @@
|
|||
//
|
||||
// AVX2 version of speed-critical encoding functions.
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_AVX2)
|
||||
|
|
@ -13,13 +13,13 @@
|
|||
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
|
||||
// Slobodan Prijic (slobodan.prijic@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS32)
|
||||
|
||||
#include "./mips_macro.h"
|
||||
#include "../enc/vp8i_enc.h"
|
||||
#include "../enc/cost_enc.h"
|
||||
#include "src/dsp/mips_macro.h"
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
#include "src/enc/cost_enc.h"
|
||||
|
||||
static const int kC1 = 20091 + (1 << 16);
|
||||
static const int kC2 = 35468;
|
||||
|
@ -113,8 +113,9 @@ static const int kC2 = 35468;
|
|||
"sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
|
||||
|
||||
// Does one or two inverse transforms.
|
||||
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
|
||||
uint8_t* dst) {
|
||||
static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
|
||||
const int16_t* in,
|
||||
uint8_t* dst) {
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
|
||||
int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
|
||||
int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
|
||||
|
@ -144,11 +145,11 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
|
|||
);
|
||||
}
|
||||
|
||||
static void ITransform(const uint8_t* ref, const int16_t* in,
|
||||
uint8_t* dst, int do_two) {
|
||||
ITransformOne(ref, in, dst);
|
||||
static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
|
||||
uint8_t* dst, int do_two) {
|
||||
ITransformOne_MIPS32(ref, in, dst);
|
||||
if (do_two) {
|
||||
ITransformOne(ref + 4, in + 16, dst + 4);
|
||||
ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -187,8 +188,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
|
|||
"sh %[temp5], " #J "(%[ppin]) \n\t" \
|
||||
"sh %[level], " #N "(%[pout]) \n\t"
|
||||
|
||||
static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5;
|
||||
int sign, coeff, level, i;
|
||||
int max_level = MAX_LEVEL;
|
||||
|
@ -238,11 +239,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
int nz;
|
||||
nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
|
||||
nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
|
||||
nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
|
||||
nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
|
||||
return nz;
|
||||
}
|
||||
|
||||
|
@ -361,8 +362,8 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
|||
"msub %[temp6], %[temp0] \n\t" \
|
||||
"msub %[temp7], %[temp1] \n\t"
|
||||
|
||||
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
int tmp[32];
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
|
||||
|
||||
|
@ -396,13 +397,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
|||
#undef VERTICAL_PASS
|
||||
#undef HORIZONTAL_PASS
|
||||
|
||||
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
int D = 0;
|
||||
int x, y;
|
||||
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
|
||||
for (x = 0; x < 16; x += 4) {
|
||||
D += Disto4x4(a + x + y, b + x + y, w);
|
||||
D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
|
||||
}
|
||||
}
|
||||
return D;
|
||||
|
@ -478,7 +479,8 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
|||
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
|
||||
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
|
||||
|
||||
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
|
||||
int16_t* out) {
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
|
||||
int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
|
||||
int temp17, temp18, temp19, temp20;
|
||||
|
@ -539,7 +541,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
|||
GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
|
||||
GET_SSE_INNER(D, D + 1, D + 2, D + 3)
|
||||
|
||||
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
|
||||
int count;
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
|
||||
|
@ -573,7 +575,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
|
|||
return count;
|
||||
}
|
||||
|
||||
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
|
||||
int count;
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
|
||||
|
@ -599,7 +601,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
|
|||
return count;
|
||||
}
|
||||
|
||||
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
|
||||
int count;
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
|
||||
|
@ -621,7 +623,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
|
|||
return count;
|
||||
}
|
||||
|
||||
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
|
||||
int count;
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
|
||||
|
@ -651,17 +653,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
|||
extern void VP8EncDspInitMIPS32(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
|
||||
VP8ITransform = ITransform;
|
||||
VP8FTransform = FTransform;
|
||||
VP8EncQuantizeBlock = QuantizeBlock;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks;
|
||||
VP8TDisto4x4 = Disto4x4;
|
||||
VP8TDisto16x16 = Disto16x16;
|
||||
VP8ITransform = ITransform_MIPS32;
|
||||
VP8FTransform = FTransform_MIPS32;
|
||||
|
||||
VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
|
||||
|
||||
VP8TDisto4x4 = Disto4x4_MIPS32;
|
||||
VP8TDisto16x16 = Disto16x16_MIPS32;
|
||||
|
||||
#if !defined(WORK_AROUND_GCC)
|
||||
VP8SSE16x16 = SSE16x16;
|
||||
VP8SSE8x8 = SSE8x8;
|
||||
VP8SSE16x8 = SSE16x8;
|
||||
VP8SSE4x4 = SSE4x4;
|
||||
VP8SSE16x16 = SSE16x16_MIPS32;
|
||||
VP8SSE8x8 = SSE8x8_MIPS32;
|
||||
VP8SSE16x8 = SSE16x8_MIPS32;
|
||||
VP8SSE4x4 = SSE4x4_MIPS32;
|
||||
#endif
|
||||
}
|
||||
|
|
@ -12,13 +12,13 @@
|
|||
// Author(s): Darko Laus (darko.laus@imgtec.com)
|
||||
// Mirko Raus (mirko.raus@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
|
||||
#include "./mips_macro.h"
|
||||
#include "../enc/cost_enc.h"
|
||||
#include "../enc/vp8i_enc.h"
|
||||
#include "src/dsp/mips_macro.h"
|
||||
#include "src/enc/cost_enc.h"
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
|
||||
static const int kC1 = 20091 + (1 << 16);
|
||||
static const int kC2 = 35468;
|
||||
|
@ -141,7 +141,8 @@ static const int kC2 = 35468;
|
|||
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
|
||||
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
|
||||
|
||||
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
|
||||
int16_t* out) {
|
||||
const int c2217 = 2217;
|
||||
const int c5352 = 5352;
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
|
||||
|
@ -238,16 +239,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
|
|||
);
|
||||
}
|
||||
|
||||
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
||||
int do_two) {
|
||||
static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
|
||||
uint8_t* dst, int do_two) {
|
||||
ITransformOne(ref, in, dst);
|
||||
if (do_two) {
|
||||
ITransformOne(ref + 4, in + 16, dst + 4);
|
||||
}
|
||||
}
|
||||
|
||||
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
|
||||
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
|
||||
|
||||
|
@ -313,13 +314,14 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
|||
return abs(temp3 - temp17) >> 5;
|
||||
}
|
||||
|
||||
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
static int Disto16x16_MIPSdspR2(const uint8_t* const a,
|
||||
const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
int D = 0;
|
||||
int x, y;
|
||||
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
|
||||
for (x = 0; x < 16; x += 4) {
|
||||
D += Disto4x4(a + x + y, b + x + y, w);
|
||||
D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
|
||||
}
|
||||
}
|
||||
return D;
|
||||
|
@ -1011,8 +1013,8 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Chroma 8x8 prediction (paragraph 12.2)
|
||||
|
||||
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
// U block
|
||||
DCMode8(C8DC8 + dst, left, top);
|
||||
VerticalPred8(C8VE8 + dst, top);
|
||||
|
@ -1031,8 +1033,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
|
|||
//------------------------------------------------------------------------------
|
||||
// luma 16x16 prediction (paragraph 12.3)
|
||||
|
||||
static void Intra16Preds(uint8_t* dst,
|
||||
const uint8_t* left, const uint8_t* top) {
|
||||
static void Intra16Preds_MIPSdspR2(uint8_t* dst,
|
||||
const uint8_t* left, const uint8_t* top) {
|
||||
DCMode16(I16DC16 + dst, left, top);
|
||||
VerticalPred16(I16VE16 + dst, top);
|
||||
HorizontalPred16(I16HE16 + dst, left);
|
||||
|
@ -1041,7 +1043,7 @@ static void Intra16Preds(uint8_t* dst,
|
|||
|
||||
// Left samples are top[-5 .. -2], top_left is top[-1], top are
|
||||
// located at top[0..3], and top right is top[4..7]
|
||||
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
|
||||
static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
|
||||
DC4(I4DC4 + dst, top);
|
||||
TM4(I4TM4 + dst, top);
|
||||
VE4(I4VE4 + dst, top);
|
||||
|
@ -1077,7 +1079,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
|
|||
GET_SSE_INNER(C) \
|
||||
GET_SSE_INNER(D)
|
||||
|
||||
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
|
||||
int count;
|
||||
int temp0, temp1, temp2, temp3;
|
||||
__asm__ volatile (
|
||||
|
@ -1107,7 +1109,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
|
|||
return count;
|
||||
}
|
||||
|
||||
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
|
||||
int count;
|
||||
int temp0, temp1, temp2, temp3;
|
||||
__asm__ volatile (
|
||||
|
@ -1129,7 +1131,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
|
|||
return count;
|
||||
}
|
||||
|
||||
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
|
||||
int count;
|
||||
int temp0, temp1, temp2, temp3;
|
||||
__asm__ volatile (
|
||||
|
@ -1147,7 +1149,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
|
|||
return count;
|
||||
}
|
||||
|
||||
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
|
||||
int count;
|
||||
int temp0, temp1, temp2, temp3;
|
||||
__asm__ volatile (
|
||||
|
@ -1270,8 +1272,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
|||
"usw $0, " #J "(%[ppin]) \n\t" \
|
||||
"3: \n\t"
|
||||
|
||||
static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
|
||||
int sign, coeff, level;
|
||||
int max_level = MAX_LEVEL;
|
||||
|
@ -1311,11 +1313,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
|||
return (ret != 0);
|
||||
}
|
||||
|
||||
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
int nz;
|
||||
nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
|
||||
nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
|
||||
nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
|
||||
nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
|
||||
return nz;
|
||||
}
|
||||
|
||||
|
@ -1358,7 +1360,7 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
|||
"usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \
|
||||
"usw %[" #TEMP6 "], " #D "(%[out]) \n\t"
|
||||
|
||||
static void FTransformWHT(const int16_t* in, int16_t* out) {
|
||||
static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
|
||||
int temp0, temp1, temp2, temp3, temp4;
|
||||
int temp5, temp6, temp7, temp8, temp9;
|
||||
|
||||
|
@ -1450,9 +1452,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
|
|||
"addiu %[temp8], %[temp8], 1 \n\t" \
|
||||
"sw %[temp8], 0(%[temp3]) \n\t"
|
||||
|
||||
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
int j;
|
||||
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
|
||||
const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
|
||||
|
@ -1484,23 +1486,28 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
|||
extern void VP8EncDspInitMIPSdspR2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
|
||||
VP8FTransform = FTransform;
|
||||
VP8ITransform = ITransform;
|
||||
VP8TDisto4x4 = Disto4x4;
|
||||
VP8TDisto16x16 = Disto16x16;
|
||||
VP8EncPredLuma16 = Intra16Preds;
|
||||
VP8EncPredChroma8 = IntraChromaPreds;
|
||||
VP8EncPredLuma4 = Intra4Preds;
|
||||
VP8FTransform = FTransform_MIPSdspR2;
|
||||
VP8FTransformWHT = FTransformWHT_MIPSdspR2;
|
||||
VP8ITransform = ITransform_MIPSdspR2;
|
||||
|
||||
VP8TDisto4x4 = Disto4x4_MIPSdspR2;
|
||||
VP8TDisto16x16 = Disto16x16_MIPSdspR2;
|
||||
|
||||
VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
|
||||
VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
|
||||
VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
|
||||
|
||||
#if !defined(WORK_AROUND_GCC)
|
||||
VP8SSE16x16 = SSE16x16;
|
||||
VP8SSE8x8 = SSE8x8;
|
||||
VP8SSE16x8 = SSE16x8;
|
||||
VP8SSE4x4 = SSE4x4;
|
||||
VP8SSE16x16 = SSE16x16_MIPSdspR2;
|
||||
VP8SSE8x8 = SSE8x8_MIPSdspR2;
|
||||
VP8SSE16x8 = SSE16x8_MIPSdspR2;
|
||||
VP8SSE4x4 = SSE4x4_MIPSdspR2;
|
||||
#endif
|
||||
VP8EncQuantizeBlock = QuantizeBlock;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks;
|
||||
VP8FTransformWHT = FTransformWHT;
|
||||
VP8CollectHistogram = CollectHistogram;
|
||||
|
||||
VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
|
||||
|
||||
VP8CollectHistogram = CollectHistogram_MIPSdspR2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS_DSP_R2
|
|
@ -11,13 +11,13 @@
|
|||
//
|
||||
// Author: Prashant Patil (prashant.patil@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MSA)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "./msa_macro.h"
|
||||
#include "../enc/vp8i_enc.h"
|
||||
#include "src/dsp/msa_macro.h"
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Transforms
|
||||
|
@ -69,20 +69,21 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
|
|||
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
|
||||
}
|
||||
|
||||
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
||||
int do_two) {
|
||||
static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
||||
int do_two) {
|
||||
ITransformOne(ref, in, dst);
|
||||
if (do_two) {
|
||||
ITransformOne(ref + 4, in + 16, dst + 4);
|
||||
}
|
||||
}
|
||||
|
||||
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
|
||||
int16_t* out) {
|
||||
uint64_t out0, out1, out2, out3;
|
||||
uint32_t in0, in1, in2, in3;
|
||||
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
|
||||
v8i16 t0, t1, t2, t3;
|
||||
v16u8 srcl0, srcl1, src0, src1;
|
||||
v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
|
||||
const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
|
||||
const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
|
||||
const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
|
||||
|
@ -130,7 +131,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
|||
SD4(out0, out1, out2, out3, out, 8);
|
||||
}
|
||||
|
||||
static void FTransformWHT(const int16_t* in, int16_t* out) {
|
||||
static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
|
||||
v8i16 in0 = { 0 };
|
||||
v8i16 in1 = { 0 };
|
||||
v8i16 tmp0, tmp1, tmp2, tmp3;
|
||||
|
@ -167,10 +168,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
|
|||
ST_SH2(out0, out1, out, 8);
|
||||
}
|
||||
|
||||
static int TTransform(const uint8_t* in, const uint16_t* w) {
|
||||
static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
|
||||
int sum;
|
||||
uint32_t in0_m, in1_m, in2_m, in3_m;
|
||||
v16i8 src0;
|
||||
v16i8 src0 = { 0 };
|
||||
v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
|
||||
v4i32 dst0, dst1;
|
||||
const v16i8 zero = { 0 };
|
||||
|
@ -199,20 +200,20 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
|
|||
return sum;
|
||||
}
|
||||
|
||||
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
const int sum1 = TTransform(a, w);
|
||||
const int sum2 = TTransform(b, w);
|
||||
static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
const int sum1 = TTransform_MSA(a, w);
|
||||
const int sum2 = TTransform_MSA(b, w);
|
||||
return abs(sum2 - sum1) >> 5;
|
||||
}
|
||||
|
||||
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
int D = 0;
|
||||
int x, y;
|
||||
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
|
||||
for (x = 0; x < 16; x += 4) {
|
||||
D += Disto4x4(a + x + y, b + x + y, w);
|
||||
D += Disto4x4_MSA(a + x + y, b + x + y, w);
|
||||
}
|
||||
}
|
||||
return D;
|
||||
|
@ -221,9 +222,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
|||
//------------------------------------------------------------------------------
|
||||
// Histogram
|
||||
|
||||
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
int j;
|
||||
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
|
||||
for (j = start_block; j < end_block; ++j) {
|
||||
|
@ -259,8 +260,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
|||
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
|
||||
|
||||
static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
|
||||
const v16u8 A1 = { 0 };
|
||||
const uint64_t val_m = LD(top - 1);
|
||||
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
|
||||
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
|
||||
const v16u8 B = SLDI_UB(A, A, 1);
|
||||
const v16u8 C = SLDI_UB(A, A, 2);
|
||||
const v16u8 AC = __msa_ave_u_b(A, C);
|
||||
|
@ -292,8 +294,9 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
|
|||
}
|
||||
|
||||
static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
|
||||
const v16u8 A2 = { 0 };
|
||||
const uint64_t val_m = LD(top - 5);
|
||||
const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
|
||||
const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
|
||||
const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
|
||||
const v16u8 B = SLDI_UB(A, A, 1);
|
||||
const v16u8 C = SLDI_UB(A, A, 2);
|
||||
|
@ -311,8 +314,9 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
|
|||
}
|
||||
|
||||
static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
|
||||
const v16u8 A1 = { 0 };
|
||||
const uint64_t val_m = LD(top);
|
||||
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
|
||||
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
|
||||
const v16u8 B = SLDI_UB(A, A, 1);
|
||||
const v16u8 C1 = SLDI_UB(A, A, 2);
|
||||
const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
|
||||
|
@ -427,7 +431,7 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
|
|||
#undef AVG3
|
||||
#undef AVG2
|
||||
|
||||
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
|
||||
static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
|
||||
DC4(I4DC4 + dst, top);
|
||||
TM4(I4TM4 + dst, top);
|
||||
VE4(I4VE4 + dst, top);
|
||||
|
@ -544,8 +548,8 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
|
|||
STORE16x16(out, dst);
|
||||
}
|
||||
|
||||
static void Intra16Preds(uint8_t* dst,
|
||||
const uint8_t* left, const uint8_t* top) {
|
||||
static void Intra16Preds_MSA(uint8_t* dst,
|
||||
const uint8_t* left, const uint8_t* top) {
|
||||
DCMode16x16(I16DC16 + dst, left, top);
|
||||
VerticalPred16x16(I16VE16 + dst, top);
|
||||
HorizontalPred16x16(I16HE16 + dst, left);
|
||||
|
@ -645,7 +649,7 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
|
|||
static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
uint64_t out;
|
||||
v16u8 src;
|
||||
v16u8 src = { 0 };
|
||||
if (top != NULL && left != NULL) {
|
||||
const uint64_t left_m = LD(left);
|
||||
const uint64_t top_m = LD(top);
|
||||
|
@ -666,8 +670,8 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
|
|||
STORE8x8(out, dst);
|
||||
}
|
||||
|
||||
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
// U block
|
||||
DCMode8x8(C8DC8 + dst, left, top);
|
||||
VerticalPred8x8(C8VE8 + dst, top);
|
||||
|
@ -708,7 +712,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
|
|||
DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
|
||||
} while (0)
|
||||
|
||||
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
|
||||
uint32_t sum;
|
||||
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
|
||||
|
@ -735,7 +739,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
|
|||
return sum;
|
||||
}
|
||||
|
||||
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
|
||||
uint32_t sum;
|
||||
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
|
||||
|
@ -754,7 +758,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
|
|||
return sum;
|
||||
}
|
||||
|
||||
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
|
||||
uint32_t sum;
|
||||
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
|
||||
|
@ -774,10 +778,10 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
|
|||
return sum;
|
||||
}
|
||||
|
||||
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
|
||||
uint32_t sum = 0;
|
||||
uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
|
||||
v16u8 src, ref, tmp0, tmp1;
|
||||
v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
|
||||
v8i16 diff0, diff1;
|
||||
v4i32 out0, out1;
|
||||
|
||||
|
@ -796,8 +800,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Quantization
|
||||
|
||||
static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
int sum;
|
||||
v8i16 in0, in1, sh0, sh1, out0, out1;
|
||||
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
|
||||
|
@ -828,7 +832,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
|||
tmp1 = (tmp3 > maxlevel);
|
||||
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
|
||||
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
|
||||
SUB2(0, tmp2, 0, tmp3, tmp0, tmp1);
|
||||
SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
|
||||
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
|
||||
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
|
||||
LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh
|
||||
|
@ -849,8 +853,8 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
|||
return (sum > 0);
|
||||
}
|
||||
|
||||
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
int nz;
|
||||
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
|
||||
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
|
||||
|
@ -863,26 +867,26 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
|||
extern void VP8EncDspInitMSA(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
|
||||
VP8ITransform = ITransform;
|
||||
VP8FTransform = FTransform;
|
||||
VP8FTransformWHT = FTransformWHT;
|
||||
VP8ITransform = ITransform_MSA;
|
||||
VP8FTransform = FTransform_MSA;
|
||||
VP8FTransformWHT = FTransformWHT_MSA;
|
||||
|
||||
VP8TDisto4x4 = Disto4x4;
|
||||
VP8TDisto16x16 = Disto16x16;
|
||||
VP8CollectHistogram = CollectHistogram;
|
||||
VP8TDisto4x4 = Disto4x4_MSA;
|
||||
VP8TDisto16x16 = Disto16x16_MSA;
|
||||
VP8CollectHistogram = CollectHistogram_MSA;
|
||||
|
||||
VP8EncPredLuma4 = Intra4Preds;
|
||||
VP8EncPredLuma16 = Intra16Preds;
|
||||
VP8EncPredChroma8 = IntraChromaPreds;
|
||||
VP8EncPredLuma4 = Intra4Preds_MSA;
|
||||
VP8EncPredLuma16 = Intra16Preds_MSA;
|
||||
VP8EncPredChroma8 = IntraChromaPreds_MSA;
|
||||
|
||||
VP8SSE16x16 = SSE16x16;
|
||||
VP8SSE16x8 = SSE16x8;
|
||||
VP8SSE8x8 = SSE8x8;
|
||||
VP8SSE4x4 = SSE4x4;
|
||||
VP8SSE16x16 = SSE16x16_MSA;
|
||||
VP8SSE16x8 = SSE16x8_MSA;
|
||||
VP8SSE8x8 = SSE8x8_MSA;
|
||||
VP8SSE4x4 = SSE4x4_MSA;
|
||||
|
||||
VP8EncQuantizeBlock = QuantizeBlock;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks;
|
||||
VP8EncQuantizeBlockWHT = QuantizeBlock;
|
||||
VP8EncQuantizeBlock = QuantizeBlock_MSA;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
|
||||
VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MSA
|
|
@ -11,14 +11,14 @@
|
|||
//
|
||||
// adapted from libvpx (http://www.webmproject.org/code/)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "./neon.h"
|
||||
#include "../enc/vp8i_enc.h"
|
||||
#include "src/dsp/neon.h"
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Transforms (Paragraph 14.4)
|
||||
|
@ -37,15 +37,15 @@ static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
|
|||
#if defined(WEBP_USE_INTRINSICS)
|
||||
|
||||
// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
|
||||
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
|
||||
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
|
||||
return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
|
||||
}
|
||||
|
||||
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
|
||||
// to the corresponding rows of 'dst'.
|
||||
static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
|
||||
const int16x8_t dst01,
|
||||
const int16x8_t dst23) {
|
||||
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
|
||||
const int16x8_t dst01,
|
||||
const int16x8_t dst23) {
|
||||
// Unsigned saturate to 8b.
|
||||
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
|
||||
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
|
||||
|
@ -57,8 +57,10 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
|
|||
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
|
||||
const uint8_t* const ref, uint8_t* const dst) {
|
||||
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
|
||||
const int16x8_t row23,
|
||||
const uint8_t* const ref,
|
||||
uint8_t* const dst) {
|
||||
uint32x2_t dst01 = vdup_n_u32(0);
|
||||
uint32x2_t dst23 = vdup_n_u32(0);
|
||||
|
||||
|
@ -70,19 +72,20 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
|
|||
|
||||
{
|
||||
// Convert to 16b.
|
||||
const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
|
||||
const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
|
||||
const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
|
||||
const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
|
||||
|
||||
// Descale with rounding.
|
||||
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
|
||||
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
|
||||
// Add the inverse transform.
|
||||
SaturateAndStore4x4(dst, out01, out23);
|
||||
SaturateAndStore4x4_NEON(dst, out01, out23);
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
|
||||
int16x8x2_t* const out) {
|
||||
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
|
||||
const int16x8_t in1,
|
||||
int16x8x2_t* const out) {
|
||||
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
|
||||
// c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
|
||||
const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
|
||||
|
@ -90,7 +93,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
|
|||
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
|
||||
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
|
||||
// {rows} = in0 | in4
|
||||
// in8 | in12
|
||||
// B1 = in4 | in12
|
||||
|
@ -113,22 +116,22 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
|
|||
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
|
||||
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
|
||||
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
|
||||
Transpose8x2(E0, E1, rows);
|
||||
Transpose8x2_NEON(E0, E1, rows);
|
||||
}
|
||||
|
||||
static void ITransformOne(const uint8_t* ref,
|
||||
const int16_t* in, uint8_t* dst) {
|
||||
static void ITransformOne_NEON(const uint8_t* ref,
|
||||
const int16_t* in, uint8_t* dst) {
|
||||
int16x8x2_t rows;
|
||||
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
|
||||
TransformPass(&rows);
|
||||
TransformPass(&rows);
|
||||
Add4x4(rows.val[0], rows.val[1], ref, dst);
|
||||
TransformPass_NEON(&rows);
|
||||
TransformPass_NEON(&rows);
|
||||
Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void ITransformOne(const uint8_t* ref,
|
||||
const int16_t* in, uint8_t* dst) {
|
||||
static void ITransformOne_NEON(const uint8_t* ref,
|
||||
const int16_t* in, uint8_t* dst) {
|
||||
const int kBPS = BPS;
|
||||
const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
|
||||
|
||||
|
@ -243,16 +246,16 @@ static void ITransformOne(const uint8_t* ref,
|
|||
|
||||
#endif // WEBP_USE_INTRINSICS
|
||||
|
||||
static void ITransform(const uint8_t* ref,
|
||||
const int16_t* in, uint8_t* dst, int do_two) {
|
||||
ITransformOne(ref, in, dst);
|
||||
static void ITransform_NEON(const uint8_t* ref,
|
||||
const int16_t* in, uint8_t* dst, int do_two) {
|
||||
ITransformOne_NEON(ref, in, dst);
|
||||
if (do_two) {
|
||||
ITransformOne(ref + 4, in + 16, dst + 4);
|
||||
ITransformOne_NEON(ref + 4, in + 16, dst + 4);
|
||||
}
|
||||
}
|
||||
|
||||
// Load all 4x4 pixels into a single uint8x16_t variable.
|
||||
static uint8x16_t Load4x4(const uint8_t* src) {
|
||||
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
|
||||
uint32x4_t out = vdupq_n_u32(0);
|
||||
out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
|
||||
out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
|
||||
|
@ -265,10 +268,12 @@ static uint8x16_t Load4x4(const uint8_t* src) {
|
|||
|
||||
#if defined(WEBP_USE_INTRINSICS)
|
||||
|
||||
static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
|
||||
const int16x4_t C, const int16x4_t D,
|
||||
int16x8_t* const out01,
|
||||
int16x8_t* const out32) {
|
||||
static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
|
||||
const int16x4_t B,
|
||||
const int16x4_t C,
|
||||
const int16x4_t D,
|
||||
int16x8_t* const out01,
|
||||
int16x8_t* const out32) {
|
||||
const int16x4x2_t AB = vtrn_s16(A, B);
|
||||
const int16x4x2_t CD = vtrn_s16(C, D);
|
||||
const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
|
||||
|
@ -283,24 +288,24 @@ static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
|
|||
vreinterpret_s64_s32(tmp02.val[1])));
|
||||
}
|
||||
|
||||
static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
|
||||
const uint8x8_t b) {
|
||||
static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
|
||||
const uint8x8_t b) {
|
||||
return vreinterpretq_s16_u16(vsubl_u8(a, b));
|
||||
}
|
||||
|
||||
static void FTransform(const uint8_t* src, const uint8_t* ref,
|
||||
int16_t* out) {
|
||||
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
|
||||
int16_t* out) {
|
||||
int16x8_t d0d1, d3d2; // working 4x4 int16 variables
|
||||
{
|
||||
const uint8x16_t S0 = Load4x4(src);
|
||||
const uint8x16_t R0 = Load4x4(ref);
|
||||
const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
|
||||
const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
|
||||
const uint8x16_t S0 = Load4x4_NEON(src);
|
||||
const uint8x16_t R0 = Load4x4_NEON(ref);
|
||||
const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
|
||||
const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
|
||||
const int16x4_t D0 = vget_low_s16(D0D1);
|
||||
const int16x4_t D1 = vget_high_s16(D0D1);
|
||||
const int16x4_t D2 = vget_low_s16(D2D3);
|
||||
const int16x4_t D3 = vget_high_s16(D2D3);
|
||||
Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
|
||||
Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
|
||||
}
|
||||
{ // 1rst pass
|
||||
const int32x4_t kCst937 = vdupq_n_s32(937);
|
||||
|
@ -318,7 +323,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
|
|||
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
|
||||
const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
|
||||
const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
|
||||
Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
|
||||
Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
|
||||
}
|
||||
{ // 2nd pass
|
||||
// the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
|
||||
|
@ -358,8 +363,8 @@ static const int32_t kCoeff32[] = {
|
|||
51000, 51000, 51000, 51000
|
||||
};
|
||||
|
||||
static void FTransform(const uint8_t* src, const uint8_t* ref,
|
||||
int16_t* out) {
|
||||
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
|
||||
int16_t* out) {
|
||||
const int kBPS = BPS;
|
||||
const uint8_t* src_ptr = src;
|
||||
const uint8_t* ref_ptr = ref;
|
||||
|
@ -478,7 +483,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
|
|||
src += stride; \
|
||||
} while (0)
|
||||
|
||||
static void FTransformWHT(const int16_t* src, int16_t* out) {
|
||||
static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
|
||||
const int stride = 16;
|
||||
const int16x4_t zero = vdup_n_s16(0);
|
||||
int32x4x4_t tmp0;
|
||||
|
@ -516,7 +521,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
|
|||
tmp0.val[3] = vsubq_s32(a0, a1);
|
||||
}
|
||||
{
|
||||
const int32x4x4_t tmp1 = Transpose4x4(tmp0);
|
||||
const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
|
||||
// a0 = tmp[0 + i] + tmp[ 8 + i]
|
||||
// a1 = tmp[4 + i] + tmp[12 + i]
|
||||
// a2 = tmp[4 + i] - tmp[12 + i]
|
||||
|
@ -560,7 +565,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
|
|||
// a 26ae, b 26ae
|
||||
// a 37bf, b 37bf
|
||||
//
|
||||
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
|
||||
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
|
||||
const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
|
||||
const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
|
||||
const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
|
||||
|
@ -574,7 +579,8 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
|
|||
return q4_in;
|
||||
}
|
||||
|
||||
static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
|
||||
static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
|
||||
const int16x8x4_t q4_in) {
|
||||
// {a0, a1} = {in[0] + in[2], in[1] + in[3]}
|
||||
// {a3, a2} = {in[0] - in[2], in[1] - in[3]}
|
||||
const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
|
||||
|
@ -593,7 +599,7 @@ static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
|
|||
return q4_out;
|
||||
}
|
||||
|
||||
static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
|
||||
static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
|
||||
const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
|
||||
q4_in.val[2]));
|
||||
const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
|
||||
|
@ -610,7 +616,7 @@ static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
|
|||
return q4_out;
|
||||
}
|
||||
|
||||
static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
|
||||
static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
|
||||
const uint16x8_t q_w07 = vld1q_u16(&w[0]);
|
||||
const uint16x8_t q_w8f = vld1q_u16(&w[8]);
|
||||
int16x4x4_t d4_w;
|
||||
|
@ -622,8 +628,8 @@ static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
|
|||
return d4_w;
|
||||
}
|
||||
|
||||
static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
|
||||
const int16x4x4_t d4_w) {
|
||||
static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
|
||||
const int16x4x4_t d4_w) {
|
||||
int32x2_t d_sum;
|
||||
// sum += w[ 0] * abs(b0);
|
||||
// sum += w[ 4] * abs(b1);
|
||||
|
@ -652,8 +658,8 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
|
|||
// Hadamard transform
|
||||
// Returns the weighted sum of the absolute value of transformed coefficients.
|
||||
// w[] contains a row-major 4 by 4 symmetric matrix.
|
||||
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
|
||||
uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
|
||||
uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
|
||||
|
@ -679,12 +685,12 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
|||
// Vertical pass first to avoid a transpose (vertical and horizontal passes
|
||||
// are commutative because w/kWeightY is symmetric) and subsequent
|
||||
// transpose.
|
||||
const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
|
||||
const int16x4x4_t d4_w = DistoLoadW(w);
|
||||
const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
|
||||
const int16x4x4_t d4_w = DistoLoadW_NEON(w);
|
||||
// horizontal pass
|
||||
const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
|
||||
const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
|
||||
int32x2_t d_sum = DistoSum(q4_h, d4_w);
|
||||
const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
|
||||
const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
|
||||
int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
|
||||
|
||||
// abs(sum2 - sum1) >> 5
|
||||
d_sum = vabs_s32(d_sum);
|
||||
|
@ -694,13 +700,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
|||
}
|
||||
#undef LOAD_LANE_32b
|
||||
|
||||
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
int D = 0;
|
||||
int x, y;
|
||||
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
|
||||
for (x = 0; x < 16; x += 4) {
|
||||
D += Disto4x4(a + x + y, b + x + y, w);
|
||||
D += Disto4x4_NEON(a + x + y, b + x + y, w);
|
||||
}
|
||||
}
|
||||
return D;
|
||||
|
@ -708,15 +714,15 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
|
||||
int j;
|
||||
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
|
||||
for (j = start_block; j < end_block; ++j) {
|
||||
int16_t out[16];
|
||||
FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
|
||||
FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
|
||||
{
|
||||
int k;
|
||||
const int16x8_t a0 = vld1q_s16(out + 0);
|
||||
|
@ -740,9 +746,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
|
||||
const uint8_t* const b,
|
||||
uint32x4_t* const sum) {
|
||||
static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
|
||||
const uint8_t* const b,
|
||||
uint32x4_t* const sum) {
|
||||
const uint8x16_t a0 = vld1q_u8(a);
|
||||
const uint8x16_t b0 = vld1q_u8(b);
|
||||
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
|
||||
|
@ -757,7 +763,7 @@ static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
|
|||
}
|
||||
|
||||
// Horizontal sum of all four uint32_t values in 'sum'.
|
||||
static int SumToInt(uint32x4_t sum) {
|
||||
static int SumToInt_NEON(uint32x4_t sum) {
|
||||
const uint64x2_t sum2 = vpaddlq_u32(sum);
|
||||
const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
|
||||
return (int)sum3;
|
||||
|
@ -767,18 +773,18 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
|
|||
uint32x4_t sum = vdupq_n_u32(0);
|
||||
int y;
|
||||
for (y = 0; y < 16; ++y) {
|
||||
AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
|
||||
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
|
||||
}
|
||||
return SumToInt(sum);
|
||||
return SumToInt_NEON(sum);
|
||||
}
|
||||
|
||||
static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
|
||||
uint32x4_t sum = vdupq_n_u32(0);
|
||||
int y;
|
||||
for (y = 0; y < 8; ++y) {
|
||||
AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
|
||||
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
|
||||
}
|
||||
return SumToInt(sum);
|
||||
return SumToInt_NEON(sum);
|
||||
}
|
||||
|
||||
static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
|
||||
|
@ -791,12 +797,12 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
|
|||
const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
|
||||
sum = vpadalq_u16(sum, prod);
|
||||
}
|
||||
return SumToInt(sum);
|
||||
return SumToInt_NEON(sum);
|
||||
}
|
||||
|
||||
static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
|
||||
const uint8x16_t a0 = Load4x4(a);
|
||||
const uint8x16_t b0 = Load4x4(b);
|
||||
const uint8x16_t a0 = Load4x4_NEON(a);
|
||||
const uint8x16_t b0 = Load4x4_NEON(b);
|
||||
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
|
||||
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
|
||||
vget_low_u8(abs_diff));
|
||||
|
@ -805,7 +811,7 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
|
|||
/* pair-wise adds and widen */
|
||||
const uint32x4_t sum1 = vpaddlq_u16(prod1);
|
||||
const uint32x4_t sum2 = vpaddlq_u16(prod2);
|
||||
return SumToInt(vaddq_u32(sum1, sum2));
|
||||
return SumToInt_NEON(vaddq_u32(sum1, sum2));
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
@ -813,8 +819,8 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
|
|||
// Compilation with gcc-4.6.x is problematic for now.
|
||||
#if !defined(WORK_AROUND_GCC)
|
||||
|
||||
static int16x8_t Quantize(int16_t* const in,
|
||||
const VP8Matrix* const mtx, int offset) {
|
||||
static int16x8_t Quantize_NEON(int16_t* const in,
|
||||
const VP8Matrix* const mtx, int offset) {
|
||||
const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
|
||||
const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
|
||||
const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
|
||||
|
@ -847,10 +853,10 @@ static const uint8_t kShuffles[4][8] = {
|
|||
{ 14, 15, 22, 23, 28, 29, 30, 31 }
|
||||
};
|
||||
|
||||
static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
const int16x8_t out0 = Quantize(in, mtx, 0);
|
||||
const int16x8_t out1 = Quantize(in, mtx, 8);
|
||||
static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
|
||||
const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
|
||||
uint8x8x4_t shuffles;
|
||||
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
|
||||
// non-standard versions there.
|
||||
|
@ -889,11 +895,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
int nz;
|
||||
nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
|
||||
nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
|
||||
nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
|
||||
nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
|
||||
return nz;
|
||||
}
|
||||
|
||||
|
@ -905,14 +911,14 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
|||
extern void VP8EncDspInitNEON(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
|
||||
VP8ITransform = ITransform;
|
||||
VP8FTransform = FTransform;
|
||||
VP8ITransform = ITransform_NEON;
|
||||
VP8FTransform = FTransform_NEON;
|
||||
|
||||
VP8FTransformWHT = FTransformWHT;
|
||||
VP8FTransformWHT = FTransformWHT_NEON;
|
||||
|
||||
VP8TDisto4x4 = Disto4x4;
|
||||
VP8TDisto16x16 = Disto16x16;
|
||||
VP8CollectHistogram = CollectHistogram;
|
||||
VP8TDisto4x4 = Disto4x4_NEON;
|
||||
VP8TDisto16x16 = Disto16x16_NEON;
|
||||
VP8CollectHistogram = CollectHistogram_NEON;
|
||||
|
||||
VP8SSE16x16 = SSE16x16_NEON;
|
||||
VP8SSE16x8 = SSE16x8_NEON;
|
||||
|
@ -920,8 +926,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
|
|||
VP8SSE4x4 = SSE4x4_NEON;
|
||||
|
||||
#if !defined(WORK_AROUND_GCC)
|
||||
VP8EncQuantizeBlock = QuantizeBlock;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks;
|
||||
VP8EncQuantizeBlock = QuantizeBlock_NEON;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
|
||||
#endif
|
||||
}
|
||||
|
|
@ -11,23 +11,23 @@
|
|||
//
|
||||
// Author: Christian Duvivier (cduvivier@google.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#include <assert.h>
|
||||
#include <stdlib.h> // for abs()
|
||||
#include <emmintrin.h>
|
||||
|
||||
#include "./common_sse2.h"
|
||||
#include "../enc/cost_enc.h"
|
||||
#include "../enc/vp8i_enc.h"
|
||||
#include "src/dsp/common_sse2.h"
|
||||
#include "src/enc/cost_enc.h"
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Transforms (Paragraph 14.4)
|
||||
|
||||
// Does one or two inverse transforms.
|
||||
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
||||
int do_two) {
|
||||
static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
||||
int do_two) {
|
||||
// This implementation makes use of 16-bit fixed point versions of two
|
||||
// multiply constants:
|
||||
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
|
||||
|
@ -193,10 +193,10 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
|
|||
}
|
||||
}
|
||||
|
||||
static void FTransformPass1(const __m128i* const in01,
|
||||
const __m128i* const in23,
|
||||
__m128i* const out01,
|
||||
__m128i* const out32) {
|
||||
static void FTransformPass1_SSE2(const __m128i* const in01,
|
||||
const __m128i* const in23,
|
||||
__m128i* const out01,
|
||||
__m128i* const out32) {
|
||||
const __m128i k937 = _mm_set1_epi32(937);
|
||||
const __m128i k1812 = _mm_set1_epi32(1812);
|
||||
|
||||
|
@ -239,8 +239,9 @@ static void FTransformPass1(const __m128i* const in01,
|
|||
*out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..
|
||||
}
|
||||
|
||||
static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
|
||||
int16_t* out) {
|
||||
static void FTransformPass2_SSE2(const __m128i* const v01,
|
||||
const __m128i* const v32,
|
||||
int16_t* out) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i seven = _mm_set1_epi16(7);
|
||||
const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
|
||||
|
@ -291,7 +292,8 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
|
|||
_mm_storeu_si128((__m128i*)&out[8], d2_f3);
|
||||
}
|
||||
|
||||
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
|
||||
int16_t* out) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
// Load src.
|
||||
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
|
||||
|
@ -328,13 +330,14 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
|||
__m128i v01, v32;
|
||||
|
||||
// First pass
|
||||
FTransformPass1(&row01, &row23, &v01, &v32);
|
||||
FTransformPass1_SSE2(&row01, &row23, &v01, &v32);
|
||||
|
||||
// Second pass
|
||||
FTransformPass2(&v01, &v32, out);
|
||||
FTransformPass2_SSE2(&v01, &v32, out);
|
||||
}
|
||||
|
||||
static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
||||
static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
|
||||
int16_t* out) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
// Load src and convert to 16b.
|
||||
|
@ -374,15 +377,15 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
|||
__m128i v01h, v32h;
|
||||
|
||||
// First pass
|
||||
FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
|
||||
FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
|
||||
FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
|
||||
FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);
|
||||
|
||||
// Second pass
|
||||
FTransformPass2(&v01l, &v32l, out + 0);
|
||||
FTransformPass2(&v01h, &v32h, out + 16);
|
||||
FTransformPass2_SSE2(&v01l, &v32l, out + 0);
|
||||
FTransformPass2_SSE2(&v01h, &v32h, out + 16);
|
||||
}
|
||||
|
||||
static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
|
||||
static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
|
||||
const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
|
||||
const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
|
||||
const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
|
||||
|
@ -398,14 +401,14 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
|
|||
*out = _mm_madd_epi16(D, kMult);
|
||||
}
|
||||
|
||||
static void FTransformWHT(const int16_t* in, int16_t* out) {
|
||||
static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
|
||||
// Input is 12b signed.
|
||||
__m128i row0, row1, row2, row3;
|
||||
// Rows are 14b signed.
|
||||
FTransformWHTRow(in + 0 * 64, &row0);
|
||||
FTransformWHTRow(in + 1 * 64, &row1);
|
||||
FTransformWHTRow(in + 2 * 64, &row2);
|
||||
FTransformWHTRow(in + 3 * 64, &row3);
|
||||
FTransformWHTRow_SSE2(in + 0 * 64, &row0);
|
||||
FTransformWHTRow_SSE2(in + 1 * 64, &row1);
|
||||
FTransformWHTRow_SSE2(in + 2 * 64, &row2);
|
||||
FTransformWHTRow_SSE2(in + 3 * 64, &row3);
|
||||
|
||||
{
|
||||
// The a* are 15b signed.
|
||||
|
@ -431,9 +434,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
|
|||
// Compute susceptibility based on DCT-coeff histograms:
|
||||
// the higher, the "easier" the macroblock is to compress.
|
||||
|
||||
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
|
||||
int j;
|
||||
|
@ -442,7 +445,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
|||
int16_t out[16];
|
||||
int k;
|
||||
|
||||
FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
|
||||
FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
|
||||
|
||||
// Convert coefficients to bin (within out[]).
|
||||
{
|
||||
|
@ -476,7 +479,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
|||
// Intra predictions
|
||||
|
||||
// helper for chroma-DC predictions
|
||||
static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
|
||||
static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
|
||||
int j;
|
||||
const __m128i values = _mm_set1_epi8(v);
|
||||
for (j = 0; j < 8; ++j) {
|
||||
|
@ -484,7 +487,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
|
|||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
|
||||
static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
|
||||
int j;
|
||||
const __m128i values = _mm_set1_epi8(v);
|
||||
for (j = 0; j < 16; ++j) {
|
||||
|
@ -492,20 +495,20 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
|
|||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
|
||||
static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
|
||||
if (size == 4) {
|
||||
int j;
|
||||
for (j = 0; j < 4; ++j) {
|
||||
memset(dst + j * BPS, value, 4);
|
||||
}
|
||||
} else if (size == 8) {
|
||||
Put8x8uv(value, dst);
|
||||
Put8x8uv_SSE2(value, dst);
|
||||
} else {
|
||||
Put16(value, dst);
|
||||
Put16_SSE2(value, dst);
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
|
||||
static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
|
||||
int j;
|
||||
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
|
||||
for (j = 0; j < 8; ++j) {
|
||||
|
@ -513,7 +516,7 @@ static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
|
|||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
|
||||
static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
|
||||
const __m128i top_values = _mm_load_si128((const __m128i*)top);
|
||||
int j;
|
||||
for (j = 0; j < 16; ++j) {
|
||||
|
@ -521,20 +524,20 @@ static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
|
|||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VerticalPred(uint8_t* dst,
|
||||
const uint8_t* top, int size) {
|
||||
static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
|
||||
const uint8_t* top, int size) {
|
||||
if (top != NULL) {
|
||||
if (size == 8) {
|
||||
VE8uv(dst, top);
|
||||
VE8uv_SSE2(dst, top);
|
||||
} else {
|
||||
VE16(dst, top);
|
||||
VE16_SSE2(dst, top);
|
||||
}
|
||||
} else {
|
||||
Fill(dst, 127, size);
|
||||
Fill_SSE2(dst, 127, size);
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
|
||||
static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
|
||||
int j;
|
||||
for (j = 0; j < 8; ++j) {
|
||||
const __m128i values = _mm_set1_epi8(left[j]);
|
||||
|
@ -543,7 +546,7 @@ static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
|
|||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
|
||||
static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
|
||||
int j;
|
||||
for (j = 0; j < 16; ++j) {
|
||||
const __m128i values = _mm_set1_epi8(left[j]);
|
||||
|
@ -552,21 +555,21 @@ static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
|
|||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void HorizontalPred(uint8_t* dst,
|
||||
const uint8_t* left, int size) {
|
||||
static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
|
||||
const uint8_t* left, int size) {
|
||||
if (left != NULL) {
|
||||
if (size == 8) {
|
||||
HE8uv(dst, left);
|
||||
HE8uv_SSE2(dst, left);
|
||||
} else {
|
||||
HE16(dst, left);
|
||||
HE16_SSE2(dst, left);
|
||||
}
|
||||
} else {
|
||||
Fill(dst, 129, size);
|
||||
Fill_SSE2(dst, 129, size);
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top, int size) {
|
||||
static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top, int size) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
int y;
|
||||
if (size == 8) {
|
||||
|
@ -593,13 +596,13 @@ static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
|
|||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top, int size) {
|
||||
static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top, int size) {
|
||||
if (left != NULL) {
|
||||
if (top != NULL) {
|
||||
TM(dst, left, top, size);
|
||||
TM_SSE2(dst, left, top, size);
|
||||
} else {
|
||||
HorizontalPred(dst, left, size);
|
||||
HorizontalPred_SSE2(dst, left, size);
|
||||
}
|
||||
} else {
|
||||
// true motion without left samples (hence: with default 129 value)
|
||||
|
@ -607,90 +610,90 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
|
|||
// Note that if top samples are not available, the default value is
|
||||
// then 129, and not 127 as in the VerticalPred case.
|
||||
if (top != NULL) {
|
||||
VerticalPred(dst, top, size);
|
||||
VerticalPred_SSE2(dst, top, size);
|
||||
} else {
|
||||
Fill(dst, 129, size);
|
||||
Fill_SSE2(dst, 129, size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
|
||||
const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
|
||||
const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
|
||||
const int DC = VP8HorizontalAdd8b(&combined) + 8;
|
||||
Put8x8uv(DC >> 4, dst);
|
||||
Put8x8uv_SSE2(DC >> 4, dst);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
|
||||
static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
|
||||
const __m128i sum = _mm_sad_epu8(top_values, zero);
|
||||
const int DC = _mm_cvtsi128_si32(sum) + 4;
|
||||
Put8x8uv(DC >> 3, dst);
|
||||
Put8x8uv_SSE2(DC >> 3, dst);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) {
|
||||
static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
|
||||
// 'left' is contiguous so we can reuse the top summation.
|
||||
DC8uvNoLeft(dst, left);
|
||||
DC8uvNoLeft_SSE2(dst, left);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) {
|
||||
Put8x8uv(0x80, dst);
|
||||
static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
|
||||
Put8x8uv_SSE2(0x80, dst);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
if (top != NULL) {
|
||||
if (left != NULL) { // top and left present
|
||||
DC8uv(dst, left, top);
|
||||
DC8uv_SSE2(dst, left, top);
|
||||
} else { // top, but no left
|
||||
DC8uvNoLeft(dst, top);
|
||||
DC8uvNoLeft_SSE2(dst, top);
|
||||
}
|
||||
} else if (left != NULL) { // left but no top
|
||||
DC8uvNoTop(dst, left);
|
||||
DC8uvNoTop_SSE2(dst, left);
|
||||
} else { // no top, no left, nothing.
|
||||
DC8uvNoTopLeft(dst);
|
||||
DC8uvNoTopLeft_SSE2(dst);
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
const __m128i top_row = _mm_load_si128((const __m128i*)top);
|
||||
const __m128i left_row = _mm_load_si128((const __m128i*)left);
|
||||
const int DC =
|
||||
VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
|
||||
Put16(DC >> 5, dst);
|
||||
Put16_SSE2(DC >> 5, dst);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
|
||||
static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
|
||||
const __m128i top_row = _mm_load_si128((const __m128i*)top);
|
||||
const int DC = VP8HorizontalAdd8b(&top_row) + 8;
|
||||
Put16(DC >> 4, dst);
|
||||
Put16_SSE2(DC >> 4, dst);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
|
||||
static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
|
||||
// 'left' is contiguous so we can reuse the top summation.
|
||||
DC16NoLeft(dst, left);
|
||||
DC16NoLeft_SSE2(dst, left);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
|
||||
Put16(0x80, dst);
|
||||
static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
|
||||
Put16_SSE2(0x80, dst);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
if (top != NULL) {
|
||||
if (left != NULL) { // top and left present
|
||||
DC16(dst, left, top);
|
||||
DC16_SSE2(dst, left, top);
|
||||
} else { // top, but no left
|
||||
DC16NoLeft(dst, top);
|
||||
DC16NoLeft_SSE2(dst, top);
|
||||
}
|
||||
} else if (left != NULL) { // left but no top
|
||||
DC16NoTop(dst, left);
|
||||
DC16NoTop_SSE2(dst, left);
|
||||
} else { // no top, no left, nothing.
|
||||
DC16NoTopLeft(dst);
|
||||
DC16NoTopLeft_SSE2(dst);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -709,7 +712,8 @@ static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
|
|||
// where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
|
||||
// and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
|
||||
|
||||
static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
|
||||
static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
|
||||
const uint8_t* top) { // vertical
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
|
||||
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
|
||||
|
@ -725,7 +729,8 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
|
|||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
|
||||
static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
|
||||
const uint8_t* top) { // horizontal
|
||||
const int X = top[-1];
|
||||
const int I = top[-2];
|
||||
const int J = top[-3];
|
||||
|
@ -737,14 +742,15 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
|
|||
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
|
||||
static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
|
||||
uint32_t dc = 4;
|
||||
int i;
|
||||
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
|
||||
Fill(dst, dc >> 3, 4);
|
||||
Fill_SSE2(dst, dc >> 3, 4);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left
|
||||
static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
|
||||
const uint8_t* top) { // Down-Left
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
|
||||
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
|
||||
|
@ -760,8 +766,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left
|
|||
WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VR4(uint8_t* dst,
|
||||
const uint8_t* top) { // Vertical-Right
|
||||
static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
|
||||
const uint8_t* top) { // Vertical-Right
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const int I = top[-2];
|
||||
const int J = top[-3];
|
||||
|
@ -786,8 +792,8 @@ static WEBP_INLINE void VR4(uint8_t* dst,
|
|||
DST(0, 3) = AVG3(K, J, I);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VL4(uint8_t* dst,
|
||||
const uint8_t* top) { // Vertical-Left
|
||||
static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
|
||||
const uint8_t* top) { // Vertical-Left
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
|
||||
const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
|
||||
|
@ -812,7 +818,8 @@ static WEBP_INLINE void VL4(uint8_t* dst,
|
|||
DST(3, 3) = (extra_out >> 8) & 0xff;
|
||||
}
|
||||
|
||||
static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right
|
||||
static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
|
||||
const uint8_t* top) { // Down-right
|
||||
const __m128i one = _mm_set1_epi8(1);
|
||||
const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
|
||||
const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
|
||||
|
@ -828,7 +835,7 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right
|
|||
WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
|
||||
}
|
||||
|
||||
static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
|
||||
static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
|
||||
const int I = top[-2];
|
||||
const int J = top[-3];
|
||||
const int K = top[-4];
|
||||
|
@ -843,7 +850,7 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
|
|||
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
|
||||
}
|
||||
|
||||
static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
|
||||
static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
|
||||
const int X = top[-1];
|
||||
const int I = top[-2];
|
||||
const int J = top[-3];
|
||||
|
@ -866,7 +873,7 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
|
|||
DST(1, 3) = AVG3(L, K, J);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
|
||||
static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
|
||||
const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
|
||||
|
@ -888,55 +895,56 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
|
|||
|
||||
// Left samples are top[-5 .. -2], top_left is top[-1], top are
|
||||
// located at top[0..3], and top right is top[4..7]
|
||||
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
|
||||
DC4(I4DC4 + dst, top);
|
||||
TM4(I4TM4 + dst, top);
|
||||
VE4(I4VE4 + dst, top);
|
||||
HE4(I4HE4 + dst, top);
|
||||
RD4(I4RD4 + dst, top);
|
||||
VR4(I4VR4 + dst, top);
|
||||
LD4(I4LD4 + dst, top);
|
||||
VL4(I4VL4 + dst, top);
|
||||
HD4(I4HD4 + dst, top);
|
||||
HU4(I4HU4 + dst, top);
|
||||
static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
|
||||
DC4_SSE2(I4DC4 + dst, top);
|
||||
TM4_SSE2(I4TM4 + dst, top);
|
||||
VE4_SSE2(I4VE4 + dst, top);
|
||||
HE4_SSE2(I4HE4 + dst, top);
|
||||
RD4_SSE2(I4RD4 + dst, top);
|
||||
VR4_SSE2(I4VR4 + dst, top);
|
||||
LD4_SSE2(I4LD4 + dst, top);
|
||||
VL4_SSE2(I4VL4 + dst, top);
|
||||
HD4_SSE2(I4HD4 + dst, top);
|
||||
HU4_SSE2(I4HU4 + dst, top);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Chroma 8x8 prediction (paragraph 12.2)
|
||||
|
||||
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
|
||||
const uint8_t* top) {
|
||||
// U block
|
||||
DC8uvMode(C8DC8 + dst, left, top);
|
||||
VerticalPred(C8VE8 + dst, top, 8);
|
||||
HorizontalPred(C8HE8 + dst, left, 8);
|
||||
TrueMotion(C8TM8 + dst, left, top, 8);
|
||||
DC8uvMode_SSE2(C8DC8 + dst, left, top);
|
||||
VerticalPred_SSE2(C8VE8 + dst, top, 8);
|
||||
HorizontalPred_SSE2(C8HE8 + dst, left, 8);
|
||||
TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
|
||||
// V block
|
||||
dst += 8;
|
||||
if (top != NULL) top += 8;
|
||||
if (left != NULL) left += 16;
|
||||
DC8uvMode(C8DC8 + dst, left, top);
|
||||
VerticalPred(C8VE8 + dst, top, 8);
|
||||
HorizontalPred(C8HE8 + dst, left, 8);
|
||||
TrueMotion(C8TM8 + dst, left, top, 8);
|
||||
DC8uvMode_SSE2(C8DC8 + dst, left, top);
|
||||
VerticalPred_SSE2(C8VE8 + dst, top, 8);
|
||||
HorizontalPred_SSE2(C8HE8 + dst, left, 8);
|
||||
TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// luma 16x16 prediction (paragraph 12.3)
|
||||
|
||||
static void Intra16Preds(uint8_t* dst,
|
||||
const uint8_t* left, const uint8_t* top) {
|
||||
DC16Mode(I16DC16 + dst, left, top);
|
||||
VerticalPred(I16VE16 + dst, top, 16);
|
||||
HorizontalPred(I16HE16 + dst, left, 16);
|
||||
TrueMotion(I16TM16 + dst, left, top, 16);
|
||||
static void Intra16Preds_SSE2(uint8_t* dst,
|
||||
const uint8_t* left, const uint8_t* top) {
|
||||
DC16Mode_SSE2(I16DC16 + dst, left, top);
|
||||
VerticalPred_SSE2(I16VE16 + dst, top, 16);
|
||||
HorizontalPred_SSE2(I16HE16 + dst, left, 16);
|
||||
TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Metric
|
||||
|
||||
static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
|
||||
__m128i* const sum) {
|
||||
static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
|
||||
const __m128i b,
|
||||
__m128i* const sum) {
|
||||
// take abs(a-b) in 8b
|
||||
const __m128i a_b = _mm_subs_epu8(a, b);
|
||||
const __m128i b_a = _mm_subs_epu8(b, a);
|
||||
|
@ -951,8 +959,8 @@ static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
|
|||
*sum = _mm_add_epi32(sum1, sum2);
|
||||
}
|
||||
|
||||
static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
|
||||
int num_pairs) {
|
||||
static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
|
||||
int num_pairs) {
|
||||
__m128i sum = _mm_setzero_si128();
|
||||
int32_t tmp[4];
|
||||
int i;
|
||||
|
@ -963,8 +971,8 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
|
|||
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
|
||||
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
|
||||
__m128i sum1, sum2;
|
||||
SubtractAndAccumulate(a0, b0, &sum1);
|
||||
SubtractAndAccumulate(a1, b1, &sum2);
|
||||
SubtractAndAccumulate_SSE2(a0, b0, &sum1);
|
||||
SubtractAndAccumulate_SSE2(a1, b1, &sum2);
|
||||
sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
|
||||
a += 2 * BPS;
|
||||
b += 2 * BPS;
|
||||
|
@ -973,18 +981,18 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
|
|||
return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
|
||||
}
|
||||
|
||||
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
|
||||
return SSE_16xN(a, b, 8);
|
||||
static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
|
||||
return SSE_16xN_SSE2(a, b, 8);
|
||||
}
|
||||
|
||||
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
|
||||
return SSE_16xN(a, b, 4);
|
||||
static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
|
||||
return SSE_16xN_SSE2(a, b, 4);
|
||||
}
|
||||
|
||||
#define LOAD_8x16b(ptr) \
|
||||
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
|
||||
|
||||
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
int num_pairs = 4;
|
||||
__m128i sum = zero;
|
||||
|
@ -1011,7 +1019,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
|
|||
}
|
||||
#undef LOAD_8x16b
|
||||
|
||||
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
||||
static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
// Load values. Note that we read 8 pixels instead of 4,
|
||||
|
@ -1048,7 +1056,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
|
||||
static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
|
||||
const __m128i mask = _mm_set1_epi16(0x00ff);
|
||||
const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
|
||||
const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
|
||||
|
@ -1086,8 +1094,8 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
|
|||
// Hadamard transform
|
||||
// Returns the weighted sum of the absolute value of transformed coefficients.
|
||||
// w[] contains a row-major 4 by 4 symmetric matrix.
|
||||
static int TTransform(const uint8_t* inA, const uint8_t* inB,
|
||||
const uint16_t* const w) {
|
||||
static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
|
||||
const uint16_t* const w) {
|
||||
int32_t sum[4];
|
||||
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
@ -1187,19 +1195,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
|
|||
return sum[0] + sum[1] + sum[2] + sum[3];
|
||||
}
|
||||
|
||||
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
const int diff_sum = TTransform(a, b, w);
|
||||
static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
const int diff_sum = TTransform_SSE2(a, b, w);
|
||||
return abs(diff_sum) >> 5;
|
||||
}
|
||||
|
||||
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
int D = 0;
|
||||
int x, y;
|
||||
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
|
||||
for (x = 0; x < 16; x += 4) {
|
||||
D += Disto4x4(a + x + y, b + x + y, w);
|
||||
D += Disto4x4_SSE2(a + x + y, b + x + y, w);
|
||||
}
|
||||
}
|
||||
return D;
|
||||
|
@ -1209,9 +1217,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
|||
// Quantization
|
||||
//
|
||||
|
||||
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
|
||||
const uint16_t* const sharpen,
|
||||
const VP8Matrix* const mtx) {
|
||||
static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
|
||||
const uint16_t* const sharpen,
|
||||
const VP8Matrix* const mtx) {
|
||||
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128i coeff0, coeff8;
|
||||
|
@ -1321,22 +1329,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
|
|||
return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
|
||||
}
|
||||
|
||||
static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
|
||||
static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
|
||||
}
|
||||
|
||||
static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
return DoQuantizeBlock(in, out, NULL, mtx);
|
||||
static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
|
||||
}
|
||||
|
||||
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
int nz;
|
||||
const uint16_t* const sharpen = &mtx->sharpen_[0];
|
||||
nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
|
||||
nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
|
||||
nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
|
||||
nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
|
||||
return nz;
|
||||
}
|
||||
|
||||
|
@ -1346,139 +1354,28 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
|||
extern void VP8EncDspInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
|
||||
VP8CollectHistogram = CollectHistogram;
|
||||
VP8EncPredLuma16 = Intra16Preds;
|
||||
VP8EncPredChroma8 = IntraChromaPreds;
|
||||
VP8EncPredLuma4 = Intra4Preds;
|
||||
VP8EncQuantizeBlock = QuantizeBlock;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks;
|
||||
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
|
||||
VP8ITransform = ITransform;
|
||||
VP8FTransform = FTransform;
|
||||
VP8FTransform2 = FTransform2;
|
||||
VP8FTransformWHT = FTransformWHT;
|
||||
VP8SSE16x16 = SSE16x16;
|
||||
VP8SSE16x8 = SSE16x8;
|
||||
VP8SSE8x8 = SSE8x8;
|
||||
VP8SSE4x4 = SSE4x4;
|
||||
VP8TDisto4x4 = Disto4x4;
|
||||
VP8TDisto16x16 = Disto16x16;
|
||||
VP8Mean16x4 = Mean16x4;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// SSIM / PSNR entry point (TODO(skal): move to its own file later)
|
||||
|
||||
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
|
||||
const uint8_t* src2, int len) {
|
||||
int i = 0;
|
||||
uint32_t sse2 = 0;
|
||||
if (len >= 16) {
|
||||
const int limit = len - 32;
|
||||
int32_t tmp[4];
|
||||
__m128i sum1;
|
||||
__m128i sum = _mm_setzero_si128();
|
||||
__m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
|
||||
__m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
|
||||
i += 16;
|
||||
while (i <= limit) {
|
||||
const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
|
||||
const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
|
||||
__m128i sum2;
|
||||
i += 16;
|
||||
SubtractAndAccumulate(a0, b0, &sum1);
|
||||
sum = _mm_add_epi32(sum, sum1);
|
||||
a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
|
||||
b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
|
||||
i += 16;
|
||||
SubtractAndAccumulate(a1, b1, &sum2);
|
||||
sum = _mm_add_epi32(sum, sum2);
|
||||
}
|
||||
SubtractAndAccumulate(a0, b0, &sum1);
|
||||
sum = _mm_add_epi32(sum, sum1);
|
||||
_mm_storeu_si128((__m128i*)tmp, sum);
|
||||
sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
|
||||
}
|
||||
|
||||
for (; i < len; ++i) {
|
||||
const int32_t diff = src1[i] - src2[i];
|
||||
sse2 += diff * diff;
|
||||
}
|
||||
return sse2;
|
||||
}
|
||||
|
||||
static uint32_t HorizontalAdd16b(const __m128i* const m) {
|
||||
uint16_t tmp[8];
|
||||
const __m128i a = _mm_srli_si128(*m, 8);
|
||||
const __m128i b = _mm_add_epi16(*m, a);
|
||||
_mm_storeu_si128((__m128i*)tmp, b);
|
||||
return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
|
||||
}
|
||||
|
||||
static uint32_t HorizontalAdd32b(const __m128i* const m) {
|
||||
const __m128i a = _mm_srli_si128(*m, 8);
|
||||
const __m128i b = _mm_add_epi32(*m, a);
|
||||
const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
|
||||
return (uint32_t)_mm_cvtsi128_si32(c);
|
||||
}
|
||||
|
||||
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
|
||||
|
||||
#define ACCUMULATE_ROW(WEIGHT) do { \
|
||||
/* compute row weight (Wx * Wy) */ \
|
||||
const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
|
||||
const __m128i W = _mm_mullo_epi16(Wx, Wy); \
|
||||
/* process 8 bytes at a time (7 bytes, actually) */ \
|
||||
const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
|
||||
const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
|
||||
/* convert to 16b and multiply by weight */ \
|
||||
const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
|
||||
const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
|
||||
const __m128i wa1 = _mm_mullo_epi16(a1, W); \
|
||||
const __m128i wb1 = _mm_mullo_epi16(b1, W); \
|
||||
/* accumulate */ \
|
||||
xm = _mm_add_epi16(xm, wa1); \
|
||||
ym = _mm_add_epi16(ym, wb1); \
|
||||
xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
|
||||
xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
|
||||
yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
|
||||
src1 += stride1; \
|
||||
src2 += stride2; \
|
||||
} while (0)
|
||||
|
||||
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
|
||||
const uint8_t* src2, int stride2) {
|
||||
VP8DistoStats stats;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128i xm = zero, ym = zero; // 16b accums
|
||||
__m128i xxm = zero, yym = zero, xym = zero; // 32b accum
|
||||
const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
|
||||
assert(2 * VP8_SSIM_KERNEL + 1 == 7);
|
||||
ACCUMULATE_ROW(1);
|
||||
ACCUMULATE_ROW(2);
|
||||
ACCUMULATE_ROW(3);
|
||||
ACCUMULATE_ROW(4);
|
||||
ACCUMULATE_ROW(3);
|
||||
ACCUMULATE_ROW(2);
|
||||
ACCUMULATE_ROW(1);
|
||||
stats.xm = HorizontalAdd16b(&xm);
|
||||
stats.ym = HorizontalAdd16b(&ym);
|
||||
stats.xxm = HorizontalAdd32b(&xxm);
|
||||
stats.xym = HorizontalAdd32b(&xym);
|
||||
stats.yym = HorizontalAdd32b(&yym);
|
||||
return VP8SSIMFromStats(&stats);
|
||||
}
|
||||
|
||||
extern void VP8SSIMDspInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
|
||||
VP8AccumulateSSE = AccumulateSSE_SSE2;
|
||||
VP8SSIMGet = SSIMGet_SSE2;
|
||||
VP8CollectHistogram = CollectHistogram_SSE2;
|
||||
VP8EncPredLuma16 = Intra16Preds_SSE2;
|
||||
VP8EncPredChroma8 = IntraChromaPreds_SSE2;
|
||||
VP8EncPredLuma4 = Intra4Preds_SSE2;
|
||||
VP8EncQuantizeBlock = QuantizeBlock_SSE2;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
|
||||
VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
|
||||
VP8ITransform = ITransform_SSE2;
|
||||
VP8FTransform = FTransform_SSE2;
|
||||
VP8FTransform2 = FTransform2_SSE2;
|
||||
VP8FTransformWHT = FTransformWHT_SSE2;
|
||||
VP8SSE16x16 = SSE16x16_SSE2;
|
||||
VP8SSE16x8 = SSE16x8_SSE2;
|
||||
VP8SSE8x8 = SSE8x8_SSE2;
|
||||
VP8SSE4x4 = SSE4x4_SSE2;
|
||||
VP8TDisto4x4 = Disto4x4_SSE2;
|
||||
VP8TDisto16x16 = Disto16x16_SSE2;
|
||||
VP8Mean16x4 = Mean16x4_SSE2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE2
|
||||
|
||||
WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
|
||||
WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
|
||||
|
||||
#endif // WEBP_USE_SSE2
|
|
@ -11,21 +11,21 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
#include <smmintrin.h>
|
||||
#include <stdlib.h> // for abs()
|
||||
|
||||
#include "./common_sse2.h"
|
||||
#include "../enc/vp8i_enc.h"
|
||||
#include "src/dsp/common_sse2.h"
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Compute susceptibility based on DCT-coeff histograms.
|
||||
|
||||
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
|
||||
int start_block, int end_block,
|
||||
VP8Histogram* const histo) {
|
||||
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
|
||||
int j;
|
||||
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
|
||||
|
@ -70,8 +70,8 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
|
|||
// Hadamard transform
|
||||
// Returns the weighted sum of the absolute value of transformed coefficients.
|
||||
// w[] contains a row-major 4 by 4 symmetric matrix.
|
||||
static int TTransform(const uint8_t* inA, const uint8_t* inB,
|
||||
const uint16_t* const w) {
|
||||
static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
|
||||
const uint16_t* const w) {
|
||||
int32_t sum[4];
|
||||
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
|
||||
|
||||
|
@ -168,19 +168,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
|
|||
return sum[0] + sum[1] + sum[2] + sum[3];
|
||||
}
|
||||
|
||||
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
const int diff_sum = TTransform(a, b, w);
|
||||
static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
const int diff_sum = TTransform_SSE41(a, b, w);
|
||||
return abs(diff_sum) >> 5;
|
||||
}
|
||||
|
||||
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
|
||||
const uint16_t* const w) {
|
||||
int D = 0;
|
||||
int x, y;
|
||||
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
|
||||
for (x = 0; x < 16; x += 4) {
|
||||
D += Disto4x4(a + x + y, b + x + y, w);
|
||||
D += Disto4x4_SSE41(a + x + y, b + x + y, w);
|
||||
}
|
||||
}
|
||||
return D;
|
||||
|
@ -197,9 +197,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
|||
2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
|
||||
2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
|
||||
|
||||
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
|
||||
const uint16_t* const sharpen,
|
||||
const VP8Matrix* const mtx) {
|
||||
static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
|
||||
const uint16_t* const sharpen,
|
||||
const VP8Matrix* const mtx) {
|
||||
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128i out0, out8;
|
||||
|
@ -300,22 +300,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
|
|||
|
||||
#undef PSHUFB_CST
|
||||
|
||||
static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
|
||||
static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
|
||||
}
|
||||
|
||||
static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
return DoQuantizeBlock(in, out, NULL, mtx);
|
||||
static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
|
||||
const VP8Matrix* const mtx) {
|
||||
return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
|
||||
}
|
||||
|
||||
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
|
||||
const VP8Matrix* const mtx) {
|
||||
int nz;
|
||||
const uint16_t* const sharpen = &mtx->sharpen_[0];
|
||||
nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
|
||||
nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
|
||||
nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
|
||||
nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
|
||||
return nz;
|
||||
}
|
||||
|
||||
|
@ -324,12 +324,12 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
|||
|
||||
extern void VP8EncDspInitSSE41(void);
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
|
||||
VP8CollectHistogram = CollectHistogram;
|
||||
VP8EncQuantizeBlock = QuantizeBlock;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks;
|
||||
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
|
||||
VP8TDisto4x4 = Disto4x4;
|
||||
VP8TDisto16x16 = Disto16x16;
|
||||
VP8CollectHistogram = CollectHistogram_SSE41;
|
||||
VP8EncQuantizeBlock = QuantizeBlock_SSE41;
|
||||
VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
|
||||
VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
|
||||
VP8TDisto4x4 = Disto4x4_SSE41;
|
||||
VP8TDisto16x16 = Disto16x16_SSE41;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE41
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// Author: Urvang (urvang@google.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
@ -20,16 +20,17 @@
|
|||
// Helpful macro.
|
||||
|
||||
# define SANITY_CHECK(in, out) \
|
||||
assert(in != NULL); \
|
||||
assert(out != NULL); \
|
||||
assert((in) != NULL); \
|
||||
assert((out) != NULL); \
|
||||
assert(width > 0); \
|
||||
assert(height > 0); \
|
||||
assert(stride >= width); \
|
||||
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
|
||||
(void)height; // Silence unused warning.
|
||||
|
||||
static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
|
||||
uint8_t* dst, int length, int inverse) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
|
||||
uint8_t* dst, int length, int inverse) {
|
||||
int i;
|
||||
if (inverse) {
|
||||
for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
|
||||
|
@ -41,7 +42,44 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
|
|||
//------------------------------------------------------------------------------
|
||||
// Horizontal filter.
|
||||
|
||||
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
|
||||
static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows,
|
||||
int inverse, uint8_t* out) {
|
||||
const uint8_t* preds;
|
||||
const size_t start_offset = row * stride;
|
||||
const int last_row = row + num_rows;
|
||||
SANITY_CHECK(in, out);
|
||||
in += start_offset;
|
||||
out += start_offset;
|
||||
preds = inverse ? out : in;
|
||||
|
||||
if (row == 0) {
|
||||
// Leftmost pixel is the same as input for topmost scanline.
|
||||
out[0] = in[0];
|
||||
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
|
||||
row = 1;
|
||||
preds += stride;
|
||||
in += stride;
|
||||
out += stride;
|
||||
}
|
||||
|
||||
// Filter line-by-line.
|
||||
while (row < last_row) {
|
||||
// Leftmost pixel is predicted from above.
|
||||
PredictLine_C(in, preds - stride, out, 1, inverse);
|
||||
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
|
||||
++row;
|
||||
preds += stride;
|
||||
in += stride;
|
||||
out += stride;
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Vertical filter.
|
||||
|
||||
static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows,
|
||||
int inverse, uint8_t* out) {
|
||||
|
@ -53,48 +91,11 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
|
|||
out += start_offset;
|
||||
preds = inverse ? out : in;
|
||||
|
||||
if (row == 0) {
|
||||
// Leftmost pixel is the same as input for topmost scanline.
|
||||
out[0] = in[0];
|
||||
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
|
||||
row = 1;
|
||||
preds += stride;
|
||||
in += stride;
|
||||
out += stride;
|
||||
}
|
||||
|
||||
// Filter line-by-line.
|
||||
while (row < last_row) {
|
||||
// Leftmost pixel is predicted from above.
|
||||
PredictLine(in, preds - stride, out, 1, inverse);
|
||||
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
|
||||
++row;
|
||||
preds += stride;
|
||||
in += stride;
|
||||
out += stride;
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Vertical filter.
|
||||
|
||||
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows,
|
||||
int inverse, uint8_t* out) {
|
||||
const uint8_t* preds;
|
||||
const size_t start_offset = row * stride;
|
||||
const int last_row = row + num_rows;
|
||||
SANITY_CHECK(in, out);
|
||||
in += start_offset;
|
||||
out += start_offset;
|
||||
preds = inverse ? out : in;
|
||||
|
||||
if (row == 0) {
|
||||
// Very first top-left pixel is copied.
|
||||
out[0] = in[0];
|
||||
// Rest of top scan-line is left-predicted.
|
||||
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
|
||||
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
|
||||
row = 1;
|
||||
in += stride;
|
||||
out += stride;
|
||||
|
@ -105,26 +106,28 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
|
|||
|
||||
// Filter line-by-line.
|
||||
while (row < last_row) {
|
||||
PredictLine(in, preds, out, width, inverse);
|
||||
PredictLine_C(in, preds, out, width, inverse);
|
||||
++row;
|
||||
preds += stride;
|
||||
in += stride;
|
||||
out += stride;
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Gradient filter.
|
||||
|
||||
static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
|
||||
static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
|
||||
const int g = a + b - c;
|
||||
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows,
|
||||
int inverse, uint8_t* out) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows,
|
||||
int inverse, uint8_t* out) {
|
||||
const uint8_t* preds;
|
||||
const size_t start_offset = row * stride;
|
||||
const int last_row = row + num_rows;
|
||||
|
@ -136,7 +139,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
|
|||
// left prediction for top scan-line
|
||||
if (row == 0) {
|
||||
out[0] = in[0];
|
||||
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
|
||||
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
|
||||
row = 1;
|
||||
preds += stride;
|
||||
in += stride;
|
||||
|
@ -147,11 +150,11 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
|
|||
while (row < last_row) {
|
||||
int w;
|
||||
// leftmost pixel: predict from above.
|
||||
PredictLine(in, preds - stride, out, 1, inverse);
|
||||
PredictLine_C(in, preds - stride, out, 1, inverse);
|
||||
for (w = 1; w < width; ++w) {
|
||||
const int pred = GradientPredictor(preds[w - 1],
|
||||
preds[w - stride],
|
||||
preds[w - stride - 1]);
|
||||
const int pred = GradientPredictor_C(preds[w - 1],
|
||||
preds[w - stride],
|
||||
preds[w - stride - 1]);
|
||||
out[w] = in[w] + (inverse ? pred : -pred);
|
||||
}
|
||||
++row;
|
||||
|
@ -160,32 +163,34 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
|
|||
out += stride;
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
#undef SANITY_CHECK
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void HorizontalFilter(const uint8_t* data, int width, int height,
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void HorizontalFilter_C(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
|
||||
filtered_data);
|
||||
}
|
||||
|
||||
static void VerticalFilter_C(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
|
||||
DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
|
||||
}
|
||||
|
||||
static void VerticalFilter(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
|
||||
static void GradientFilter_C(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
|
||||
}
|
||||
|
||||
|
||||
static void GradientFilter(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
|
||||
}
|
||||
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
uint8_t pred = (prev == NULL) ? 0 : prev[0];
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
|
@ -194,26 +199,28 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
|
|||
}
|
||||
}
|
||||
|
||||
static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
if (prev == NULL) {
|
||||
HorizontalUnfilter(NULL, in, out, width);
|
||||
HorizontalUnfilter_C(NULL, in, out, width);
|
||||
} else {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
if (prev == NULL) {
|
||||
HorizontalUnfilter(NULL, in, out, width);
|
||||
HorizontalUnfilter_C(NULL, in, out, width);
|
||||
} else {
|
||||
uint8_t top = prev[0], top_left = top, left = top;
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
top = prev[i]; // need to read this first, in case prev==out
|
||||
left = in[i] + GradientPredictor(left, top, top_left);
|
||||
left = in[i] + GradientPredictor_C(left, top, top_left);
|
||||
top_left = top;
|
||||
out[i] = left;
|
||||
}
|
||||
|
@ -238,14 +245,18 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
|
|||
if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
|
||||
|
||||
WebPUnfilters[WEBP_FILTER_NONE] = NULL;
|
||||
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
|
||||
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
|
||||
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
|
||||
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C;
|
||||
#endif
|
||||
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C;
|
||||
|
||||
WebPFilters[WEBP_FILTER_NONE] = NULL;
|
||||
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
|
||||
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
|
||||
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C;
|
||||
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C;
|
||||
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C;
|
||||
#endif
|
||||
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
@ -253,11 +264,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
|
|||
VP8FiltersInitSSE2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (VP8GetCPUInfo(kNEON)) {
|
||||
VP8FiltersInitNEON();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
if (VP8GetCPUInfo(kMIPSdspR2)) {
|
||||
VP8FiltersInitMIPSdspR2();
|
||||
|
@ -269,5 +275,20 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
|
|||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
VP8FiltersInitNEON();
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL);
|
||||
assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL);
|
||||
assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL);
|
||||
assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
|
||||
assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
|
||||
assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
|
||||
|
||||
filters_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
}
|
|
@ -12,11 +12,11 @@
|
|||
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
|
||||
// Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
|
||||
#include "../dsp/dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
@ -101,8 +101,8 @@
|
|||
); \
|
||||
} while (0)
|
||||
|
||||
static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
|
||||
int length) {
|
||||
static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
|
||||
int length) {
|
||||
DO_PREDICT_LINE(src, dst, length, 0);
|
||||
}
|
||||
|
||||
|
@ -192,10 +192,11 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows,
|
||||
uint8_t* out) {
|
||||
static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
|
||||
int width, int height,
|
||||
int stride,
|
||||
int row, int num_rows,
|
||||
uint8_t* out) {
|
||||
const uint8_t* preds;
|
||||
const size_t start_offset = row * stride;
|
||||
const int last_row = row + num_rows;
|
||||
|
@ -207,7 +208,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
|
|||
if (row == 0) {
|
||||
// Leftmost pixel is the same as input for topmost scanline.
|
||||
out[0] = in[0];
|
||||
PredictLine(in + 1, out + 1, width - 1);
|
||||
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
|
||||
row = 1;
|
||||
preds += stride;
|
||||
in += stride;
|
||||
|
@ -219,9 +220,11 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
|
|||
}
|
||||
#undef FILTER_LINE_BY_LINE
|
||||
|
||||
static void HorizontalFilter(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
|
||||
static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
|
||||
int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
|
||||
filtered_data);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
@ -237,9 +240,11 @@ static void HorizontalFilter(const uint8_t* data, int width, int height,
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows, uint8_t* out) {
|
||||
static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
|
||||
int width, int height,
|
||||
int stride,
|
||||
int row, int num_rows,
|
||||
uint8_t* out) {
|
||||
const uint8_t* preds;
|
||||
const size_t start_offset = row * stride;
|
||||
const int last_row = row + num_rows;
|
||||
|
@ -252,7 +257,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
|
|||
// Very first top-left pixel is copied.
|
||||
out[0] = in[0];
|
||||
// Rest of top scan-line is left-predicted.
|
||||
PredictLine(in + 1, out + 1, width - 1);
|
||||
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
|
||||
row = 1;
|
||||
in += stride;
|
||||
out += stride;
|
||||
|
@ -266,15 +271,16 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
|
|||
}
|
||||
#undef FILTER_LINE_BY_LINE
|
||||
|
||||
static void VerticalFilter(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
|
||||
static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
|
||||
filtered_data);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Gradient filter.
|
||||
|
||||
static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
|
||||
static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
|
||||
int temp0;
|
||||
__asm__ volatile (
|
||||
"addu %[temp0], %[a], %[b] \n\t"
|
||||
|
@ -293,9 +299,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
|
|||
int w; \
|
||||
PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \
|
||||
for (w = 1; w < width; ++w) { \
|
||||
const int pred = GradientPredictor(PREDS[w - 1], \
|
||||
PREDS[w - stride], \
|
||||
PREDS[w - stride - 1]); \
|
||||
const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1], \
|
||||
PREDS[w - stride], \
|
||||
PREDS[w - stride - 1]); \
|
||||
out[w] = in[w] OPERATION pred; \
|
||||
} \
|
||||
++row; \
|
||||
|
@ -304,9 +310,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows, uint8_t* out) {
|
||||
static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows, uint8_t* out) {
|
||||
const uint8_t* preds;
|
||||
const size_t start_offset = row * stride;
|
||||
const int last_row = row + num_rows;
|
||||
|
@ -318,7 +324,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
|
|||
// left prediction for top scan-line
|
||||
if (row == 0) {
|
||||
out[0] = in[0];
|
||||
PredictLine(in + 1, out + 1, width - 1);
|
||||
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
|
||||
row = 1;
|
||||
preds += stride;
|
||||
in += stride;
|
||||
|
@ -330,38 +336,39 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
|
|||
}
|
||||
#undef FILTER_LINE_BY_LINE
|
||||
|
||||
static void GradientFilter(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
|
||||
static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
|
||||
filtered_data);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
|
||||
DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
|
||||
}
|
||||
|
||||
static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
if (prev == NULL) {
|
||||
HorizontalUnfilter(NULL, in, out, width);
|
||||
HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
|
||||
} else {
|
||||
DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
|
||||
}
|
||||
}
|
||||
|
||||
static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
if (prev == NULL) {
|
||||
HorizontalUnfilter(NULL, in, out, width);
|
||||
HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
|
||||
} else {
|
||||
uint8_t top = prev[0], top_left = top, left = top;
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
top = prev[i]; // need to read this first, in case prev==dst
|
||||
left = in[i] + GradientPredictor(left, top, top_left);
|
||||
left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
|
||||
top_left = top;
|
||||
out[i] = left;
|
||||
}
|
||||
|
@ -379,13 +386,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
|
|||
extern void VP8FiltersInitMIPSdspR2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
|
||||
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
|
||||
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
|
||||
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
|
||||
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
|
||||
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
|
||||
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;
|
||||
|
||||
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
|
||||
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
|
||||
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
|
||||
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
|
||||
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
|
||||
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS_DSP_R2
|
|
@ -11,11 +11,11 @@
|
|||
//
|
||||
// Author: Prashant Patil (prashant.patil@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MSA)
|
||||
|
||||
#include "./msa_macro.h"
|
||||
#include "src/dsp/msa_macro.h"
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
|
@ -66,8 +66,8 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
|
|||
//------------------------------------------------------------------------------
|
||||
// Horrizontal filter
|
||||
|
||||
static void HorizontalFilter(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
const uint8_t* preds = data;
|
||||
const uint8_t* in = data;
|
||||
uint8_t* out = filtered_data;
|
||||
|
@ -129,8 +129,8 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
|
|||
}
|
||||
|
||||
|
||||
static void GradientFilter(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
static void GradientFilter_MSA(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
const uint8_t* in = data;
|
||||
const uint8_t* preds = data;
|
||||
uint8_t* out = filtered_data;
|
||||
|
@ -157,8 +157,8 @@ static void GradientFilter(const uint8_t* data, int width, int height,
|
|||
//------------------------------------------------------------------------------
|
||||
// Vertical filter
|
||||
|
||||
static void VerticalFilter(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
const uint8_t* in = data;
|
||||
const uint8_t* preds = data;
|
||||
uint8_t* out = filtered_data;
|
||||
|
@ -190,9 +190,9 @@ static void VerticalFilter(const uint8_t* data, int width, int height,
|
|||
extern void VP8FiltersInitMSA(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
|
||||
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
|
||||
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
|
||||
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
|
||||
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
|
||||
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
|
||||
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MSA
|
|
@ -11,12 +11,12 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
|
||||
#include <assert.h>
|
||||
#include "./neon.h"
|
||||
#include "src/dsp/neon.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Helpful macros.
|
||||
|
@ -134,7 +134,7 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
|
|||
}
|
||||
|
||||
static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoVerticalFilter_NEON(data, width, height, stride, 0, height,
|
||||
filtered_data);
|
||||
}
|
||||
|
@ -196,7 +196,7 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
|
|||
}
|
||||
|
||||
static void GradientFilter_NEON(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoGradientFilter_NEON(data, width, height, stride, 0, height,
|
||||
filtered_data);
|
||||
}
|
||||
|
@ -251,9 +251,11 @@ static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
|
|||
// GradientUnfilter_NEON is correct but slower than the C-version,
|
||||
// at least on ARM64. For armv7, it's a wash.
|
||||
// So best is to disable it for now, but keep the idea around...
|
||||
// #define USE_GRADIENT_UNFILTER
|
||||
#if !defined(USE_GRADIENT_UNFILTER)
|
||||
#define USE_GRADIENT_UNFILTER 0 // ALTERNATE_CODE
|
||||
#endif
|
||||
|
||||
#if defined(USE_GRADIENT_UNFILTER)
|
||||
#if (USE_GRADIENT_UNFILTER == 1)
|
||||
#define GRAD_PROCESS_LANE(L) do { \
|
||||
const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1); /* rotate predictor in */ \
|
||||
const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1)); \
|
||||
|
@ -292,7 +294,7 @@ static void GradientPredictInverse_NEON(const uint8_t* const in,
|
|||
#undef GRAD_PROCESS_LANE
|
||||
|
||||
static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
uint8_t* out, int width) {
|
||||
if (prev == NULL) {
|
||||
HorizontalUnfilter_NEON(NULL, in, out, width);
|
||||
} else {
|
||||
|
@ -311,7 +313,7 @@ extern void VP8FiltersInitNEON(void);
|
|||
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
|
||||
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
|
||||
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
|
||||
#if defined(USE_GRADIENT_UNFILTER)
|
||||
#if (USE_GRADIENT_UNFILTER == 1)
|
||||
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
|
||||
#endif
|
||||
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
||||
|
@ -24,16 +24,16 @@
|
|||
// Helpful macro.
|
||||
|
||||
# define SANITY_CHECK(in, out) \
|
||||
assert(in != NULL); \
|
||||
assert(out != NULL); \
|
||||
assert((in) != NULL); \
|
||||
assert((out) != NULL); \
|
||||
assert(width > 0); \
|
||||
assert(height > 0); \
|
||||
assert(stride >= width); \
|
||||
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
|
||||
(void)height; // Silence unused warning.
|
||||
|
||||
static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
|
||||
uint8_t* dst, int length) {
|
||||
static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
|
||||
uint8_t* dst, int length) {
|
||||
int i;
|
||||
const int max_pos = length & ~31;
|
||||
assert(length >= 0);
|
||||
|
@ -51,7 +51,7 @@ static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
|
|||
}
|
||||
|
||||
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
|
||||
static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
|
||||
static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
|
||||
int i;
|
||||
const int max_pos = length & ~31;
|
||||
assert(length >= 0);
|
||||
|
@ -71,10 +71,11 @@ static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Horizontal filter.
|
||||
|
||||
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows,
|
||||
uint8_t* out) {
|
||||
static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
|
||||
int width, int height,
|
||||
int stride,
|
||||
int row, int num_rows,
|
||||
uint8_t* out) {
|
||||
const size_t start_offset = row * stride;
|
||||
const int last_row = row + num_rows;
|
||||
SANITY_CHECK(in, out);
|
||||
|
@ -84,7 +85,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
|
|||
if (row == 0) {
|
||||
// Leftmost pixel is the same as input for topmost scanline.
|
||||
out[0] = in[0];
|
||||
PredictLineLeft(in + 1, out + 1, width - 1);
|
||||
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
|
||||
row = 1;
|
||||
in += stride;
|
||||
out += stride;
|
||||
|
@ -94,7 +95,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
|
|||
while (row < last_row) {
|
||||
// Leftmost pixel is predicted from above.
|
||||
out[0] = in[0] - in[-stride];
|
||||
PredictLineLeft(in + 1, out + 1, width - 1);
|
||||
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
|
||||
++row;
|
||||
in += stride;
|
||||
out += stride;
|
||||
|
@ -104,9 +105,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
|
|||
//------------------------------------------------------------------------------
|
||||
// Vertical filter.
|
||||
|
||||
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows, uint8_t* out) {
|
||||
static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows,
|
||||
uint8_t* out) {
|
||||
const size_t start_offset = row * stride;
|
||||
const int last_row = row + num_rows;
|
||||
SANITY_CHECK(in, out);
|
||||
|
@ -117,7 +119,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
|
|||
// Very first top-left pixel is copied.
|
||||
out[0] = in[0];
|
||||
// Rest of top scan-line is left-predicted.
|
||||
PredictLineLeft(in + 1, out + 1, width - 1);
|
||||
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
|
||||
row = 1;
|
||||
in += stride;
|
||||
out += stride;
|
||||
|
@ -125,7 +127,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
|
|||
|
||||
// Filter line-by-line.
|
||||
while (row < last_row) {
|
||||
PredictLineTop(in, in - stride, out, width);
|
||||
PredictLineTop_SSE2(in, in - stride, out, width);
|
||||
++row;
|
||||
in += stride;
|
||||
out += stride;
|
||||
|
@ -135,14 +137,14 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
|
|||
//------------------------------------------------------------------------------
|
||||
// Gradient filter.
|
||||
|
||||
static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) {
|
||||
static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
|
||||
const int g = a + b - c;
|
||||
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
|
||||
}
|
||||
|
||||
static void GradientPredictDirect(const uint8_t* const row,
|
||||
const uint8_t* const top,
|
||||
uint8_t* const out, int length) {
|
||||
static void GradientPredictDirect_SSE2(const uint8_t* const row,
|
||||
const uint8_t* const top,
|
||||
uint8_t* const out, int length) {
|
||||
const int max_pos = length & ~7;
|
||||
int i;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
@ -161,14 +163,14 @@ static void GradientPredictDirect(const uint8_t* const row,
|
|||
_mm_storel_epi64((__m128i*)(out + i), H);
|
||||
}
|
||||
for (; i < length; ++i) {
|
||||
out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
|
||||
out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows,
|
||||
uint8_t* out) {
|
||||
static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
|
||||
int width, int height, int stride,
|
||||
int row, int num_rows,
|
||||
uint8_t* out) {
|
||||
const size_t start_offset = row * stride;
|
||||
const int last_row = row + num_rows;
|
||||
SANITY_CHECK(in, out);
|
||||
|
@ -178,7 +180,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
|
|||
// left prediction for top scan-line
|
||||
if (row == 0) {
|
||||
out[0] = in[0];
|
||||
PredictLineLeft(in + 1, out + 1, width - 1);
|
||||
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
|
||||
row = 1;
|
||||
in += stride;
|
||||
out += stride;
|
||||
|
@ -187,7 +189,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
|
|||
// Filter line-by-line.
|
||||
while (row < last_row) {
|
||||
out[0] = in[0] - in[-stride];
|
||||
GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
|
||||
GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
|
||||
++row;
|
||||
in += stride;
|
||||
out += stride;
|
||||
|
@ -198,26 +200,27 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void HorizontalFilter(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
|
||||
static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
|
||||
filtered_data);
|
||||
}
|
||||
|
||||
static void VerticalFilter(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
|
||||
static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
|
||||
}
|
||||
|
||||
static void GradientFilter(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
|
||||
static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
|
||||
int stride, uint8_t* filtered_data) {
|
||||
DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Inverse transforms
|
||||
|
||||
static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
int i;
|
||||
__m128i last;
|
||||
out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
|
||||
|
@ -238,10 +241,10 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
|
|||
for (; i < width; ++i) out[i] = in[i] + out[i - 1];
|
||||
}
|
||||
|
||||
static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
if (prev == NULL) {
|
||||
HorizontalUnfilter(NULL, in, out, width);
|
||||
HorizontalUnfilter_SSE2(NULL, in, out, width);
|
||||
} else {
|
||||
int i;
|
||||
const int max_pos = width & ~31;
|
||||
|
@ -260,9 +263,9 @@ static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
|
|||
}
|
||||
}
|
||||
|
||||
static void GradientPredictInverse(const uint8_t* const in,
|
||||
const uint8_t* const top,
|
||||
uint8_t* const row, int length) {
|
||||
static void GradientPredictInverse_SSE2(const uint8_t* const in,
|
||||
const uint8_t* const top,
|
||||
uint8_t* const row, int length) {
|
||||
if (length > 0) {
|
||||
int i;
|
||||
const int max_pos = length & ~7;
|
||||
|
@ -293,18 +296,18 @@ static void GradientPredictInverse(const uint8_t* const in,
|
|||
_mm_storel_epi64((__m128i*)&row[i], out);
|
||||
}
|
||||
for (; i < length; ++i) {
|
||||
row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
|
||||
row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
|
||||
uint8_t* out, int width) {
|
||||
if (prev == NULL) {
|
||||
HorizontalUnfilter(NULL, in, out, width);
|
||||
HorizontalUnfilter_SSE2(NULL, in, out, width);
|
||||
} else {
|
||||
out[0] = in[0] + prev[0]; // predict from above
|
||||
GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1);
|
||||
GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -314,13 +317,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
|
|||
extern void VP8FiltersInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
|
||||
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
|
||||
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
|
||||
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
|
||||
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
|
||||
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
|
||||
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
|
||||
|
||||
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
|
||||
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
|
||||
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
|
||||
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
|
||||
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_SSE2;
|
||||
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_SSE2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE2
|
|
@ -13,14 +13,15 @@
|
|||
// Jyrki Alakuijala (jyrki@google.com)
|
||||
// Urvang Joshi (urvang@google.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include "../dec/vp8li_dec.h"
|
||||
#include "../utils/endian_inl_utils.h"
|
||||
#include "./lossless.h"
|
||||
#include "./lossless_common.h"
|
||||
#include "src/dec/vp8li_dec.h"
|
||||
#include "src/utils/endian_inl_utils.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/lossless_common.h"
|
||||
|
||||
#define MAX_DIFF_COST (1e30f)
|
||||
|
||||
|
@ -80,8 +81,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
|
|||
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
|
||||
}
|
||||
|
||||
// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
|
||||
#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
|
||||
// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
|
||||
// inlined.
|
||||
#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409
|
||||
# define LOCAL_INLINE __attribute__ ((noinline))
|
||||
#else
|
||||
# define LOCAL_INLINE WEBP_INLINE
|
||||
|
@ -107,69 +109,69 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Predictors
|
||||
|
||||
static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
|
||||
(void)top;
|
||||
(void)left;
|
||||
return ARGB_BLACK;
|
||||
}
|
||||
static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
|
||||
(void)top;
|
||||
return left;
|
||||
}
|
||||
static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
|
||||
(void)left;
|
||||
return top[0];
|
||||
}
|
||||
static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
|
||||
(void)left;
|
||||
return top[1];
|
||||
}
|
||||
static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
|
||||
(void)left;
|
||||
return top[-1];
|
||||
}
|
||||
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average3(left, top[0], top[1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(left, top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(left, top[0]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(top[-1], top[0]);
|
||||
(void)left;
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(top[0], top[1]);
|
||||
(void)left;
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Select(top[0], left, top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
|
||||
return pred;
|
||||
}
|
||||
|
||||
GENERATE_PREDICTOR_ADD(Predictor0, PredictorAdd0)
|
||||
static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
|
||||
int num_pixels, uint32_t* out) {
|
||||
GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C)
|
||||
static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
|
||||
int num_pixels, uint32_t* out) {
|
||||
int i;
|
||||
uint32_t left = out[-1];
|
||||
for (i = 0; i < num_pixels; ++i) {
|
||||
|
@ -177,29 +179,29 @@ static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
|
|||
}
|
||||
(void)upper;
|
||||
}
|
||||
GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2)
|
||||
GENERATE_PREDICTOR_ADD(Predictor3, PredictorAdd3)
|
||||
GENERATE_PREDICTOR_ADD(Predictor4, PredictorAdd4)
|
||||
GENERATE_PREDICTOR_ADD(Predictor5, PredictorAdd5)
|
||||
GENERATE_PREDICTOR_ADD(Predictor6, PredictorAdd6)
|
||||
GENERATE_PREDICTOR_ADD(Predictor7, PredictorAdd7)
|
||||
GENERATE_PREDICTOR_ADD(Predictor8, PredictorAdd8)
|
||||
GENERATE_PREDICTOR_ADD(Predictor9, PredictorAdd9)
|
||||
GENERATE_PREDICTOR_ADD(Predictor10, PredictorAdd10)
|
||||
GENERATE_PREDICTOR_ADD(Predictor11, PredictorAdd11)
|
||||
GENERATE_PREDICTOR_ADD(Predictor12, PredictorAdd12)
|
||||
GENERATE_PREDICTOR_ADD(Predictor13, PredictorAdd13)
|
||||
GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// Inverse prediction.
|
||||
static void PredictorInverseTransform(const VP8LTransform* const transform,
|
||||
int y_start, int y_end,
|
||||
const uint32_t* in, uint32_t* out) {
|
||||
static void PredictorInverseTransform_C(const VP8LTransform* const transform,
|
||||
int y_start, int y_end,
|
||||
const uint32_t* in, uint32_t* out) {
|
||||
const int width = transform->xsize_;
|
||||
if (y_start == 0) { // First Row follows the L (mode=1) mode.
|
||||
PredictorAdd0(in, NULL, 1, out);
|
||||
PredictorAdd1(in + 1, NULL, width - 1, out + 1);
|
||||
PredictorAdd0_C(in, NULL, 1, out);
|
||||
PredictorAdd1_C(in + 1, NULL, width - 1, out + 1);
|
||||
in += width;
|
||||
out += width;
|
||||
++y_start;
|
||||
|
@ -217,7 +219,7 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
|
|||
const uint32_t* pred_mode_src = pred_mode_base;
|
||||
int x = 1;
|
||||
// First pixel follows the T (mode=2) mode.
|
||||
PredictorAdd2(in, out - width, 1, out);
|
||||
PredictorAdd2_C(in, out - width, 1, out);
|
||||
// .. the rest:
|
||||
while (x < width) {
|
||||
const VP8LPredictorAddSubFunc pred_func =
|
||||
|
@ -272,8 +274,8 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
|
|||
const uint32_t argb = src[i];
|
||||
const uint32_t green = argb >> 8;
|
||||
const uint32_t red = argb >> 16;
|
||||
int new_red = red;
|
||||
int new_blue = argb;
|
||||
int new_red = red & 0xff;
|
||||
int new_blue = argb & 0xff;
|
||||
new_red += ColorTransformDelta(m->green_to_red_, green);
|
||||
new_red &= 0xff;
|
||||
new_blue += ColorTransformDelta(m->green_to_blue_, green);
|
||||
|
@ -284,9 +286,9 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
|
|||
}
|
||||
|
||||
// Color space inverse transform.
|
||||
static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
|
||||
int y_start, int y_end,
|
||||
const uint32_t* src, uint32_t* dst) {
|
||||
static void ColorSpaceInverseTransform_C(const VP8LTransform* const transform,
|
||||
int y_start, int y_end,
|
||||
const uint32_t* src, uint32_t* dst) {
|
||||
const int width = transform->xsize_;
|
||||
const int tile_width = 1 << transform->bits_;
|
||||
const int mask = tile_width - 1;
|
||||
|
@ -362,10 +364,10 @@ STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform, \
|
|||
} \
|
||||
}
|
||||
|
||||
COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b,
|
||||
VP8GetARGBIndex, VP8GetARGBValue)
|
||||
COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t,
|
||||
8b, VP8GetAlphaIndex, VP8GetAlphaValue)
|
||||
COLOR_INDEX_INVERSE(ColorIndexInverseTransform_C, MapARGB_C, static,
|
||||
uint32_t, 32b, VP8GetARGBIndex, VP8GetARGBValue)
|
||||
COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha_C, ,
|
||||
uint8_t, 8b, VP8GetAlphaIndex, VP8GetAlphaValue)
|
||||
|
||||
#undef COLOR_INDEX_INVERSE
|
||||
|
||||
|
@ -380,7 +382,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
|
|||
VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
|
||||
break;
|
||||
case PREDICTOR_TRANSFORM:
|
||||
PredictorInverseTransform(transform, row_start, row_end, in, out);
|
||||
PredictorInverseTransform_C(transform, row_start, row_end, in, out);
|
||||
if (row_end != transform->ysize_) {
|
||||
// The last predicted row in this iteration will be the top-pred row
|
||||
// for the first row in next iteration.
|
||||
|
@ -389,7 +391,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
|
|||
}
|
||||
break;
|
||||
case CROSS_COLOR_TRANSFORM:
|
||||
ColorSpaceInverseTransform(transform, row_start, row_end, in, out);
|
||||
ColorSpaceInverseTransform_C(transform, row_start, row_end, in, out);
|
||||
break;
|
||||
case COLOR_INDEXING_TRANSFORM:
|
||||
if (in == out && transform->bits_ > 0) {
|
||||
|
@ -403,9 +405,9 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
|
|||
VP8LSubSampleSize(transform->xsize_, transform->bits_);
|
||||
uint32_t* const src = out + out_stride - in_stride;
|
||||
memmove(src, out, in_stride * sizeof(*src));
|
||||
ColorIndexInverseTransform(transform, row_start, row_end, src, out);
|
||||
ColorIndexInverseTransform_C(transform, row_start, row_end, src, out);
|
||||
} else {
|
||||
ColorIndexInverseTransform(transform, row_start, row_end, in, out);
|
||||
ColorIndexInverseTransform_C(transform, row_start, row_end, in, out);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -452,7 +454,7 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
|
|||
const uint32_t argb = *src++;
|
||||
const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
|
||||
const uint8_t ba = ((argb >> 0) & 0xf0) | ((argb >> 28) & 0xf);
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
*dst++ = ba;
|
||||
*dst++ = rg;
|
||||
#else
|
||||
|
@ -469,7 +471,7 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
|
|||
const uint32_t argb = *src++;
|
||||
const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
|
||||
const uint8_t gb = ((argb >> 5) & 0xe0) | ((argb >> 3) & 0x1f);
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
*dst++ = gb;
|
||||
*dst++ = rg;
|
||||
#else
|
||||
|
@ -496,22 +498,7 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
|
|||
const uint32_t* const src_end = src + num_pixels;
|
||||
while (src < src_end) {
|
||||
const uint32_t argb = *src++;
|
||||
|
||||
#if !defined(WORDS_BIGENDIAN)
|
||||
#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
|
||||
WebPUint32ToMem(dst, BSwap32(argb));
|
||||
#else // WEBP_REFERENCE_IMPLEMENTATION
|
||||
dst[0] = (argb >> 24) & 0xff;
|
||||
dst[1] = (argb >> 16) & 0xff;
|
||||
dst[2] = (argb >> 8) & 0xff;
|
||||
dst[3] = (argb >> 0) & 0xff;
|
||||
#endif
|
||||
#else // WORDS_BIGENDIAN
|
||||
dst[0] = (argb >> 0) & 0xff;
|
||||
dst[1] = (argb >> 8) & 0xff;
|
||||
dst[2] = (argb >> 16) & 0xff;
|
||||
dst[3] = (argb >> 24) & 0xff;
|
||||
#endif
|
||||
dst += sizeof(argb);
|
||||
}
|
||||
} else {
|
||||
|
@ -593,23 +580,23 @@ extern void VP8LDspInitMSA(void);
|
|||
static volatile VP8CPUInfo lossless_last_cpuinfo_used =
|
||||
(VP8CPUInfo)&lossless_last_cpuinfo_used;
|
||||
|
||||
#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
|
||||
(OUT)[0] = IN##0; \
|
||||
(OUT)[1] = IN##1; \
|
||||
(OUT)[2] = IN##2; \
|
||||
(OUT)[3] = IN##3; \
|
||||
(OUT)[4] = IN##4; \
|
||||
(OUT)[5] = IN##5; \
|
||||
(OUT)[6] = IN##6; \
|
||||
(OUT)[7] = IN##7; \
|
||||
(OUT)[8] = IN##8; \
|
||||
(OUT)[9] = IN##9; \
|
||||
(OUT)[10] = IN##10; \
|
||||
(OUT)[11] = IN##11; \
|
||||
(OUT)[12] = IN##12; \
|
||||
(OUT)[13] = IN##13; \
|
||||
(OUT)[14] = IN##0; /* <- padding security sentinels*/ \
|
||||
(OUT)[15] = IN##0; \
|
||||
#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
|
||||
(OUT)[0] = IN##0_C; \
|
||||
(OUT)[1] = IN##1_C; \
|
||||
(OUT)[2] = IN##2_C; \
|
||||
(OUT)[3] = IN##3_C; \
|
||||
(OUT)[4] = IN##4_C; \
|
||||
(OUT)[5] = IN##5_C; \
|
||||
(OUT)[6] = IN##6_C; \
|
||||
(OUT)[7] = IN##7_C; \
|
||||
(OUT)[8] = IN##8_C; \
|
||||
(OUT)[9] = IN##9_C; \
|
||||
(OUT)[10] = IN##10_C; \
|
||||
(OUT)[11] = IN##11_C; \
|
||||
(OUT)[12] = IN##12_C; \
|
||||
(OUT)[13] = IN##13_C; \
|
||||
(OUT)[14] = IN##0_C; /* <- padding security sentinels*/ \
|
||||
(OUT)[15] = IN##0_C; \
|
||||
} while (0);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
|
||||
|
@ -620,18 +607,21 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
|
|||
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
|
||||
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
|
||||
|
||||
VP8LTransformColorInverse = VP8LTransformColorInverse_C;
|
||||
|
||||
VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
|
||||
VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
|
||||
VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
|
||||
VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
|
||||
#endif
|
||||
|
||||
VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
|
||||
VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
|
||||
VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
|
||||
|
||||
VP8LMapColor32b = MapARGB;
|
||||
VP8LMapColor8b = MapAlpha;
|
||||
VP8LMapColor32b = MapARGB_C;
|
||||
VP8LMapColor8b = MapAlpha_C;
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
|
@ -640,11 +630,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
|
|||
VP8LDspInitSSE2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (VP8GetCPUInfo(kNEON)) {
|
||||
VP8LDspInitNEON();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
if (VP8GetCPUInfo(kMIPSdspR2)) {
|
||||
VP8LDspInitMIPSdspR2();
|
||||
|
@ -656,6 +641,24 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
|
|||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
VP8LDspInitNEON();
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(VP8LAddGreenToBlueAndRed != NULL);
|
||||
assert(VP8LTransformColorInverse != NULL);
|
||||
assert(VP8LConvertBGRAToRGBA != NULL);
|
||||
assert(VP8LConvertBGRAToRGB != NULL);
|
||||
assert(VP8LConvertBGRAToBGR != NULL);
|
||||
assert(VP8LConvertBGRAToRGBA4444 != NULL);
|
||||
assert(VP8LConvertBGRAToRGB565 != NULL);
|
||||
assert(VP8LMapColor32b != NULL);
|
||||
assert(VP8LMapColor8b != NULL);
|
||||
|
||||
lossless_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
}
|
||||
#undef COPY_PREDICTOR_ARRAY
|
|
@ -15,18 +15,18 @@
|
|||
#ifndef WEBP_DSP_LOSSLESS_H_
|
||||
#define WEBP_DSP_LOSSLESS_H_
|
||||
|
||||
#include "../webp/types.h"
|
||||
#include "../webp/decode.h"
|
||||
#include "src/webp/types.h"
|
||||
#include "src/webp/decode.h"
|
||||
|
||||
#include "../enc/histogram_enc.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/enc/histogram_enc.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef WEBP_EXPERIMENTAL_FEATURES
|
||||
#include "../enc/delta_palettization_enc.h"
|
||||
#include "src/enc/delta_palettization_enc.h"
|
||||
#endif // WEBP_EXPERIMENTAL_FEATURES
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
@ -124,7 +124,7 @@ void VP8LDspInit(void);
|
|||
typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
|
||||
extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
|
||||
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
|
||||
uint32_t* const dst, int num_pixels);
|
||||
uint32_t* dst, int num_pixels);
|
||||
extern VP8LTransformColorFunc VP8LTransformColor;
|
||||
typedef void (*VP8LCollectColorBlueTransformsFunc)(
|
||||
const uint32_t* argb, int stride,
|
|
@ -16,9 +16,9 @@
|
|||
#ifndef WEBP_DSP_LOSSLESS_COMMON_H_
|
||||
#define WEBP_DSP_LOSSLESS_COMMON_H_
|
||||
|
||||
#include "../webp/types.h"
|
||||
#include "src/webp/types.h"
|
||||
|
||||
#include "../utils/utils.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -93,14 +93,6 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
|
|||
// -----------------------------------------------------------------------------
|
||||
// PrefixEncode()
|
||||
|
||||
static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
|
||||
const int log_floor = BitsLog2Floor(n);
|
||||
if (n == (n & ~(n - 1))) { // zero or a power of two.
|
||||
return log_floor;
|
||||
}
|
||||
return log_floor + 1;
|
||||
}
|
||||
|
||||
// Splitting of distance and length codes into prefixes and
|
||||
// extra bits. The prefixes are encoded with an entropy code
|
||||
// while the extra bits are stored just as normal bits.
|
|
@ -13,15 +13,16 @@
|
|||
// Jyrki Alakuijala (jyrki@google.com)
|
||||
// Urvang Joshi (urvang@google.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include "../dec/vp8li_dec.h"
|
||||
#include "../utils/endian_inl_utils.h"
|
||||
#include "./lossless.h"
|
||||
#include "./lossless_common.h"
|
||||
#include "./yuv.h"
|
||||
#include "src/dec/vp8li_dec.h"
|
||||
#include "src/utils/endian_inl_utils.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/lossless_common.h"
|
||||
#include "src/dsp/yuv.h"
|
||||
|
||||
// lookup table for small values of log2(int)
|
||||
const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
|
||||
|
@ -325,7 +326,7 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
|
|||
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
|
||||
};
|
||||
|
||||
static float FastSLog2Slow(uint32_t v) {
|
||||
static float FastSLog2Slow_C(uint32_t v) {
|
||||
assert(v >= LOG_LOOKUP_IDX_MAX);
|
||||
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
|
||||
int log_cnt = 0;
|
||||
|
@ -351,7 +352,7 @@ static float FastSLog2Slow(uint32_t v) {
|
|||
}
|
||||
}
|
||||
|
||||
static float FastLog2Slow(uint32_t v) {
|
||||
static float FastLog2Slow_C(uint32_t v) {
|
||||
assert(v >= LOG_LOOKUP_IDX_MAX);
|
||||
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
|
||||
int log_cnt = 0;
|
||||
|
@ -380,7 +381,7 @@ static float FastLog2Slow(uint32_t v) {
|
|||
// Methods to calculate Entropy (Shannon).
|
||||
|
||||
// Compute the combined Shanon's entropy for distribution {X} and {X+Y}
|
||||
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
|
||||
static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
|
||||
int i;
|
||||
double retval = 0.;
|
||||
int sumX = 0, sumXY = 0;
|
||||
|
@ -453,9 +454,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
|
|||
*i_prev = i;
|
||||
}
|
||||
|
||||
static void GetEntropyUnrefined(const uint32_t X[], int length,
|
||||
VP8LBitEntropy* const bit_entropy,
|
||||
VP8LStreaks* const stats) {
|
||||
static void GetEntropyUnrefined_C(const uint32_t X[], int length,
|
||||
VP8LBitEntropy* const bit_entropy,
|
||||
VP8LStreaks* const stats) {
|
||||
int i;
|
||||
int i_prev = 0;
|
||||
uint32_t x_prev = X[0];
|
||||
|
@ -474,10 +475,11 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
|
|||
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
|
||||
}
|
||||
|
||||
static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
|
||||
int length,
|
||||
VP8LBitEntropy* const bit_entropy,
|
||||
VP8LStreaks* const stats) {
|
||||
static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
|
||||
const uint32_t Y[],
|
||||
int length,
|
||||
VP8LBitEntropy* const bit_entropy,
|
||||
VP8LStreaks* const stats) {
|
||||
int i = 1;
|
||||
int i_prev = 0;
|
||||
uint32_t xy_prev = X[0] + Y[0];
|
||||
|
@ -520,8 +522,8 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
|
|||
const uint32_t argb = data[i];
|
||||
const uint32_t green = argb >> 8;
|
||||
const uint32_t red = argb >> 16;
|
||||
int new_red = red;
|
||||
int new_blue = argb;
|
||||
int new_red = red & 0xff;
|
||||
int new_blue = argb & 0xff;
|
||||
new_red -= ColorTransformDelta(m->green_to_red_, green);
|
||||
new_red &= 0xff;
|
||||
new_blue -= ColorTransformDelta(m->green_to_blue_, green);
|
||||
|
@ -577,8 +579,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static int VectorMismatch(const uint32_t* const array1,
|
||||
const uint32_t* const array2, int length) {
|
||||
static int VectorMismatch_C(const uint32_t* const array1,
|
||||
const uint32_t* const array2, int length) {
|
||||
int match_len = 0;
|
||||
|
||||
while (match_len < length && array1[match_len] == array2[match_len]) {
|
||||
|
@ -610,15 +612,15 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static double ExtraCost(const uint32_t* population, int length) {
|
||||
static double ExtraCost_C(const uint32_t* population, int length) {
|
||||
int i;
|
||||
double cost = 0.;
|
||||
for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
|
||||
return cost;
|
||||
}
|
||||
|
||||
static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
|
||||
int length) {
|
||||
static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
|
||||
int length) {
|
||||
int i;
|
||||
double cost = 0.;
|
||||
for (i = 2; i < length - 2; ++i) {
|
||||
|
@ -630,9 +632,9 @@ static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void HistogramAdd(const VP8LHistogram* const a,
|
||||
const VP8LHistogram* const b,
|
||||
VP8LHistogram* const out) {
|
||||
static void HistogramAdd_C(const VP8LHistogram* const a,
|
||||
const VP8LHistogram* const b,
|
||||
VP8LHistogram* const out) {
|
||||
int i;
|
||||
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
|
||||
assert(a->palette_code_bits_ == b->palette_code_bits_);
|
||||
|
@ -869,26 +871,28 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
|
|||
|
||||
VP8LDspInit();
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
|
||||
|
||||
VP8LTransformColor = VP8LTransformColor_C;
|
||||
#endif
|
||||
|
||||
VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
|
||||
VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
|
||||
|
||||
VP8LFastLog2Slow = FastLog2Slow;
|
||||
VP8LFastSLog2Slow = FastSLog2Slow;
|
||||
VP8LFastLog2Slow = FastLog2Slow_C;
|
||||
VP8LFastSLog2Slow = FastSLog2Slow_C;
|
||||
|
||||
VP8LExtraCost = ExtraCost;
|
||||
VP8LExtraCostCombined = ExtraCostCombined;
|
||||
VP8LCombinedShannonEntropy = CombinedShannonEntropy;
|
||||
VP8LExtraCost = ExtraCost_C;
|
||||
VP8LExtraCostCombined = ExtraCostCombined_C;
|
||||
VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
|
||||
|
||||
VP8LGetEntropyUnrefined = GetEntropyUnrefined;
|
||||
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
|
||||
VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
|
||||
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
|
||||
|
||||
VP8LHistogramAdd = HistogramAdd;
|
||||
VP8LHistogramAdd = HistogramAdd_C;
|
||||
|
||||
VP8LVectorMismatch = VectorMismatch;
|
||||
VP8LVectorMismatch = VectorMismatch_C;
|
||||
VP8LBundleColorMap = VP8LBundleColorMap_C;
|
||||
|
||||
VP8LPredictorsSub[0] = PredictorSub0_C;
|
||||
|
@ -937,11 +941,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
|
|||
#endif
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (VP8GetCPUInfo(kNEON)) {
|
||||
VP8LEncDspInitNEON();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MIPS32)
|
||||
if (VP8GetCPUInfo(kMIPS32)) {
|
||||
VP8LEncDspInitMIPS32();
|
||||
|
@ -958,6 +957,61 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
|
|||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
VP8LEncDspInitNEON();
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(VP8LSubtractGreenFromBlueAndRed != NULL);
|
||||
assert(VP8LTransformColor != NULL);
|
||||
assert(VP8LCollectColorBlueTransforms != NULL);
|
||||
assert(VP8LCollectColorRedTransforms != NULL);
|
||||
assert(VP8LFastLog2Slow != NULL);
|
||||
assert(VP8LFastSLog2Slow != NULL);
|
||||
assert(VP8LExtraCost != NULL);
|
||||
assert(VP8LExtraCostCombined != NULL);
|
||||
assert(VP8LCombinedShannonEntropy != NULL);
|
||||
assert(VP8LGetEntropyUnrefined != NULL);
|
||||
assert(VP8LGetCombinedEntropyUnrefined != NULL);
|
||||
assert(VP8LHistogramAdd != NULL);
|
||||
assert(VP8LVectorMismatch != NULL);
|
||||
assert(VP8LBundleColorMap != NULL);
|
||||
assert(VP8LPredictorsSub[0] != NULL);
|
||||
assert(VP8LPredictorsSub[1] != NULL);
|
||||
assert(VP8LPredictorsSub[2] != NULL);
|
||||
assert(VP8LPredictorsSub[3] != NULL);
|
||||
assert(VP8LPredictorsSub[4] != NULL);
|
||||
assert(VP8LPredictorsSub[5] != NULL);
|
||||
assert(VP8LPredictorsSub[6] != NULL);
|
||||
assert(VP8LPredictorsSub[7] != NULL);
|
||||
assert(VP8LPredictorsSub[8] != NULL);
|
||||
assert(VP8LPredictorsSub[9] != NULL);
|
||||
assert(VP8LPredictorsSub[10] != NULL);
|
||||
assert(VP8LPredictorsSub[11] != NULL);
|
||||
assert(VP8LPredictorsSub[12] != NULL);
|
||||
assert(VP8LPredictorsSub[13] != NULL);
|
||||
assert(VP8LPredictorsSub[14] != NULL);
|
||||
assert(VP8LPredictorsSub[15] != NULL);
|
||||
assert(VP8LPredictorsSub_C[0] != NULL);
|
||||
assert(VP8LPredictorsSub_C[1] != NULL);
|
||||
assert(VP8LPredictorsSub_C[2] != NULL);
|
||||
assert(VP8LPredictorsSub_C[3] != NULL);
|
||||
assert(VP8LPredictorsSub_C[4] != NULL);
|
||||
assert(VP8LPredictorsSub_C[5] != NULL);
|
||||
assert(VP8LPredictorsSub_C[6] != NULL);
|
||||
assert(VP8LPredictorsSub_C[7] != NULL);
|
||||
assert(VP8LPredictorsSub_C[8] != NULL);
|
||||
assert(VP8LPredictorsSub_C[9] != NULL);
|
||||
assert(VP8LPredictorsSub_C[10] != NULL);
|
||||
assert(VP8LPredictorsSub_C[11] != NULL);
|
||||
assert(VP8LPredictorsSub_C[12] != NULL);
|
||||
assert(VP8LPredictorsSub_C[13] != NULL);
|
||||
assert(VP8LPredictorsSub_C[14] != NULL);
|
||||
assert(VP8LPredictorsSub_C[15] != NULL);
|
||||
|
||||
lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
}
|
||||
|
|
@ -12,9 +12,9 @@
|
|||
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "./lossless.h"
|
||||
#include "./lossless_common.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/lossless_common.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS32)
|
||||
|
||||
|
@ -23,7 +23,7 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
static float FastSLog2Slow(uint32_t v) {
|
||||
static float FastSLog2Slow_MIPS32(uint32_t v) {
|
||||
assert(v >= LOG_LOOKUP_IDX_MAX);
|
||||
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
|
||||
uint32_t log_cnt, y, correction;
|
||||
|
@ -59,7 +59,7 @@ static float FastSLog2Slow(uint32_t v) {
|
|||
}
|
||||
}
|
||||
|
||||
static float FastLog2Slow(uint32_t v) {
|
||||
static float FastLog2Slow_MIPS32(uint32_t v) {
|
||||
assert(v >= LOG_LOOKUP_IDX_MAX);
|
||||
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
|
||||
uint32_t log_cnt, y;
|
||||
|
@ -104,7 +104,7 @@ static float FastLog2Slow(uint32_t v) {
|
|||
// pop += 2;
|
||||
// }
|
||||
// return (double)cost;
|
||||
static double ExtraCost(const uint32_t* const population, int length) {
|
||||
static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
|
||||
int i, temp0, temp1;
|
||||
const uint32_t* pop = &population[4];
|
||||
const uint32_t* const LoopEnd = &population[length];
|
||||
|
@ -149,8 +149,8 @@ static double ExtraCost(const uint32_t* const population, int length) {
|
|||
// pY += 2;
|
||||
// }
|
||||
// return (double)cost;
|
||||
static double ExtraCostCombined(const uint32_t* const X,
|
||||
const uint32_t* const Y, int length) {
|
||||
static double ExtraCostCombined_MIPS32(const uint32_t* const X,
|
||||
const uint32_t* const Y, int length) {
|
||||
int i, temp0, temp1, temp2, temp3;
|
||||
const uint32_t* pX = &X[4];
|
||||
const uint32_t* pY = &Y[4];
|
||||
|
@ -241,9 +241,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
|
|||
*i_prev = i;
|
||||
}
|
||||
|
||||
static void GetEntropyUnrefined(const uint32_t X[], int length,
|
||||
VP8LBitEntropy* const bit_entropy,
|
||||
VP8LStreaks* const stats) {
|
||||
static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
|
||||
VP8LBitEntropy* const bit_entropy,
|
||||
VP8LStreaks* const stats) {
|
||||
int i;
|
||||
int i_prev = 0;
|
||||
uint32_t x_prev = X[0];
|
||||
|
@ -262,26 +262,27 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
|
|||
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
|
||||
}
|
||||
|
||||
static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
|
||||
int length,
|
||||
VP8LBitEntropy* const bit_entropy,
|
||||
VP8LStreaks* const stats) {
|
||||
static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
|
||||
const uint32_t Y[],
|
||||
int length,
|
||||
VP8LBitEntropy* const entropy,
|
||||
VP8LStreaks* const stats) {
|
||||
int i = 1;
|
||||
int i_prev = 0;
|
||||
uint32_t xy_prev = X[0] + Y[0];
|
||||
|
||||
memset(stats, 0, sizeof(*stats));
|
||||
VP8LBitEntropyInit(bit_entropy);
|
||||
VP8LBitEntropyInit(entropy);
|
||||
|
||||
for (i = 1; i < length; ++i) {
|
||||
const uint32_t xy = X[i] + Y[i];
|
||||
if (xy != xy_prev) {
|
||||
GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
|
||||
GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
|
||||
}
|
||||
}
|
||||
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
|
||||
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
|
||||
|
||||
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
|
||||
entropy->entropy += VP8LFastSLog2(entropy->sum);
|
||||
}
|
||||
|
||||
#define ASM_START \
|
||||
|
@ -374,9 +375,9 @@ static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
static void HistogramAdd(const VP8LHistogram* const a,
|
||||
const VP8LHistogram* const b,
|
||||
VP8LHistogram* const out) {
|
||||
static void HistogramAdd_MIPS32(const VP8LHistogram* const a,
|
||||
const VP8LHistogram* const b,
|
||||
VP8LHistogram* const out) {
|
||||
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
|
||||
- (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
|
||||
|
@ -415,13 +416,13 @@ static void HistogramAdd(const VP8LHistogram* const a,
|
|||
extern void VP8LEncDspInitMIPS32(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
|
||||
VP8LFastSLog2Slow = FastSLog2Slow;
|
||||
VP8LFastLog2Slow = FastLog2Slow;
|
||||
VP8LExtraCost = ExtraCost;
|
||||
VP8LExtraCostCombined = ExtraCostCombined;
|
||||
VP8LGetEntropyUnrefined = GetEntropyUnrefined;
|
||||
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
|
||||
VP8LHistogramAdd = HistogramAdd;
|
||||
VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
|
||||
VP8LFastLog2Slow = FastLog2Slow_MIPS32;
|
||||
VP8LExtraCost = ExtraCost_MIPS32;
|
||||
VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
|
||||
VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
|
||||
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
|
||||
VP8LHistogramAdd = HistogramAdd_MIPS32;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS32
|
|
@ -12,14 +12,14 @@
|
|||
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
|
||||
#include "./lossless.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
|
||||
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data,
|
||||
int num_pixels) {
|
||||
static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data,
|
||||
int num_pixels) {
|
||||
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
|
||||
uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
|
||||
|
@ -78,8 +78,8 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
|
|||
return (uint32_t)((int)(color_pred) * color) >> 5;
|
||||
}
|
||||
|
||||
static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
|
||||
int num_pixels) {
|
||||
static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
|
||||
uint32_t* data, int num_pixels) {
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5;
|
||||
uint32_t argb, argb1, new_red, new_red1;
|
||||
const uint32_t G_to_R = m->green_to_red_;
|
||||
|
@ -171,10 +171,13 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
|
|||
return (new_blue & 0xff);
|
||||
}
|
||||
|
||||
static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
|
||||
int tile_width, int tile_height,
|
||||
int green_to_blue, int red_to_blue,
|
||||
int histo[]) {
|
||||
static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
|
||||
int stride,
|
||||
int tile_width,
|
||||
int tile_height,
|
||||
int green_to_blue,
|
||||
int red_to_blue,
|
||||
int histo[]) {
|
||||
const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
|
||||
const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
|
||||
const uint32_t mask = 0xff00ffu;
|
||||
|
@ -222,9 +225,12 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
|
|||
return (new_red & 0xff);
|
||||
}
|
||||
|
||||
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
|
||||
int tile_width, int tile_height,
|
||||
int green_to_red, int histo[]) {
|
||||
static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
|
||||
int stride,
|
||||
int tile_width,
|
||||
int tile_height,
|
||||
int green_to_red,
|
||||
int histo[]) {
|
||||
const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
|
||||
while (tile_height-- > 0) {
|
||||
int x;
|
||||
|
@ -262,10 +268,10 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
|
|||
extern void VP8LEncDspInitMIPSdspR2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
|
||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
|
||||
VP8LTransformColor = TransformColor;
|
||||
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
|
||||
VP8LCollectColorRedTransforms = CollectColorRedTransforms;
|
||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2;
|
||||
VP8LTransformColor = TransformColor_MIPSdspR2;
|
||||
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2;
|
||||
VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS_DSP_R2
|
|
@ -11,12 +11,12 @@
|
|||
//
|
||||
// Authors: Prashant Patil (Prashant.Patil@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MSA)
|
||||
|
||||
#include "./lossless.h"
|
||||
#include "./msa_macro.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/msa_macro.h"
|
||||
|
||||
#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do { \
|
||||
v8i16 g0, g1, t0, t1, t2, t3; \
|
||||
|
@ -48,8 +48,8 @@
|
|||
dst = VSHF_UB(src, t0, mask1); \
|
||||
} while (0)
|
||||
|
||||
static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
|
||||
int num_pixels) {
|
||||
static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
|
||||
int num_pixels) {
|
||||
v16u8 src0, dst0;
|
||||
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
|
||||
(m->green_to_red_ << 16));
|
||||
|
@ -94,7 +94,8 @@ static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
|
|||
}
|
||||
}
|
||||
|
||||
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
||||
static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
|
||||
int num_pixels) {
|
||||
int i;
|
||||
uint8_t* ptemp_data = (uint8_t*)argb_data;
|
||||
v16u8 src0, dst0, tmp0;
|
||||
|
@ -136,8 +137,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
|||
extern void VP8LEncDspInitMSA(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
|
||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
|
||||
VP8LTransformColor = TransformColor;
|
||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
|
||||
VP8LTransformColor = TransformColor_MSA;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MSA
|
|
@ -11,14 +11,14 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "./lossless.h"
|
||||
#include "./neon.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/neon.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Subtract-Green Transform
|
||||
|
@ -36,8 +36,8 @@ static const uint8_t kGreenShuffle[16] = {
|
|||
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
|
||||
};
|
||||
|
||||
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
|
||||
const uint8x16_t shuffle) {
|
||||
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
|
||||
const uint8x16_t shuffle) {
|
||||
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
|
||||
vtbl1q_u8(argb, vget_high_u8(shuffle)));
|
||||
}
|
||||
|
@ -45,14 +45,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
|
|||
// 255 = byte will be zeroed
|
||||
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
|
||||
|
||||
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
|
||||
const uint8x8_t shuffle) {
|
||||
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
|
||||
const uint8x8_t shuffle) {
|
||||
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
|
||||
vtbl1_u8(vget_high_u8(argb), shuffle));
|
||||
}
|
||||
#endif // USE_VTBLQ
|
||||
|
||||
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
||||
static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
|
||||
int num_pixels) {
|
||||
const uint32_t* const end = argb_data + (num_pixels & ~3);
|
||||
#ifdef USE_VTBLQ
|
||||
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
|
||||
|
@ -61,7 +62,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
|||
#endif
|
||||
for (; argb_data < end; argb_data += 4) {
|
||||
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
|
||||
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
|
||||
const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
|
||||
vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
|
||||
}
|
||||
// fallthrough and finish off with plain-C
|
||||
|
@ -71,8 +72,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Color Transform
|
||||
|
||||
static void TransformColor(const VP8LMultipliers* const m,
|
||||
uint32_t* argb_data, int num_pixels) {
|
||||
static void TransformColor_NEON(const VP8LMultipliers* const m,
|
||||
uint32_t* argb_data, int num_pixels) {
|
||||
// sign-extended multiplying constants, pre-shifted by 6.
|
||||
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
|
||||
const int16_t rb[8] = {
|
||||
|
@ -102,7 +103,7 @@ static void TransformColor(const VP8LMultipliers* const m,
|
|||
for (i = 0; i + 4 <= num_pixels; i += 4) {
|
||||
const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
|
||||
// 0 g 0 g
|
||||
const uint8x16_t greens = DoGreenShuffle(in, shuffle);
|
||||
const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
|
||||
// x dr x db1
|
||||
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
|
||||
// r 0 b 0
|
||||
|
@ -132,8 +133,8 @@ static void TransformColor(const VP8LMultipliers* const m,
|
|||
extern void VP8LEncDspInitNEON(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
|
||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
|
||||
VP8LTransformColor = TransformColor;
|
||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON;
|
||||
VP8LTransformColor = TransformColor_NEON;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_NEON
|
|
@ -11,22 +11,23 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#include <assert.h>
|
||||
#include <emmintrin.h>
|
||||
#include "./lossless.h"
|
||||
#include "./common_sse2.h"
|
||||
#include "./lossless_common.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/common_sse2.h"
|
||||
#include "src/dsp/lossless_common.h"
|
||||
|
||||
// For sign-extended multiplying constants, pre-shifted by 5:
|
||||
#define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5)
|
||||
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Subtract-Green Transform
|
||||
|
||||
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
||||
static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
|
||||
int num_pixels) {
|
||||
int i;
|
||||
for (i = 0; i + 4 <= num_pixels; i += 4) {
|
||||
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
|
||||
|
@ -45,8 +46,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Color Transform
|
||||
|
||||
static void TransformColor(const VP8LMultipliers* const m,
|
||||
uint32_t* argb_data, int num_pixels) {
|
||||
static void TransformColor_SSE2(const VP8LMultipliers* const m,
|
||||
uint32_t* argb_data, int num_pixels) {
|
||||
const __m128i mults_rb = _mm_set_epi16(
|
||||
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
|
||||
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
|
||||
|
@ -80,10 +81,10 @@ static void TransformColor(const VP8LMultipliers* const m,
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
#define SPAN 8
|
||||
static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
|
||||
int tile_width, int tile_height,
|
||||
int green_to_blue, int red_to_blue,
|
||||
int histo[]) {
|
||||
static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
|
||||
int tile_width, int tile_height,
|
||||
int green_to_blue, int red_to_blue,
|
||||
int histo[]) {
|
||||
const __m128i mults_r = _mm_set_epi16(
|
||||
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
|
||||
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
|
||||
|
@ -131,9 +132,9 @@ static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
|
|||
}
|
||||
}
|
||||
|
||||
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
|
||||
int tile_width, int tile_height,
|
||||
int green_to_red, int histo[]) {
|
||||
static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
|
||||
int tile_width, int tile_height,
|
||||
int green_to_red, int histo[]) {
|
||||
const __m128i mults_g = _mm_set_epi16(
|
||||
0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
|
||||
0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
|
||||
|
@ -177,8 +178,8 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
|
|||
//------------------------------------------------------------------------------
|
||||
|
||||
#define LINE_SIZE 16 // 8 or 16
|
||||
static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
|
||||
int size) {
|
||||
static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
|
||||
int size) {
|
||||
int i;
|
||||
assert(size % LINE_SIZE == 0);
|
||||
for (i = 0; i < size; i += LINE_SIZE) {
|
||||
|
@ -203,7 +204,7 @@ static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
|
|||
}
|
||||
}
|
||||
|
||||
static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
|
||||
static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
|
||||
int i;
|
||||
assert(size % LINE_SIZE == 0);
|
||||
for (i = 0; i < size; i += LINE_SIZE) {
|
||||
|
@ -231,22 +232,22 @@ static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
|
|||
|
||||
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
|
||||
// that's ok since the histogram values are less than 1<<28 (max picture size).
|
||||
static void HistogramAdd(const VP8LHistogram* const a,
|
||||
const VP8LHistogram* const b,
|
||||
VP8LHistogram* const out) {
|
||||
static void HistogramAdd_SSE2(const VP8LHistogram* const a,
|
||||
const VP8LHistogram* const b,
|
||||
VP8LHistogram* const out) {
|
||||
int i;
|
||||
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
|
||||
assert(a->palette_code_bits_ == b->palette_code_bits_);
|
||||
if (b != out) {
|
||||
AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
|
||||
AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
|
||||
AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
|
||||
AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
|
||||
AddVector_SSE2(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
|
||||
AddVector_SSE2(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
|
||||
AddVector_SSE2(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
|
||||
AddVector_SSE2(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
|
||||
} else {
|
||||
AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
|
||||
AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
|
||||
AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
|
||||
AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
|
||||
AddVectorEq_SSE2(a->literal_, out->literal_, NUM_LITERAL_CODES);
|
||||
AddVectorEq_SSE2(a->red_, out->red_, NUM_LITERAL_CODES);
|
||||
AddVectorEq_SSE2(a->blue_, out->blue_, NUM_LITERAL_CODES);
|
||||
AddVectorEq_SSE2(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
|
||||
}
|
||||
for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
|
||||
out->literal_[i] = a->literal_[i] + b->literal_[i];
|
||||
|
@ -261,9 +262,9 @@ static void HistogramAdd(const VP8LHistogram* const a,
|
|||
|
||||
// Checks whether the X or Y contribution is worth computing and adding.
|
||||
// Used in loop unrolling.
|
||||
#define ANALYZE_X_OR_Y(x_or_y, j) \
|
||||
do { \
|
||||
if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \
|
||||
#define ANALYZE_X_OR_Y(x_or_y, j) \
|
||||
do { \
|
||||
if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
|
||||
} while (0)
|
||||
|
||||
// Checks whether the X + Y contribution is worth computing and adding.
|
||||
|
@ -276,7 +277,7 @@ static void HistogramAdd(const VP8LHistogram* const a,
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
|
||||
static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
|
||||
int i;
|
||||
double retval = 0.;
|
||||
int sumX, sumXY;
|
||||
|
@ -332,8 +333,8 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static int VectorMismatch(const uint32_t* const array1,
|
||||
const uint32_t* const array2, int length) {
|
||||
static int VectorMismatch_SSE2(const uint32_t* const array1,
|
||||
const uint32_t* const array2, int length) {
|
||||
int match_len;
|
||||
|
||||
if (length >= 12) {
|
||||
|
@ -574,8 +575,8 @@ static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
|
|||
}
|
||||
|
||||
// Predictor11: select.
|
||||
static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
|
||||
__m128i* const out) {
|
||||
static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
|
||||
__m128i* const out) {
|
||||
// We can unpack with any value on the upper 32 bits, provided it's the same
|
||||
// on both operands (to that their sum of abs diff is zero). Here we use *A.
|
||||
const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
|
||||
|
@ -596,8 +597,8 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
|
|||
const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
|
||||
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
|
||||
__m128i pa, pb;
|
||||
GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|
|
||||
GetSumAbsDiff32(&L, &TL, &pb); // pb = sum |L-TL|
|
||||
GetSumAbsDiff32_SSE2(&T, &TL, &pa); // pa = sum |T-TL|
|
||||
GetSumAbsDiff32_SSE2(&L, &TL, &pb); // pb = sum |L-TL|
|
||||
{
|
||||
const __m128i mask = _mm_cmpgt_epi32(pb, pa);
|
||||
const __m128i A = _mm_and_si128(mask, L);
|
||||
|
@ -677,13 +678,13 @@ static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
|
|||
extern void VP8LEncDspInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
|
||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
|
||||
VP8LTransformColor = TransformColor;
|
||||
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
|
||||
VP8LCollectColorRedTransforms = CollectColorRedTransforms;
|
||||
VP8LHistogramAdd = HistogramAdd;
|
||||
VP8LCombinedShannonEntropy = CombinedShannonEntropy;
|
||||
VP8LVectorMismatch = VectorMismatch;
|
||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2;
|
||||
VP8LTransformColor = TransformColor_SSE2;
|
||||
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2;
|
||||
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
|
||||
VP8LHistogramAdd = HistogramAdd_SSE2;
|
||||
VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
|
||||
VP8LVectorMismatch = VectorMismatch_SSE2;
|
||||
VP8LBundleColorMap = BundleColorMap_SSE2;
|
||||
|
||||
VP8LPredictorsSub[0] = PredictorSub0_SSE2;
|
|
@ -11,17 +11,18 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
#include <assert.h>
|
||||
#include <smmintrin.h>
|
||||
#include "./lossless.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Subtract-Green Transform
|
||||
|
||||
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
||||
static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
|
||||
int num_pixels) {
|
||||
int i;
|
||||
const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
|
||||
-1, 5, -1, 5, -1, 1, -1, 1);
|
||||
|
@ -43,7 +44,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
|
|||
extern void VP8LEncDspInitSSE41(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
|
||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
|
||||
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE41
|
|
@ -12,12 +12,12 @@
|
|||
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
|
||||
#include "./lossless.h"
|
||||
#include "./lossless_common.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/lossless_common.h"
|
||||
|
||||
#define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \
|
||||
static void FUNC_NAME(const TYPE* src, \
|
||||
|
@ -86,8 +86,8 @@ static void FUNC_NAME(const TYPE* src, \
|
|||
} \
|
||||
}
|
||||
|
||||
MAP_COLOR_FUNCS(MapARGB, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
|
||||
MAP_COLOR_FUNCS(MapAlpha, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
|
||||
MAP_COLOR_FUNCS(MapARGB_MIPSdspR2, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
|
||||
MAP_COLOR_FUNCS(MapAlpha_MIPSdspR2, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
|
||||
|
||||
#undef MAP_COLOR_FUNCS
|
||||
|
||||
|
@ -188,48 +188,52 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
|
|||
return Average2(Average2(a0, a1), Average2(a2, a3));
|
||||
}
|
||||
|
||||
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) {
|
||||
return Average3(left, top[0], top[1]);
|
||||
}
|
||||
|
||||
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) {
|
||||
return Average2(left, top[-1]);
|
||||
}
|
||||
|
||||
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) {
|
||||
return Average2(left, top[0]);
|
||||
}
|
||||
|
||||
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) {
|
||||
(void)left;
|
||||
return Average2(top[-1], top[0]);
|
||||
}
|
||||
|
||||
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) {
|
||||
(void)left;
|
||||
return Average2(top[0], top[1]);
|
||||
}
|
||||
|
||||
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor10_MIPSdspR2(uint32_t left,
|
||||
const uint32_t* const top) {
|
||||
return Average4(left, top[-1], top[0], top[1]);
|
||||
}
|
||||
|
||||
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor11_MIPSdspR2(uint32_t left,
|
||||
const uint32_t* const top) {
|
||||
return Select(top[0], left, top[-1]);
|
||||
}
|
||||
|
||||
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor12_MIPSdspR2(uint32_t left,
|
||||
const uint32_t* const top) {
|
||||
return ClampedAddSubtractFull(left, top[0], top[-1]);
|
||||
}
|
||||
|
||||
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
|
||||
static uint32_t Predictor13_MIPSdspR2(uint32_t left,
|
||||
const uint32_t* const top) {
|
||||
return ClampedAddSubtractHalf(left, top[0], top[-1]);
|
||||
}
|
||||
|
||||
// Add green to blue and red channels (i.e. perform the inverse transform of
|
||||
// 'subtract green').
|
||||
static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
static void AddGreenToBlueAndRed_MIPSdspR2(const uint32_t* src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
|
||||
const uint32_t* const p_loop2_end = src + num_pixels;
|
||||
|
@ -285,9 +289,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
|
|||
);
|
||||
}
|
||||
|
||||
static void TransformColorInverse(const VP8LMultipliers* const m,
|
||||
const uint32_t* src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
static void TransformColorInverse_MIPSdspR2(const VP8LMultipliers* const m,
|
||||
const uint32_t* src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5;
|
||||
uint32_t argb, argb1, new_red;
|
||||
const uint32_t G_to_R = m->green_to_red_;
|
||||
|
@ -356,8 +360,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
|
|||
if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
|
||||
}
|
||||
|
||||
static void ConvertBGRAToRGB(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGB_MIPSdspR2(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
int temp0, temp1, temp2, temp3;
|
||||
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
|
||||
const uint32_t* const p_loop2_end = src + num_pixels;
|
||||
|
@ -408,8 +412,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
|
|||
);
|
||||
}
|
||||
|
||||
static void ConvertBGRAToRGBA(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGBA_MIPSdspR2(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
int temp0, temp1, temp2, temp3;
|
||||
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
|
||||
const uint32_t* const p_loop2_end = src + num_pixels;
|
||||
|
@ -458,8 +462,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
|
|||
);
|
||||
}
|
||||
|
||||
static void ConvertBGRAToRGBA4444(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGBA4444_MIPSdspR2(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5;
|
||||
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
|
||||
const uint32_t* const p_loop2_end = src + num_pixels;
|
||||
|
@ -492,7 +496,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
|
|||
"ins %[temp3], %[temp5], 16, 4 \n\t"
|
||||
"addiu %[src], %[src], 16 \n\t"
|
||||
"precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
"usw %[temp1], 0(%[dst]) \n\t"
|
||||
"usw %[temp3], 4(%[dst]) \n\t"
|
||||
#else
|
||||
|
@ -514,7 +518,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
|
|||
"ins %[temp0], %[temp5], 16, 4 \n\t"
|
||||
"addiu %[src], %[src], 4 \n\t"
|
||||
"precr.qb.ph %[temp0], %[temp0], %[temp0] \n\t"
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
"ush %[temp0], 0(%[dst]) \n\t"
|
||||
#else
|
||||
"wsbh %[temp0], %[temp0] \n\t"
|
||||
|
@ -532,8 +536,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
|
|||
);
|
||||
}
|
||||
|
||||
static void ConvertBGRAToRGB565(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGB565_MIPSdspR2(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
int temp0, temp1, temp2, temp3, temp4, temp5;
|
||||
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
|
||||
const uint32_t* const p_loop2_end = src + num_pixels;
|
||||
|
@ -570,7 +574,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
|
|||
"ins %[temp2], %[temp3], 0, 5 \n\t"
|
||||
"addiu %[src], %[src], 16 \n\t"
|
||||
"append %[temp2], %[temp1], 16 \n\t"
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
"usw %[temp0], 0(%[dst]) \n\t"
|
||||
"usw %[temp2], 4(%[dst]) \n\t"
|
||||
#else
|
||||
|
@ -592,7 +596,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
|
|||
"ins %[temp4], %[temp5], 0, 11 \n\t"
|
||||
"addiu %[src], %[src], 4 \n\t"
|
||||
"ins %[temp4], %[temp0], 0, 5 \n\t"
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
"ush %[temp4], 0(%[dst]) \n\t"
|
||||
#else
|
||||
"wsbh %[temp4], %[temp4] \n\t"
|
||||
|
@ -610,8 +614,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
|
|||
);
|
||||
}
|
||||
|
||||
static void ConvertBGRAToBGR(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToBGR_MIPSdspR2(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
int temp0, temp1, temp2, temp3;
|
||||
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
|
||||
const uint32_t* const p_loop2_end = src + num_pixels;
|
||||
|
@ -662,24 +666,27 @@ static void ConvertBGRAToBGR(const uint32_t* src,
|
|||
extern void VP8LDspInitMIPSdspR2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
|
||||
VP8LMapColor32b = MapARGB;
|
||||
VP8LMapColor8b = MapAlpha;
|
||||
VP8LPredictors[5] = Predictor5;
|
||||
VP8LPredictors[6] = Predictor6;
|
||||
VP8LPredictors[7] = Predictor7;
|
||||
VP8LPredictors[8] = Predictor8;
|
||||
VP8LPredictors[9] = Predictor9;
|
||||
VP8LPredictors[10] = Predictor10;
|
||||
VP8LPredictors[11] = Predictor11;
|
||||
VP8LPredictors[12] = Predictor12;
|
||||
VP8LPredictors[13] = Predictor13;
|
||||
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
|
||||
VP8LTransformColorInverse = TransformColorInverse;
|
||||
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
|
||||
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
|
||||
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
|
||||
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
|
||||
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
|
||||
VP8LMapColor32b = MapARGB_MIPSdspR2;
|
||||
VP8LMapColor8b = MapAlpha_MIPSdspR2;
|
||||
|
||||
VP8LPredictors[5] = Predictor5_MIPSdspR2;
|
||||
VP8LPredictors[6] = Predictor6_MIPSdspR2;
|
||||
VP8LPredictors[7] = Predictor7_MIPSdspR2;
|
||||
VP8LPredictors[8] = Predictor8_MIPSdspR2;
|
||||
VP8LPredictors[9] = Predictor9_MIPSdspR2;
|
||||
VP8LPredictors[10] = Predictor10_MIPSdspR2;
|
||||
VP8LPredictors[11] = Predictor11_MIPSdspR2;
|
||||
VP8LPredictors[12] = Predictor12_MIPSdspR2;
|
||||
VP8LPredictors[13] = Predictor13_MIPSdspR2;
|
||||
|
||||
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MIPSdspR2;
|
||||
VP8LTransformColorInverse = TransformColorInverse_MIPSdspR2;
|
||||
|
||||
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MIPSdspR2;
|
||||
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MIPSdspR2;
|
||||
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_MIPSdspR2;
|
||||
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_MIPSdspR2;
|
||||
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MIPSdspR2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS_DSP_R2
|
|
@ -11,12 +11,12 @@
|
|||
//
|
||||
// Author: Prashant Patil (prashant.patil@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MSA)
|
||||
|
||||
#include "./lossless.h"
|
||||
#include "./msa_macro.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/msa_macro.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Colorspace conversion functions
|
||||
|
@ -43,7 +43,7 @@
|
|||
|
||||
#define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do { \
|
||||
uint64_t pix_d; \
|
||||
v16u8 src0, src1, src2, dst0, dst1; \
|
||||
v16u8 src0, src1, src2 = { 0 }, dst0, dst1; \
|
||||
LD_UB2(psrc, 16, src0, src1); \
|
||||
VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
|
||||
ST_UB(dst0, pdst); \
|
||||
|
@ -109,8 +109,8 @@
|
|||
dst = VSHF_UB(src, t0, mask1); \
|
||||
} while (0)
|
||||
|
||||
static void ConvertBGRAToRGBA(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGBA_MSA(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
int i;
|
||||
const uint8_t* ptemp_src = (const uint8_t*)src;
|
||||
uint8_t* ptemp_dst = (uint8_t*)dst;
|
||||
|
@ -150,8 +150,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
|
|||
}
|
||||
}
|
||||
|
||||
static void ConvertBGRAToBGR(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToBGR_MSA(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint8_t* ptemp_src = (const uint8_t*)src;
|
||||
uint8_t* ptemp_dst = (uint8_t*)dst;
|
||||
const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
|
||||
|
@ -197,8 +197,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
|
|||
}
|
||||
}
|
||||
|
||||
static void ConvertBGRAToRGB(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGB_MSA(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint8_t* ptemp_src = (const uint8_t*)src;
|
||||
uint8_t* ptemp_dst = (uint8_t*)dst;
|
||||
const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
|
||||
|
@ -244,8 +244,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
|
|||
}
|
||||
}
|
||||
|
||||
static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
int i;
|
||||
const uint8_t* in = (const uint8_t*)src;
|
||||
uint8_t* out = (uint8_t*)dst;
|
||||
|
@ -286,9 +286,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
|
|||
}
|
||||
}
|
||||
|
||||
static void TransformColorInverse(const VP8LMultipliers* const m,
|
||||
const uint32_t* src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
static void TransformColorInverse_MSA(const VP8LMultipliers* const m,
|
||||
const uint32_t* src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
v16u8 src0, dst0;
|
||||
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
|
||||
(m->green_to_red_ << 16));
|
||||
|
@ -341,11 +341,12 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
|
|||
extern void VP8LDspInitMSA(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
|
||||
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
|
||||
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
|
||||
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
|
||||
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
|
||||
VP8LTransformColorInverse = TransformColorInverse;
|
||||
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA;
|
||||
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA;
|
||||
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA;
|
||||
|
||||
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA;
|
||||
VP8LTransformColorInverse = TransformColorInverse_MSA;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MSA
|
|
@ -11,14 +11,14 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "./lossless.h"
|
||||
#include "./neon.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/neon.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Colorspace conversion functions
|
||||
|
@ -26,8 +26,8 @@
|
|||
#if !defined(WORK_AROUND_GCC)
|
||||
// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
|
||||
// gcc-4.8.x at least.
|
||||
static void ConvertBGRAToRGBA(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + (num_pixels & ~15);
|
||||
for (; src < end; src += 16) {
|
||||
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||
|
@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
|
|||
VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs
|
||||
}
|
||||
|
||||
static void ConvertBGRAToBGR(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToBGR_NEON(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + (num_pixels & ~15);
|
||||
for (; src < end; src += 16) {
|
||||
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||
|
@ -53,8 +53,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
|
|||
VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs
|
||||
}
|
||||
|
||||
static void ConvertBGRAToRGB(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGB_NEON(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + (num_pixels & ~15);
|
||||
for (; src < end; src += 16) {
|
||||
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
|
||||
|
@ -71,8 +71,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
|
|||
|
||||
static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
|
||||
|
||||
static void ConvertBGRAToRGBA(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + (num_pixels & ~1);
|
||||
const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
|
||||
for (; src < end; src += 2) {
|
||||
|
@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = {
|
|||
{ 21, 22, 24, 25, 26, 28, 29, 30 }
|
||||
};
|
||||
|
||||
static void ConvertBGRAToBGR(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToBGR_NEON(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + (num_pixels & ~7);
|
||||
const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
|
||||
const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
|
||||
|
@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = {
|
|||
{ 21, 20, 26, 25, 24, 30, 29, 28 }
|
||||
};
|
||||
|
||||
static void ConvertBGRAToRGB(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGB_NEON(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const uint32_t* const end = src + (num_pixels & ~7);
|
||||
const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
|
||||
const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
|
||||
|
@ -139,7 +139,6 @@ static void ConvertBGRAToRGB(const uint32_t* src,
|
|||
|
||||
#endif // !WORK_AROUND_GCC
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Predictor Transform
|
||||
|
||||
|
@ -506,8 +505,8 @@ static const uint8_t kGreenShuffle[16] = {
|
|||
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
|
||||
};
|
||||
|
||||
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
|
||||
const uint8x16_t shuffle) {
|
||||
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
|
||||
const uint8x16_t shuffle) {
|
||||
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
|
||||
vtbl1q_u8(argb, vget_high_u8(shuffle)));
|
||||
}
|
||||
|
@ -515,15 +514,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
|
|||
// 255 = byte will be zeroed
|
||||
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
|
||||
|
||||
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
|
||||
const uint8x8_t shuffle) {
|
||||
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
|
||||
const uint8x8_t shuffle) {
|
||||
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
|
||||
vtbl1_u8(vget_high_u8(argb), shuffle));
|
||||
}
|
||||
#endif // USE_VTBLQ
|
||||
|
||||
static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
static void AddGreenToBlueAndRed_NEON(const uint32_t* src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
const uint32_t* const end = src + (num_pixels & ~3);
|
||||
#ifdef USE_VTBLQ
|
||||
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
|
||||
|
@ -532,7 +531,7 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
|
|||
#endif
|
||||
for (; src < end; src += 4, dst += 4) {
|
||||
const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
|
||||
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
|
||||
const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
|
||||
vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
|
||||
}
|
||||
// fallthrough and finish off with plain-C
|
||||
|
@ -542,9 +541,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
|
|||
//------------------------------------------------------------------------------
|
||||
// Color Transform
|
||||
|
||||
static void TransformColorInverse(const VP8LMultipliers* const m,
|
||||
const uint32_t* const src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
static void TransformColorInverse_NEON(const VP8LMultipliers* const m,
|
||||
const uint32_t* const src,
|
||||
int num_pixels, uint32_t* dst) {
|
||||
// sign-extended multiplying constants, pre-shifted by 6.
|
||||
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
|
||||
const int16_t rb[8] = {
|
||||
|
@ -575,7 +574,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
|
|||
const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
|
||||
const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
|
||||
// 0 g 0 g
|
||||
const uint8x16_t greens = DoGreenShuffle(in, shuffle);
|
||||
const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
|
||||
// x dr x db1
|
||||
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
|
||||
// x r' x b'
|
||||
|
@ -627,12 +626,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
|
|||
VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
|
||||
VP8LPredictorsAdd[13] = PredictorAdd13_NEON;
|
||||
|
||||
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
|
||||
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
|
||||
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
|
||||
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_NEON;
|
||||
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_NEON;
|
||||
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_NEON;
|
||||
|
||||
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
|
||||
VP8LTransformColorInverse = TransformColorInverse;
|
||||
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_NEON;
|
||||
VP8LTransformColorInverse = TransformColorInverse_NEON;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_NEON
|
|
@ -11,21 +11,22 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
||||
#include "./common_sse2.h"
|
||||
#include "./lossless.h"
|
||||
#include "./lossless_common.h"
|
||||
#include "src/dsp/common_sse2.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/lossless_common.h"
|
||||
#include <assert.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Predictor Transform
|
||||
|
||||
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
|
||||
uint32_t c2) {
|
||||
static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
|
||||
uint32_t c1,
|
||||
uint32_t c2) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
|
||||
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
|
||||
|
@ -37,8 +38,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
|
|||
return output;
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
|
||||
uint32_t c2) {
|
||||
static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
|
||||
uint32_t c1,
|
||||
uint32_t c2) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
|
||||
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
|
||||
|
@ -55,7 +57,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
|
|||
return output;
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
|
||||
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
|
||||
int pa_minus_pb;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i A0 = _mm_cvtsi32_si128(a);
|
||||
|
@ -88,8 +90,9 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
|
|||
*avg = _mm_sub_epi8(avg1, one);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
|
||||
__m128i* const avg) {
|
||||
static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
|
||||
const uint32_t a1,
|
||||
__m128i* const avg) {
|
||||
// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
|
||||
const __m128i ones = _mm_set1_epi8(1);
|
||||
const __m128i A0 = _mm_cvtsi32_si128(a0);
|
||||
|
@ -99,7 +102,7 @@ static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
|
|||
*avg = _mm_sub_epi8(avg1, one);
|
||||
}
|
||||
|
||||
static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
|
||||
static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
|
||||
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
|
||||
|
@ -107,15 +110,16 @@ static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
|
|||
return _mm_srli_epi16(sum, 1);
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
|
||||
static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
|
||||
__m128i output;
|
||||
Average2_uint32(a0, a1, &output);
|
||||
Average2_uint32_SSE2(a0, a1, &output);
|
||||
return _mm_cvtsi128_si32(output);
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
|
||||
static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
|
||||
uint32_t a2) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i avg1 = Average2_uint32_16(a0, a2);
|
||||
const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
|
||||
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
|
||||
const __m128i sum = _mm_add_epi16(avg1, A1);
|
||||
const __m128i avg2 = _mm_srli_epi16(sum, 1);
|
||||
|
@ -124,10 +128,10 @@ static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
|
|||
return output;
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
|
||||
uint32_t a2, uint32_t a3) {
|
||||
const __m128i avg1 = Average2_uint32_16(a0, a1);
|
||||
const __m128i avg2 = Average2_uint32_16(a2, a3);
|
||||
static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
|
||||
uint32_t a2, uint32_t a3) {
|
||||
const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
|
||||
const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
|
||||
const __m128i sum = _mm_add_epi16(avg2, avg1);
|
||||
const __m128i avg3 = _mm_srli_epi16(sum, 1);
|
||||
const __m128i A0 = _mm_packus_epi16(avg3, avg3);
|
||||
|
@ -136,41 +140,41 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
|
|||
}
|
||||
|
||||
static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average3(left, top[0], top[1]);
|
||||
const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(left, top[-1]);
|
||||
const uint32_t pred = Average2_SSE2(left, top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(left, top[0]);
|
||||
const uint32_t pred = Average2_SSE2(left, top[0]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(top[-1], top[0]);
|
||||
const uint32_t pred = Average2_SSE2(top[-1], top[0]);
|
||||
(void)left;
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(top[0], top[1]);
|
||||
const uint32_t pred = Average2_SSE2(top[0], top[1]);
|
||||
(void)left;
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
|
||||
const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Select(top[0], left, top[-1]);
|
||||
const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
|
||||
const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
|
||||
const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
|
||||
return pred;
|
||||
}
|
||||
|
||||
|
@ -272,9 +276,24 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
|
|||
#undef GENERATE_PREDICTOR_2
|
||||
|
||||
// Predictor10: average of (average of (L,TL), average of (T, TR)).
|
||||
#define DO_PRED10(OUT) do { \
|
||||
__m128i avgLTL, avg; \
|
||||
Average2_m128i(&L, &TL, &avgLTL); \
|
||||
Average2_m128i(&avgTTR, &avgLTL, &avg); \
|
||||
L = _mm_add_epi8(avg, src); \
|
||||
out[i + (OUT)] = _mm_cvtsi128_si32(L); \
|
||||
} while (0)
|
||||
|
||||
#define DO_PRED10_SHIFT do { \
|
||||
/* Rotate the pre-computed values for the next iteration.*/ \
|
||||
avgTTR = _mm_srli_si128(avgTTR, 4); \
|
||||
TL = _mm_srli_si128(TL, 4); \
|
||||
src = _mm_srli_si128(src, 4); \
|
||||
} while (0)
|
||||
|
||||
static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
|
||||
int num_pixels, uint32_t* out) {
|
||||
int i, j;
|
||||
int i;
|
||||
__m128i L = _mm_cvtsi32_si128(out[-1]);
|
||||
for (i = 0; i + 4 <= num_pixels; i += 4) {
|
||||
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
|
||||
|
@ -283,79 +302,90 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
|
|||
const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
|
||||
__m128i avgTTR;
|
||||
Average2_m128i(&T, &TR, &avgTTR);
|
||||
for (j = 0; j < 4; ++j) {
|
||||
__m128i avgLTL, avg;
|
||||
Average2_m128i(&L, &TL, &avgLTL);
|
||||
Average2_m128i(&avgTTR, &avgLTL, &avg);
|
||||
L = _mm_add_epi8(avg, src);
|
||||
out[i + j] = _mm_cvtsi128_si32(L);
|
||||
// Rotate the pre-computed values for the next iteration.
|
||||
avgTTR = _mm_srli_si128(avgTTR, 4);
|
||||
TL = _mm_srli_si128(TL, 4);
|
||||
src = _mm_srli_si128(src, 4);
|
||||
}
|
||||
DO_PRED10(0);
|
||||
DO_PRED10_SHIFT;
|
||||
DO_PRED10(1);
|
||||
DO_PRED10_SHIFT;
|
||||
DO_PRED10(2);
|
||||
DO_PRED10_SHIFT;
|
||||
DO_PRED10(3);
|
||||
}
|
||||
if (i != num_pixels) {
|
||||
VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
|
||||
}
|
||||
}
|
||||
#undef DO_PRED10
|
||||
#undef DO_PRED10_SHIFT
|
||||
|
||||
// Predictor11: select.
|
||||
static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
|
||||
__m128i* const out) {
|
||||
// We can unpack with any value on the upper 32 bits, provided it's the same
|
||||
// on both operands (to that their sum of abs diff is zero). Here we use *A.
|
||||
const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
|
||||
const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
|
||||
const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
|
||||
const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
|
||||
const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
|
||||
const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
|
||||
*out = _mm_packs_epi32(s_lo, s_hi);
|
||||
}
|
||||
#define DO_PRED11(OUT) do { \
|
||||
const __m128i L_lo = _mm_unpacklo_epi32(L, T); \
|
||||
const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); \
|
||||
const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/ \
|
||||
const __m128i mask = _mm_cmpgt_epi32(pb, pa); \
|
||||
const __m128i A = _mm_and_si128(mask, L); \
|
||||
const __m128i B = _mm_andnot_si128(mask, T); \
|
||||
const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \
|
||||
L = _mm_add_epi8(src, pred); \
|
||||
out[i + (OUT)] = _mm_cvtsi128_si32(L); \
|
||||
} while (0)
|
||||
|
||||
#define DO_PRED11_SHIFT do { \
|
||||
/* Shift the pre-computed value for the next iteration.*/ \
|
||||
T = _mm_srli_si128(T, 4); \
|
||||
TL = _mm_srli_si128(TL, 4); \
|
||||
src = _mm_srli_si128(src, 4); \
|
||||
pa = _mm_srli_si128(pa, 4); \
|
||||
} while (0)
|
||||
|
||||
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
|
||||
int num_pixels, uint32_t* out) {
|
||||
int i, j;
|
||||
int i;
|
||||
__m128i pa;
|
||||
__m128i L = _mm_cvtsi32_si128(out[-1]);
|
||||
for (i = 0; i + 4 <= num_pixels; i += 4) {
|
||||
__m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
|
||||
__m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
|
||||
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
|
||||
__m128i pa;
|
||||
GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|
|
||||
for (j = 0; j < 4; ++j) {
|
||||
const __m128i L_lo = _mm_unpacklo_epi32(L, L);
|
||||
const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
|
||||
const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); // pb = sum |L-TL|
|
||||
const __m128i mask = _mm_cmpgt_epi32(pb, pa);
|
||||
const __m128i A = _mm_and_si128(mask, L);
|
||||
const __m128i B = _mm_andnot_si128(mask, T);
|
||||
const __m128i pred = _mm_or_si128(A, B); // pred = (L > T)? L : T
|
||||
L = _mm_add_epi8(src, pred);
|
||||
out[i + j] = _mm_cvtsi128_si32(L);
|
||||
// Shift the pre-computed value for the next iteration.
|
||||
T = _mm_srli_si128(T, 4);
|
||||
TL = _mm_srli_si128(TL, 4);
|
||||
src = _mm_srli_si128(src, 4);
|
||||
pa = _mm_srli_si128(pa, 4);
|
||||
{
|
||||
// We can unpack with any value on the upper 32 bits, provided it's the
|
||||
// same on both operands (so that their sum of abs diff is zero). Here we
|
||||
// use T.
|
||||
const __m128i T_lo = _mm_unpacklo_epi32(T, T);
|
||||
const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
|
||||
const __m128i T_hi = _mm_unpackhi_epi32(T, T);
|
||||
const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
|
||||
const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
|
||||
const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
|
||||
pa = _mm_packs_epi32(s_lo, s_hi); // pa = sum |T-TL|
|
||||
}
|
||||
DO_PRED11(0);
|
||||
DO_PRED11_SHIFT;
|
||||
DO_PRED11(1);
|
||||
DO_PRED11_SHIFT;
|
||||
DO_PRED11(2);
|
||||
DO_PRED11_SHIFT;
|
||||
DO_PRED11(3);
|
||||
}
|
||||
if (i != num_pixels) {
|
||||
VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
|
||||
}
|
||||
}
|
||||
#undef DO_PRED11
|
||||
#undef DO_PRED11_SHIFT
|
||||
|
||||
// Predictor12: ClampedAddSubtractFull.
|
||||
#define DO_PRED12(DIFF, LANE, OUT) \
|
||||
do { \
|
||||
const __m128i all = _mm_add_epi16(L, (DIFF)); \
|
||||
const __m128i alls = _mm_packus_epi16(all, all); \
|
||||
const __m128i res = _mm_add_epi8(src, alls); \
|
||||
out[i + (OUT)] = _mm_cvtsi128_si32(res); \
|
||||
L = _mm_unpacklo_epi8(res, zero); \
|
||||
#define DO_PRED12(DIFF, LANE, OUT) do { \
|
||||
const __m128i all = _mm_add_epi16(L, (DIFF)); \
|
||||
const __m128i alls = _mm_packus_epi16(all, all); \
|
||||
const __m128i res = _mm_add_epi8(src, alls); \
|
||||
out[i + (OUT)] = _mm_cvtsi128_si32(res); \
|
||||
L = _mm_unpacklo_epi8(res, zero); \
|
||||
} while (0)
|
||||
|
||||
#define DO_PRED12_SHIFT(DIFF, LANE) do { \
|
||||
/* Shift the pre-computed value for the next iteration.*/ \
|
||||
if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \
|
||||
if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \
|
||||
src = _mm_srli_si128(src, 4); \
|
||||
} while (0)
|
||||
|
||||
|
@ -377,8 +407,11 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
|
|||
__m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
|
||||
__m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
|
||||
DO_PRED12(diff_lo, 0, 0);
|
||||
DO_PRED12_SHIFT(diff_lo, 0);
|
||||
DO_PRED12(diff_lo, 1, 1);
|
||||
DO_PRED12_SHIFT(diff_lo, 1);
|
||||
DO_PRED12(diff_hi, 0, 2);
|
||||
DO_PRED12_SHIFT(diff_hi, 0);
|
||||
DO_PRED12(diff_hi, 1, 3);
|
||||
}
|
||||
if (i != num_pixels) {
|
||||
|
@ -386,6 +419,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
|
|||
}
|
||||
}
|
||||
#undef DO_PRED12
|
||||
#undef DO_PRED12_SHIFT
|
||||
|
||||
// Due to averages with integers, values cannot be accumulated in parallel for
|
||||
// predictors 13.
|
||||
|
@ -394,8 +428,8 @@ GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)
|
|||
//------------------------------------------------------------------------------
|
||||
// Subtract-Green Transform
|
||||
|
||||
static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
int i;
|
||||
for (i = 0; i + 4 <= num_pixels; i += 4) {
|
||||
const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
|
||||
|
@ -414,9 +448,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
|
|||
//------------------------------------------------------------------------------
|
||||
// Color Transform
|
||||
|
||||
static void TransformColorInverse(const VP8LMultipliers* const m,
|
||||
const uint32_t* const src, int num_pixels,
|
||||
uint32_t* dst) {
|
||||
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
|
||||
const uint32_t* const src,
|
||||
int num_pixels, uint32_t* dst) {
|
||||
// sign-extended multiplying constants, pre-shifted by 5.
|
||||
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
|
||||
const __m128i mults_rb = _mm_set_epi16(
|
||||
|
@ -454,8 +488,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
|
|||
//------------------------------------------------------------------------------
|
||||
// Color-space conversion functions
|
||||
|
||||
static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
|
||||
uint8_t* dst) {
|
||||
static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
|
||||
uint8_t* dst) {
|
||||
const __m128i* in = (const __m128i*)src;
|
||||
__m128i* out = (__m128i*)dst;
|
||||
|
||||
|
@ -490,27 +524,26 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
|
|||
}
|
||||
}
|
||||
|
||||
static void ConvertBGRAToRGBA(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
|
||||
const __m128i* in = (const __m128i*)src;
|
||||
__m128i* out = (__m128i*)dst;
|
||||
while (num_pixels >= 8) {
|
||||
const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
|
||||
const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
|
||||
const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
|
||||
const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
|
||||
const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
|
||||
const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
|
||||
const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
|
||||
const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
|
||||
const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
|
||||
const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
|
||||
const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7
|
||||
const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7
|
||||
const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1...
|
||||
const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5...
|
||||
_mm_storeu_si128(out++, rgba0);
|
||||
_mm_storeu_si128(out++, rgba4);
|
||||
const __m128i A1 = _mm_loadu_si128(in++);
|
||||
const __m128i A2 = _mm_loadu_si128(in++);
|
||||
const __m128i B1 = _mm_and_si128(A1, red_blue_mask); // R 0 B 0
|
||||
const __m128i B2 = _mm_and_si128(A2, red_blue_mask); // R 0 B 0
|
||||
const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1); // 0 G 0 A
|
||||
const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2); // 0 G 0 A
|
||||
const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
const __m128i F1 = _mm_or_si128(E1, C1);
|
||||
const __m128i F2 = _mm_or_si128(E2, C2);
|
||||
_mm_storeu_si128(out++, F1);
|
||||
_mm_storeu_si128(out++, F2);
|
||||
num_pixels -= 8;
|
||||
}
|
||||
// left-overs
|
||||
|
@ -519,8 +552,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
|
|||
}
|
||||
}
|
||||
|
||||
static void ConvertBGRAToRGBA4444(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
|
||||
const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
|
||||
const __m128i* in = (const __m128i*)src;
|
||||
|
@ -541,7 +574,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
|
|||
const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7-
|
||||
const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7
|
||||
const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7
|
||||
#else
|
||||
const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7
|
||||
|
@ -555,8 +588,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
|
|||
}
|
||||
}
|
||||
|
||||
static void ConvertBGRAToRGB565(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
|
||||
const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
|
||||
const __m128i mask_0x07 = _mm_set1_epi8(0x07);
|
||||
|
@ -582,7 +615,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
|
|||
const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx
|
||||
const __m128i b1 = _mm_srli_epi16(b0, 3);
|
||||
const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7
|
||||
#else
|
||||
const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7
|
||||
|
@ -596,8 +629,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
|
|||
}
|
||||
}
|
||||
|
||||
static void ConvertBGRAToBGR(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
|
||||
const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
|
||||
const __m128i* in = (const __m128i*)src;
|
||||
|
@ -660,14 +693,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
|
|||
VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
|
||||
VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;
|
||||
|
||||
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
|
||||
VP8LTransformColorInverse = TransformColorInverse;
|
||||
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
|
||||
VP8LTransformColorInverse = TransformColorInverse_SSE2;
|
||||
|
||||
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
|
||||
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
|
||||
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
|
||||
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
|
||||
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
|
||||
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
|
||||
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
|
||||
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
|
||||
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
|
||||
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE2
|
|
@ -22,6 +22,7 @@
|
|||
#endif
|
||||
|
||||
#ifdef CLANG_BUILD
|
||||
#define ALPHAVAL (-1)
|
||||
#define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b)
|
||||
#define ADDVI_W(a, b) __msa_addvi_w((v4i32)a, b)
|
||||
#define SRAI_B(a, b) __msa_srai_b((v16i8)a, b)
|
||||
|
@ -32,6 +33,7 @@
|
|||
#define ANDI_B(a, b) __msa_andi_b((v16u8)a, b)
|
||||
#define ORI_B(a, b) __msa_ori_b((v16u8)a, b)
|
||||
#else
|
||||
#define ALPHAVAL (0xff)
|
||||
#define ADDVI_H(a, b) (a + b)
|
||||
#define ADDVI_W(a, b) (a + b)
|
||||
#define SRAI_B(a, b) (a >> b)
|
|
@ -14,11 +14,12 @@
|
|||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
// Right now, some intrinsics functions seem slower, so we disable them
|
||||
// everywhere except aarch64 where the inline assembly is incompatible.
|
||||
#if defined(__aarch64__)
|
||||
// everywhere except newer clang/gcc or aarch64 where the inline assembly is
|
||||
// incompatible.
|
||||
#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
|
||||
#define WEBP_USE_INTRINSICS // use intrinsics when possible
|
||||
#endif
|
||||
|
||||
|
@ -43,11 +44,11 @@
|
|||
// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
|
||||
// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
|
||||
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
|
||||
#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
|
||||
#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
|
||||
#define WORK_AROUND_GCC
|
||||
#endif
|
||||
|
||||
static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
|
||||
static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
|
||||
uint64x2x2_t row01, row23;
|
||||
|
||||
row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
|
|
@ -13,8 +13,8 @@
|
|||
|
||||
#include <assert.h>
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "../utils/rescaler_utils.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/utils/rescaler_utils.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Implementations of critical functions ImportRow / ExportRow
|
||||
|
@ -25,7 +25,8 @@
|
|||
//------------------------------------------------------------------------------
|
||||
// Row import
|
||||
|
||||
void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
|
||||
void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
|
||||
const uint8_t* src) {
|
||||
const int x_stride = wrk->num_channels;
|
||||
const int x_out_max = wrk->dst_width * wrk->num_channels;
|
||||
int channel;
|
||||
|
@ -56,7 +57,8 @@ void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
|
|||
}
|
||||
}
|
||||
|
||||
void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
|
||||
void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk,
|
||||
const uint8_t* src) {
|
||||
const int x_stride = wrk->num_channels;
|
||||
const int x_out_max = wrk->dst_width * wrk->num_channels;
|
||||
int channel;
|
||||
|
@ -92,7 +94,7 @@ void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Row export
|
||||
|
||||
void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
|
||||
void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
|
||||
int x_out;
|
||||
uint8_t* const dst = wrk->dst;
|
||||
rescaler_t* const irow = wrk->irow;
|
||||
|
@ -123,7 +125,7 @@ void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
|
|||
}
|
||||
}
|
||||
|
||||
void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) {
|
||||
void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
|
||||
int x_out;
|
||||
uint8_t* const dst = wrk->dst;
|
||||
rescaler_t* const irow = wrk->irow;
|
||||
|
@ -207,11 +209,14 @@ static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
|
|||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
|
||||
if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C;
|
||||
WebPRescalerExportRowShrink = WebPRescalerExportRowShrink_C;
|
||||
#endif
|
||||
|
||||
WebPRescalerImportRowExpand = WebPRescalerImportRowExpandC;
|
||||
WebPRescalerImportRowShrink = WebPRescalerImportRowShrinkC;
|
||||
WebPRescalerExportRowExpand = WebPRescalerExportRowExpandC;
|
||||
WebPRescalerExportRowShrink = WebPRescalerExportRowShrinkC;
|
||||
WebPRescalerImportRowExpand = WebPRescalerImportRowExpand_C;
|
||||
WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
|
||||
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
@ -219,11 +224,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
|
|||
WebPRescalerDspInitSSE2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (VP8GetCPUInfo(kNEON)) {
|
||||
WebPRescalerDspInitNEON();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MIPS32)
|
||||
if (VP8GetCPUInfo(kMIPS32)) {
|
||||
WebPRescalerDspInitMIPS32();
|
||||
|
@ -240,5 +240,18 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
|
|||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
WebPRescalerDspInitNEON();
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(WebPRescalerExportRowExpand != NULL);
|
||||
assert(WebPRescalerExportRowShrink != NULL);
|
||||
assert(WebPRescalerImportRowExpand != NULL);
|
||||
assert(WebPRescalerImportRowShrink != NULL);
|
||||
#endif // WEBP_REDUCE_SIZE
|
||||
rescaler_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
}
|
|
@ -11,17 +11,18 @@
|
|||
//
|
||||
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS32)
|
||||
#if defined(WEBP_USE_MIPS32) && !defined(WEBP_REDUCE_SIZE)
|
||||
|
||||
#include <assert.h>
|
||||
#include "../utils/rescaler_utils.h"
|
||||
#include "src/utils/rescaler_utils.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Row import
|
||||
|
||||
static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
|
||||
static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
|
||||
const uint8_t* src) {
|
||||
const int x_stride = wrk->num_channels;
|
||||
const int x_out_max = wrk->dst_width * wrk->num_channels;
|
||||
const int fx_scale = wrk->fx_scale;
|
||||
|
@ -80,7 +81,8 @@ static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
|
|||
}
|
||||
}
|
||||
|
||||
static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
|
||||
static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
|
||||
const uint8_t* src) {
|
||||
const int x_stride = wrk->num_channels;
|
||||
const int x_out_max = wrk->dst_width * wrk->num_channels;
|
||||
const int x_add = wrk->x_add;
|
||||
|
@ -144,7 +146,7 @@ static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
|
|||
//------------------------------------------------------------------------------
|
||||
// Row export
|
||||
|
||||
static void ExportRowExpand(WebPRescaler* const wrk) {
|
||||
static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) {
|
||||
uint8_t* dst = wrk->dst;
|
||||
rescaler_t* irow = wrk->irow;
|
||||
const int x_out_max = wrk->dst_width * wrk->num_channels;
|
||||
|
@ -207,7 +209,7 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
|
|||
}
|
||||
}
|
||||
|
||||
static void ExportRowShrink(WebPRescaler* const wrk) {
|
||||
static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) {
|
||||
const int x_out_max = wrk->dst_width * wrk->num_channels;
|
||||
uint8_t* dst = wrk->dst;
|
||||
rescaler_t* irow = wrk->irow;
|
||||
|
@ -278,10 +280,10 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
|
|||
extern void WebPRescalerDspInitMIPS32(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
|
||||
WebPRescalerImportRowExpand = ImportRowExpand;
|
||||
WebPRescalerImportRowShrink = ImportRowShrink;
|
||||
WebPRescalerExportRowExpand = ExportRowExpand;
|
||||
WebPRescalerExportRowShrink = ExportRowShrink;
|
||||
WebPRescalerImportRowExpand = ImportRowExpand_MIPS32;
|
||||
WebPRescalerImportRowShrink = ImportRowShrink_MIPS32;
|
||||
WebPRescalerExportRowExpand = ExportRowExpand_MIPS32;
|
||||
WebPRescalerExportRowShrink = ExportRowShrink_MIPS32;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS32
|
|
@ -11,12 +11,12 @@
|
|||
//
|
||||
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE)
|
||||
|
||||
#include <assert.h>
|
||||
#include "../utils/rescaler_utils.h"
|
||||
#include "src/utils/rescaler_utils.h"
|
||||
|
||||
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
|
||||
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
|
||||
|
@ -24,7 +24,7 @@
|
|||
//------------------------------------------------------------------------------
|
||||
// Row export
|
||||
|
||||
static void ExportRowShrink(WebPRescaler* const wrk) {
|
||||
static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
|
||||
int i;
|
||||
const int x_out_max = wrk->dst_width * wrk->num_channels;
|
||||
uint8_t* dst = wrk->dst;
|
||||
|
@ -162,7 +162,7 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
|
|||
}
|
||||
}
|
||||
|
||||
static void ExportRowExpand(WebPRescaler* const wrk) {
|
||||
static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
|
||||
int i;
|
||||
uint8_t* dst = wrk->dst;
|
||||
rescaler_t* irow = wrk->irow;
|
||||
|
@ -303,8 +303,8 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
|
|||
extern void WebPRescalerDspInitMIPSdspR2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
|
||||
WebPRescalerExportRowExpand = ExportRowExpand;
|
||||
WebPRescalerExportRowShrink = ExportRowShrink;
|
||||
WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2;
|
||||
WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS_DSP_R2
|
|
@ -11,14 +11,14 @@
|
|||
//
|
||||
// Author: Prashant Patil (prashant.patil@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MSA)
|
||||
#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "../utils/rescaler_utils.h"
|
||||
#include "./msa_macro.h"
|
||||
#include "src/utils/rescaler_utils.h"
|
||||
#include "src/dsp/msa_macro.h"
|
||||
|
||||
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
|
||||
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
|
||||
|
@ -246,7 +246,7 @@ static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
|
|||
}
|
||||
}
|
||||
|
||||
static void RescalerExportRowExpand(WebPRescaler* const wrk) {
|
||||
static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
|
||||
uint8_t* dst = wrk->dst;
|
||||
rescaler_t* irow = wrk->irow;
|
||||
const int x_out_max = wrk->dst_width * wrk->num_channels;
|
||||
|
@ -411,7 +411,7 @@ static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
|
|||
}
|
||||
}
|
||||
|
||||
static void RescalerExportRowShrink(WebPRescaler* const wrk) {
|
||||
static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
|
||||
uint8_t* dst = wrk->dst;
|
||||
rescaler_t* irow = wrk->irow;
|
||||
const int x_out_max = wrk->dst_width * wrk->num_channels;
|
||||
|
@ -433,8 +433,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
|
|||
extern void WebPRescalerDspInitMSA(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
|
||||
WebPRescalerExportRowExpand = RescalerExportRowExpand;
|
||||
WebPRescalerExportRowShrink = RescalerExportRowShrink;
|
||||
WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
|
||||
WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MSA
|
|
@ -11,14 +11,14 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_USE_NEON) && !defined(WEBP_REDUCE_SIZE)
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
#include "./neon.h"
|
||||
#include "../utils/rescaler_utils.h"
|
||||
#include "src/dsp/neon.h"
|
||||
#include "src/utils/rescaler_utils.h"
|
||||
|
||||
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
|
||||
#define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
|
||||
|
@ -41,9 +41,9 @@
|
|||
#error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
|
||||
#endif
|
||||
|
||||
static uint32x4_t Interpolate(const rescaler_t* const frow,
|
||||
const rescaler_t* const irow,
|
||||
uint32_t A, uint32_t B) {
|
||||
static uint32x4_t Interpolate_NEON(const rescaler_t* const frow,
|
||||
const rescaler_t* const irow,
|
||||
uint32_t A, uint32_t B) {
|
||||
LOAD_32x4(frow, A0);
|
||||
LOAD_32x4(irow, B0);
|
||||
const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A);
|
||||
|
@ -56,7 +56,7 @@ static uint32x4_t Interpolate(const rescaler_t* const frow,
|
|||
return E;
|
||||
}
|
||||
|
||||
static void RescalerExportRowExpand(WebPRescaler* const wrk) {
|
||||
static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
|
||||
int x_out;
|
||||
uint8_t* const dst = wrk->dst;
|
||||
rescaler_t* const irow = wrk->irow;
|
||||
|
@ -91,9 +91,9 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
|
|||
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
|
||||
for (x_out = 0; x_out < max_span; x_out += 8) {
|
||||
const uint32x4_t C0 =
|
||||
Interpolate(frow + x_out + 0, irow + x_out + 0, A, B);
|
||||
Interpolate_NEON(frow + x_out + 0, irow + x_out + 0, A, B);
|
||||
const uint32x4_t C1 =
|
||||
Interpolate(frow + x_out + 4, irow + x_out + 4, A, B);
|
||||
Interpolate_NEON(frow + x_out + 4, irow + x_out + 4, A, B);
|
||||
const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half);
|
||||
const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
|
||||
const uint16x4_t E0 = vmovn_u32(D0);
|
||||
|
@ -112,7 +112,7 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
|
|||
}
|
||||
}
|
||||
|
||||
static void RescalerExportRowShrink(WebPRescaler* const wrk) {
|
||||
static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
|
||||
int x_out;
|
||||
uint8_t* const dst = wrk->dst;
|
||||
rescaler_t* const irow = wrk->irow;
|
||||
|
@ -175,8 +175,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
|
|||
extern void WebPRescalerDspInitNEON(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) {
|
||||
WebPRescalerExportRowExpand = RescalerExportRowExpand;
|
||||
WebPRescalerExportRowShrink = RescalerExportRowShrink;
|
||||
WebPRescalerExportRowExpand = RescalerExportRowExpand_NEON;
|
||||
WebPRescalerExportRowShrink = RescalerExportRowShrink_NEON;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_NEON
|
|
@ -11,14 +11,14 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_USE_SSE2) && !defined(WEBP_REDUCE_SIZE)
|
||||
#include <emmintrin.h>
|
||||
|
||||
#include <assert.h>
|
||||
#include "../utils/rescaler_utils.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/utils/rescaler_utils.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Implementations of critical functions ImportRow / ExportRow
|
||||
|
@ -27,7 +27,7 @@
|
|||
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
|
||||
|
||||
// input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
|
||||
static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
|
||||
static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
|
||||
const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0
|
||||
|
@ -36,14 +36,14 @@ static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
|
|||
}
|
||||
|
||||
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
|
||||
static void LoadHeightPixels(const uint8_t* const src, __m128i* out) {
|
||||
static void LoadHeightPixels_SSE2(const uint8_t* const src, __m128i* out) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
|
||||
*out = _mm_unpacklo_epi8(A, zero);
|
||||
}
|
||||
|
||||
static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
|
||||
const uint8_t* src) {
|
||||
static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
|
||||
const uint8_t* src) {
|
||||
rescaler_t* frow = wrk->frow;
|
||||
const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
|
||||
const int x_add = wrk->x_add;
|
||||
|
@ -54,10 +54,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
|
|||
assert(wrk->x_expand);
|
||||
if (wrk->num_channels == 4) {
|
||||
if (wrk->src_width < 2) {
|
||||
WebPRescalerImportRowExpandC(wrk, src);
|
||||
WebPRescalerImportRowExpand_C(wrk, src);
|
||||
return;
|
||||
}
|
||||
LoadTwoPixels(src, &cur_pixels);
|
||||
LoadTwoPixels_SSE2(src, &cur_pixels);
|
||||
src += 4;
|
||||
while (1) {
|
||||
const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
|
||||
|
@ -67,7 +67,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
|
|||
if (frow >= frow_end) break;
|
||||
accum -= wrk->x_sub;
|
||||
if (accum < 0) {
|
||||
LoadTwoPixels(src, &cur_pixels);
|
||||
LoadTwoPixels_SSE2(src, &cur_pixels);
|
||||
src += 4;
|
||||
accum += x_add;
|
||||
}
|
||||
|
@ -76,10 +76,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
|
|||
int left;
|
||||
const uint8_t* const src_limit = src + wrk->src_width - 8;
|
||||
if (wrk->src_width < 8) {
|
||||
WebPRescalerImportRowExpandC(wrk, src);
|
||||
WebPRescalerImportRowExpand_C(wrk, src);
|
||||
return;
|
||||
}
|
||||
LoadHeightPixels(src, &cur_pixels);
|
||||
LoadHeightPixels_SSE2(src, &cur_pixels);
|
||||
src += 7;
|
||||
left = 7;
|
||||
while (1) {
|
||||
|
@ -94,7 +94,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
|
|||
if (--left) {
|
||||
cur_pixels = _mm_srli_si128(cur_pixels, 2);
|
||||
} else if (src <= src_limit) {
|
||||
LoadHeightPixels(src, &cur_pixels);
|
||||
LoadHeightPixels_SSE2(src, &cur_pixels);
|
||||
src += 7;
|
||||
left = 7;
|
||||
} else { // tail
|
||||
|
@ -110,8 +110,8 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
|
|||
assert(accum == 0);
|
||||
}
|
||||
|
||||
static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
|
||||
const uint8_t* src) {
|
||||
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
|
||||
const uint8_t* src) {
|
||||
const int x_sub = wrk->x_sub;
|
||||
int accum = 0;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
@ -123,7 +123,7 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
|
|||
const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;
|
||||
|
||||
if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
|
||||
WebPRescalerImportRowShrinkC(wrk, src);
|
||||
WebPRescalerImportRowShrink_C(wrk, src);
|
||||
return;
|
||||
}
|
||||
assert(!WebPRescalerInputDone(wrk));
|
||||
|
@ -169,12 +169,12 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
|
|||
// Row export
|
||||
|
||||
// load *src as epi64, multiply by mult and store result in [out0 ... out3]
|
||||
static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
|
||||
const __m128i* const mult,
|
||||
__m128i* const out0,
|
||||
__m128i* const out1,
|
||||
__m128i* const out2,
|
||||
__m128i* const out3) {
|
||||
static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src,
|
||||
const __m128i* const mult,
|
||||
__m128i* const out0,
|
||||
__m128i* const out1,
|
||||
__m128i* const out2,
|
||||
__m128i* const out3) {
|
||||
const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
|
||||
const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
|
||||
const __m128i A2 = _mm_srli_epi64(A0, 32);
|
||||
|
@ -192,12 +192,12 @@ static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
|
|||
}
|
||||
}
|
||||
|
||||
static WEBP_INLINE void ProcessRow(const __m128i* const A0,
|
||||
const __m128i* const A1,
|
||||
const __m128i* const A2,
|
||||
const __m128i* const A3,
|
||||
const __m128i* const mult,
|
||||
uint8_t* const dst) {
|
||||
static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
|
||||
const __m128i* const A1,
|
||||
const __m128i* const A2,
|
||||
const __m128i* const A3,
|
||||
const __m128i* const mult,
|
||||
uint8_t* const dst) {
|
||||
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
|
||||
const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
|
||||
const __m128i B0 = _mm_mul_epu32(*A0, *mult);
|
||||
|
@ -210,7 +210,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0,
|
|||
const __m128i C3 = _mm_add_epi64(B3, rounder);
|
||||
const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
|
||||
const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
|
||||
#if (WEBP_RESCALER_FIX < 32)
|
||||
#if (WEBP_RESCALER_RFIX < 32)
|
||||
const __m128i D2 =
|
||||
_mm_and_si128(_mm_slli_epi64(C2, 32 - WEBP_RESCALER_RFIX), mask);
|
||||
const __m128i D3 =
|
||||
|
@ -226,7 +226,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0,
|
|||
_mm_storel_epi64((__m128i*)dst, G);
|
||||
}
|
||||
|
||||
static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
|
||||
static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
|
||||
int x_out;
|
||||
uint8_t* const dst = wrk->dst;
|
||||
rescaler_t* const irow = wrk->irow;
|
||||
|
@ -240,8 +240,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
|
|||
if (wrk->y_accum == 0) {
|
||||
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
|
||||
__m128i A0, A1, A2, A3;
|
||||
LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3);
|
||||
ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
|
||||
LoadDispatchAndMult_SSE2(frow + x_out, NULL, &A0, &A1, &A2, &A3);
|
||||
ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
|
||||
}
|
||||
for (; x_out < x_out_max; ++x_out) {
|
||||
const uint32_t J = frow[x_out];
|
||||
|
@ -257,8 +257,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
|
|||
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
|
||||
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
|
||||
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
|
||||
LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3);
|
||||
LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3);
|
||||
LoadDispatchAndMult_SSE2(frow + x_out, &mA, &A0, &A1, &A2, &A3);
|
||||
LoadDispatchAndMult_SSE2(irow + x_out, &mB, &B0, &B1, &B2, &B3);
|
||||
{
|
||||
const __m128i C0 = _mm_add_epi64(A0, B0);
|
||||
const __m128i C1 = _mm_add_epi64(A1, B1);
|
||||
|
@ -272,7 +272,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
|
|||
const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX);
|
||||
const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX);
|
||||
const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX);
|
||||
ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out);
|
||||
ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult, dst + x_out);
|
||||
}
|
||||
}
|
||||
for (; x_out < x_out_max; ++x_out) {
|
||||
|
@ -286,7 +286,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
|
|||
}
|
||||
}
|
||||
|
||||
static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
|
||||
static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
|
||||
int x_out;
|
||||
uint8_t* const dst = wrk->dst;
|
||||
rescaler_t* const irow = wrk->irow;
|
||||
|
@ -303,8 +303,8 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
|
|||
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
|
||||
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
|
||||
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
|
||||
LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
|
||||
LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
|
||||
LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
|
||||
LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
|
||||
{
|
||||
const __m128i C0 = _mm_add_epi64(B0, rounder);
|
||||
const __m128i C1 = _mm_add_epi64(B1, rounder);
|
||||
|
@ -324,7 +324,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
|
|||
const __m128i G1 = _mm_or_si128(D1, F3);
|
||||
_mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
|
||||
_mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
|
||||
ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
|
||||
ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
|
||||
}
|
||||
}
|
||||
for (; x_out < x_out_max; ++x_out) {
|
||||
|
@ -340,10 +340,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
|
|||
const __m128i zero = _mm_setzero_si128();
|
||||
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
|
||||
__m128i A0, A1, A2, A3;
|
||||
LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
|
||||
LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
|
||||
_mm_storeu_si128((__m128i*)(irow + x_out + 0), zero);
|
||||
_mm_storeu_si128((__m128i*)(irow + x_out + 4), zero);
|
||||
ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
|
||||
ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
|
||||
}
|
||||
for (; x_out < x_out_max; ++x_out) {
|
||||
const int v = (int)MULT_FIX(irow[x_out], scale);
|
||||
|
@ -362,10 +362,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
|
|||
extern void WebPRescalerDspInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
|
||||
WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2;
|
||||
WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
|
||||
WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
|
||||
WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
|
||||
WebPRescalerImportRowExpand = RescalerImportRowExpand_SSE2;
|
||||
WebPRescalerImportRowShrink = RescalerImportRowShrink_SSE2;
|
||||
WebPRescalerExportRowExpand = RescalerExportRowExpand_SSE2;
|
||||
WebPRescalerExportRowShrink = RescalerExportRowShrink_SSE2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE2
|
|
@ -0,0 +1,166 @@
|
|||
// Copyright 2017 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// distortion calculation
|
||||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h> // for abs()
|
||||
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// SSIM / PSNR
|
||||
|
||||
// hat-shaped filter. Sum of coefficients is equal to 16.
|
||||
static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
|
||||
1, 2, 3, 4, 3, 2, 1
|
||||
};
|
||||
static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
|
||||
|
||||
static WEBP_INLINE double SSIMCalculation(
|
||||
const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
|
||||
const uint32_t w2 = N * N;
|
||||
const uint32_t C1 = 20 * w2;
|
||||
const uint32_t C2 = 60 * w2;
|
||||
const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
|
||||
const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
|
||||
const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
|
||||
if (xmxm + ymym >= C3) {
|
||||
const int64_t xmym = (int64_t)stats->xm * stats->ym;
|
||||
const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
|
||||
const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
|
||||
const uint64_t syy = (uint64_t)stats->yym * N - ymym;
|
||||
// we descale by 8 to prevent overflow during the fnum/fden multiply.
|
||||
const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
|
||||
const uint64_t den_S = (sxx + syy + C2) >> 8;
|
||||
const uint64_t fnum = (2 * xmym + C1) * num_S;
|
||||
const uint64_t fden = (xmxm + ymym + C1) * den_S;
|
||||
const double r = (double)fnum / fden;
|
||||
assert(r >= 0. && r <= 1.0);
|
||||
return r;
|
||||
}
|
||||
return 1.; // area is too dark to contribute meaningfully
|
||||
}
|
||||
|
||||
double VP8SSIMFromStats(const VP8DistoStats* const stats) {
|
||||
return SSIMCalculation(stats, kWeightSum);
|
||||
}
|
||||
|
||||
double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
|
||||
return SSIMCalculation(stats, stats->w);
|
||||
}
|
||||
|
||||
static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
|
||||
const uint8_t* src2, int stride2,
|
||||
int xo, int yo, int W, int H) {
|
||||
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
|
||||
const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
|
||||
const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
|
||||
: yo + VP8_SSIM_KERNEL;
|
||||
const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
|
||||
const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
|
||||
: xo + VP8_SSIM_KERNEL;
|
||||
int x, y;
|
||||
src1 += ymin * stride1;
|
||||
src2 += ymin * stride2;
|
||||
for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
|
||||
for (x = xmin; x <= xmax; ++x) {
|
||||
const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
|
||||
* kWeight[VP8_SSIM_KERNEL + y - yo];
|
||||
const uint32_t s1 = src1[x];
|
||||
const uint32_t s2 = src2[x];
|
||||
stats.w += w;
|
||||
stats.xm += w * s1;
|
||||
stats.ym += w * s2;
|
||||
stats.xxm += w * s1 * s1;
|
||||
stats.xym += w * s1 * s2;
|
||||
stats.yym += w * s2 * s2;
|
||||
}
|
||||
}
|
||||
return VP8SSIMFromStatsClipped(&stats);
|
||||
}
|
||||
|
||||
static double SSIMGet_C(const uint8_t* src1, int stride1,
|
||||
const uint8_t* src2, int stride2) {
|
||||
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
|
||||
int x, y;
|
||||
for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
|
||||
for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
|
||||
const uint32_t w = kWeight[x] * kWeight[y];
|
||||
const uint32_t s1 = src1[x];
|
||||
const uint32_t s2 = src2[x];
|
||||
stats.xm += w * s1;
|
||||
stats.ym += w * s2;
|
||||
stats.xxm += w * s1 * s1;
|
||||
stats.xym += w * s1 * s2;
|
||||
stats.yym += w * s2 * s2;
|
||||
}
|
||||
}
|
||||
return VP8SSIMFromStats(&stats);
|
||||
}
|
||||
|
||||
#endif // !defined(WEBP_REDUCE_SIZE)
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#if !defined(WEBP_DISABLE_STATS)
|
||||
static uint32_t AccumulateSSE_C(const uint8_t* src1,
|
||||
const uint8_t* src2, int len) {
|
||||
int i;
|
||||
uint32_t sse2 = 0;
|
||||
assert(len <= 65535); // to ensure that accumulation fits within uint32_t
|
||||
for (i = 0; i < len; ++i) {
|
||||
const int32_t diff = src1[i] - src2[i];
|
||||
sse2 += diff * diff;
|
||||
}
|
||||
return sse2;
|
||||
}
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
VP8SSIMGetFunc VP8SSIMGet;
|
||||
VP8SSIMGetClippedFunc VP8SSIMGetClipped;
|
||||
#endif
|
||||
#if !defined(WEBP_DISABLE_STATS)
|
||||
VP8AccumulateSSEFunc VP8AccumulateSSE;
|
||||
#endif
|
||||
|
||||
extern void VP8SSIMDspInitSSE2(void);
|
||||
|
||||
static volatile VP8CPUInfo ssim_last_cpuinfo_used =
|
||||
(VP8CPUInfo)&ssim_last_cpuinfo_used;
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
|
||||
if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
|
||||
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
VP8SSIMGetClipped = SSIMGetClipped_C;
|
||||
VP8SSIMGet = SSIMGet_C;
|
||||
#endif
|
||||
|
||||
#if !defined(WEBP_DISABLE_STATS)
|
||||
VP8AccumulateSSE = AccumulateSSE_C;
|
||||
#endif
|
||||
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
VP8SSIMDspInitSSE2();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ssim_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
}
|
|
@ -0,0 +1,165 @@
|
|||
// Copyright 2017 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// SSE2 version of distortion calculation
|
||||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
||||
#include <assert.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
#include "src/dsp/common_sse2.h"
|
||||
|
||||
#if !defined(WEBP_DISABLE_STATS)
|
||||
|
||||
// Helper function
|
||||
static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
|
||||
__m128i* const sum) {
|
||||
// take abs(a-b) in 8b
|
||||
const __m128i a_b = _mm_subs_epu8(a, b);
|
||||
const __m128i b_a = _mm_subs_epu8(b, a);
|
||||
const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
|
||||
// zero-extend to 16b
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
|
||||
const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
|
||||
// multiply with self
|
||||
const __m128i sum1 = _mm_madd_epi16(C0, C0);
|
||||
const __m128i sum2 = _mm_madd_epi16(C1, C1);
|
||||
*sum = _mm_add_epi32(sum1, sum2);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// SSIM / PSNR entry point
|
||||
|
||||
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
|
||||
const uint8_t* src2, int len) {
|
||||
int i = 0;
|
||||
uint32_t sse2 = 0;
|
||||
if (len >= 16) {
|
||||
const int limit = len - 32;
|
||||
int32_t tmp[4];
|
||||
__m128i sum1;
|
||||
__m128i sum = _mm_setzero_si128();
|
||||
__m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
|
||||
__m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
|
||||
i += 16;
|
||||
while (i <= limit) {
|
||||
const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
|
||||
const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
|
||||
__m128i sum2;
|
||||
i += 16;
|
||||
SubtractAndSquare_SSE2(a0, b0, &sum1);
|
||||
sum = _mm_add_epi32(sum, sum1);
|
||||
a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
|
||||
b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
|
||||
i += 16;
|
||||
SubtractAndSquare_SSE2(a1, b1, &sum2);
|
||||
sum = _mm_add_epi32(sum, sum2);
|
||||
}
|
||||
SubtractAndSquare_SSE2(a0, b0, &sum1);
|
||||
sum = _mm_add_epi32(sum, sum1);
|
||||
_mm_storeu_si128((__m128i*)tmp, sum);
|
||||
sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
|
||||
}
|
||||
|
||||
for (; i < len; ++i) {
|
||||
const int32_t diff = src1[i] - src2[i];
|
||||
sse2 += diff * diff;
|
||||
}
|
||||
return sse2;
|
||||
}
|
||||
#endif // !defined(WEBP_DISABLE_STATS)
|
||||
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
|
||||
static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
|
||||
uint16_t tmp[8];
|
||||
const __m128i a = _mm_srli_si128(*m, 8);
|
||||
const __m128i b = _mm_add_epi16(*m, a);
|
||||
_mm_storeu_si128((__m128i*)tmp, b);
|
||||
return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
|
||||
}
|
||||
|
||||
static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
|
||||
const __m128i a = _mm_srli_si128(*m, 8);
|
||||
const __m128i b = _mm_add_epi32(*m, a);
|
||||
const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
|
||||
return (uint32_t)_mm_cvtsi128_si32(c);
|
||||
}
|
||||
|
||||
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
|
||||
|
||||
#define ACCUMULATE_ROW(WEIGHT) do { \
|
||||
/* compute row weight (Wx * Wy) */ \
|
||||
const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
|
||||
const __m128i W = _mm_mullo_epi16(Wx, Wy); \
|
||||
/* process 8 bytes at a time (7 bytes, actually) */ \
|
||||
const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
|
||||
const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
|
||||
/* convert to 16b and multiply by weight */ \
|
||||
const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
|
||||
const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
|
||||
const __m128i wa1 = _mm_mullo_epi16(a1, W); \
|
||||
const __m128i wb1 = _mm_mullo_epi16(b1, W); \
|
||||
/* accumulate */ \
|
||||
xm = _mm_add_epi16(xm, wa1); \
|
||||
ym = _mm_add_epi16(ym, wb1); \
|
||||
xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
|
||||
xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
|
||||
yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
|
||||
src1 += stride1; \
|
||||
src2 += stride2; \
|
||||
} while (0)
|
||||
|
||||
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
|
||||
const uint8_t* src2, int stride2) {
|
||||
VP8DistoStats stats;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128i xm = zero, ym = zero; // 16b accums
|
||||
__m128i xxm = zero, yym = zero, xym = zero; // 32b accum
|
||||
const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
|
||||
assert(2 * VP8_SSIM_KERNEL + 1 == 7);
|
||||
ACCUMULATE_ROW(1);
|
||||
ACCUMULATE_ROW(2);
|
||||
ACCUMULATE_ROW(3);
|
||||
ACCUMULATE_ROW(4);
|
||||
ACCUMULATE_ROW(3);
|
||||
ACCUMULATE_ROW(2);
|
||||
ACCUMULATE_ROW(1);
|
||||
stats.xm = HorizontalAdd16b_SSE2(&xm);
|
||||
stats.ym = HorizontalAdd16b_SSE2(&ym);
|
||||
stats.xxm = HorizontalAdd32b_SSE2(&xxm);
|
||||
stats.xym = HorizontalAdd32b_SSE2(&xym);
|
||||
stats.yym = HorizontalAdd32b_SSE2(&yym);
|
||||
return VP8SSIMFromStats(&stats);
|
||||
}
|
||||
|
||||
#endif // !defined(WEBP_REDUCE_SIZE)
|
||||
|
||||
extern void VP8SSIMDspInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
|
||||
#if !defined(WEBP_DISABLE_STATS)
|
||||
VP8AccumulateSSE = AccumulateSSE_SSE2;
|
||||
#endif
|
||||
#if !defined(WEBP_REDUCE_SIZE)
|
||||
VP8SSIMGet = SSIMGet_SSE2;
|
||||
#endif
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE2
|
||||
|
||||
WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
|
||||
|
||||
#endif // WEBP_USE_SSE2
|
|
@ -11,8 +11,8 @@
|
|||
//
|
||||
// Author: somnath@google.com (Somnath Banerjee)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "./yuv.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/dsp/yuv.h"
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
|
@ -63,17 +63,17 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
|
|||
const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
|
||||
const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
|
||||
FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
|
||||
top_dst + (2 * x - 1) * XSTEP); \
|
||||
top_dst + (2 * x - 1) * (XSTEP)); \
|
||||
FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
|
||||
top_dst + (2 * x - 0) * XSTEP); \
|
||||
top_dst + (2 * x - 0) * (XSTEP)); \
|
||||
} \
|
||||
if (bottom_y != NULL) { \
|
||||
const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
|
||||
const uint32_t uv1 = (diag_12 + uv) >> 1; \
|
||||
FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
|
||||
bottom_dst + (2 * x - 1) * XSTEP); \
|
||||
bottom_dst + (2 * x - 1) * (XSTEP)); \
|
||||
FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
|
||||
bottom_dst + (2 * x + 0) * XSTEP); \
|
||||
bottom_dst + (2 * x + 0) * (XSTEP)); \
|
||||
} \
|
||||
tl_uv = t_uv; \
|
||||
l_uv = uv; \
|
||||
|
@ -82,24 +82,50 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
|
|||
{ \
|
||||
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
|
||||
FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
|
||||
top_dst + (len - 1) * XSTEP); \
|
||||
top_dst + (len - 1) * (XSTEP)); \
|
||||
} \
|
||||
if (bottom_y != NULL) { \
|
||||
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
|
||||
FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
|
||||
bottom_dst + (len - 1) * XSTEP); \
|
||||
bottom_dst + (len - 1) * (XSTEP)); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
// All variants implemented.
|
||||
UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3)
|
||||
UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3)
|
||||
UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
|
||||
UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
|
||||
UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
|
||||
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
|
||||
UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4)
|
||||
UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4)
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4)
|
||||
UPSAMPLE_FUNC(UpsampleRgbLinePair_C, VP8YuvToRgb, 3)
|
||||
UPSAMPLE_FUNC(UpsampleBgrLinePair_C, VP8YuvToBgr, 3)
|
||||
UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2)
|
||||
UPSAMPLE_FUNC(UpsampleRgb565LinePair_C, VP8YuvToRgb565, 2)
|
||||
#else
|
||||
static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
|
||||
const uint8_t* top_u, const uint8_t* top_v,
|
||||
const uint8_t* cur_u, const uint8_t* cur_v,
|
||||
uint8_t* top_dst, uint8_t* bottom_dst, int len) {
|
||||
(void)top_y;
|
||||
(void)bottom_y;
|
||||
(void)top_u;
|
||||
(void)top_v;
|
||||
(void)cur_u;
|
||||
(void)cur_v;
|
||||
(void)top_dst;
|
||||
(void)bottom_dst;
|
||||
(void)len;
|
||||
assert(0); // COLORSPACE SUPPORT NOT COMPILED
|
||||
}
|
||||
#define UpsampleArgbLinePair_C EmptyUpsampleFunc
|
||||
#define UpsampleRgbLinePair_C EmptyUpsampleFunc
|
||||
#define UpsampleBgrLinePair_C EmptyUpsampleFunc
|
||||
#define UpsampleRgba4444LinePair_C EmptyUpsampleFunc
|
||||
#define UpsampleRgb565LinePair_C EmptyUpsampleFunc
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
|
||||
#endif
|
||||
|
||||
#undef LOAD_UV
|
||||
#undef UPSAMPLE_FUNC
|
||||
|
@ -141,7 +167,6 @@ DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
|
|||
|
||||
WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
|
||||
WebPInitUpsamplers();
|
||||
VP8YUVInit();
|
||||
#ifdef FANCY_UPSAMPLING
|
||||
return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB];
|
||||
#else
|
||||
|
@ -158,16 +183,33 @@ extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
|
|||
void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
|
||||
uint8_t* dst, int len) { \
|
||||
int i; \
|
||||
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
|
||||
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \
|
||||
}
|
||||
|
||||
YUV444_FUNC(WebPYuv444ToRgbC, VP8YuvToRgb, 3)
|
||||
YUV444_FUNC(WebPYuv444ToBgrC, VP8YuvToBgr, 3)
|
||||
YUV444_FUNC(WebPYuv444ToRgbaC, VP8YuvToRgba, 4)
|
||||
YUV444_FUNC(WebPYuv444ToBgraC, VP8YuvToBgra, 4)
|
||||
YUV444_FUNC(WebPYuv444ToArgbC, VP8YuvToArgb, 4)
|
||||
YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2)
|
||||
YUV444_FUNC(WebPYuv444ToRgb565C, VP8YuvToRgb565, 2)
|
||||
YUV444_FUNC(WebPYuv444ToRgba_C, VP8YuvToRgba, 4)
|
||||
YUV444_FUNC(WebPYuv444ToBgra_C, VP8YuvToBgra, 4)
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
YUV444_FUNC(WebPYuv444ToRgb_C, VP8YuvToRgb, 3)
|
||||
YUV444_FUNC(WebPYuv444ToBgr_C, VP8YuvToBgr, 3)
|
||||
YUV444_FUNC(WebPYuv444ToArgb_C, VP8YuvToArgb, 4)
|
||||
YUV444_FUNC(WebPYuv444ToRgba4444_C, VP8YuvToRgba4444, 2)
|
||||
YUV444_FUNC(WebPYuv444ToRgb565_C, VP8YuvToRgb565, 2)
|
||||
#else
|
||||
static void EmptyYuv444Func(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
(void)y;
|
||||
(void)u;
|
||||
(void)v;
|
||||
(void)dst;
|
||||
(void)len;
|
||||
}
|
||||
#define WebPYuv444ToRgb_C EmptyYuv444Func
|
||||
#define WebPYuv444ToBgr_C EmptyYuv444Func
|
||||
#define WebPYuv444ToArgb_C EmptyYuv444Func
|
||||
#define WebPYuv444ToRgba4444_C EmptyYuv444Func
|
||||
#define WebPYuv444ToRgb565_C EmptyYuv444Func
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
|
||||
#undef YUV444_FUNC
|
||||
|
||||
|
@ -182,17 +224,17 @@ static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
|
|||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
|
||||
if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
|
||||
|
||||
WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgbC;
|
||||
WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgbaC;
|
||||
WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgrC;
|
||||
WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgraC;
|
||||
WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgbC;
|
||||
WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C;
|
||||
WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565C;
|
||||
WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgbaC;
|
||||
WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgraC;
|
||||
WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgbC;
|
||||
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C;
|
||||
WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgba_C;
|
||||
WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgra_C;
|
||||
WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgb_C;
|
||||
WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgr_C;
|
||||
WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgb_C;
|
||||
WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444_C;
|
||||
WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565_C;
|
||||
WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgba_C;
|
||||
WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgra_C;
|
||||
WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgb_C;
|
||||
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
|
||||
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
@ -224,17 +266,19 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
|
|||
if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
|
||||
|
||||
#ifdef FANCY_UPSAMPLING
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
|
||||
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
|
||||
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
|
||||
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
|
||||
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
|
||||
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
|
||||
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
|
||||
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
|
||||
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
|
||||
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_C;
|
||||
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_C;
|
||||
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_C;
|
||||
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_C;
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_C;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_C;
|
||||
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_C;
|
||||
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_C;
|
||||
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_C;
|
||||
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_C;
|
||||
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C;
|
||||
#endif
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
|
@ -243,11 +287,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
|
|||
WebPInitUpsamplersSSE2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (VP8GetCPUInfo(kNEON)) {
|
||||
WebPInitUpsamplersNEON();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
if (VP8GetCPUInfo(kMIPSdspR2)) {
|
||||
WebPInitUpsamplersMIPSdspR2();
|
||||
|
@ -259,6 +298,26 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
|
|||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
WebPInitUpsamplersNEON();
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(WebPUpsamplers[MODE_RGBA] != NULL);
|
||||
assert(WebPUpsamplers[MODE_BGRA] != NULL);
|
||||
assert(WebPUpsamplers[MODE_rgbA] != NULL);
|
||||
assert(WebPUpsamplers[MODE_bgrA] != NULL);
|
||||
assert(WebPUpsamplers[MODE_RGB] != NULL);
|
||||
assert(WebPUpsamplers[MODE_BGR] != NULL);
|
||||
assert(WebPUpsamplers[MODE_ARGB] != NULL);
|
||||
assert(WebPUpsamplers[MODE_RGBA_4444] != NULL);
|
||||
assert(WebPUpsamplers[MODE_RGB_565] != NULL);
|
||||
assert(WebPUpsamplers[MODE_Argb] != NULL);
|
||||
assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
|
||||
|
||||
#endif // FANCY_UPSAMPLING
|
||||
upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
|
||||
}
|
|
@ -12,14 +12,12 @@
|
|||
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
|
||||
// Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
|
||||
#include <assert.h>
|
||||
#include "./yuv.h"
|
||||
|
||||
#if !defined(WEBP_YUV_USE_TABLE)
|
||||
#include "src/dsp/yuv.h"
|
||||
|
||||
#define YUV_TO_RGB(Y, U, V, R, G, B) do { \
|
||||
const int t1 = MultHi(Y, 19077); \
|
||||
|
@ -48,6 +46,7 @@
|
|||
); \
|
||||
} while (0)
|
||||
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
|
||||
int r, g, b;
|
||||
YUV_TO_RGB(y, u, v, r, g, b);
|
||||
|
@ -68,7 +67,7 @@ static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
|
|||
{
|
||||
const int rg = (r & 0xf8) | (g >> 5);
|
||||
const int gb = ((g << 3) & 0xe0) | (b >> 3);
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
rgb[0] = gb;
|
||||
rgb[1] = rg;
|
||||
#else
|
||||
|
@ -84,7 +83,7 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
|
|||
{
|
||||
const int rg = (r & 0xf0) | (g >> 4);
|
||||
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
argb[0] = ba;
|
||||
argb[1] = rg;
|
||||
#else
|
||||
|
@ -93,11 +92,12 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
|
|||
#endif
|
||||
}
|
||||
}
|
||||
#endif // WEBP_YUV_USE_TABLE
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Alpha handling variants
|
||||
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const argb) {
|
||||
int r, g, b;
|
||||
|
@ -107,6 +107,7 @@ static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
|
|||
argb[2] = g;
|
||||
argb[3] = b;
|
||||
}
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
|
||||
uint8_t* const bgra) {
|
||||
int r, g, b;
|
||||
|
@ -200,13 +201,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
|
|||
}
|
||||
|
||||
// All variants implemented.
|
||||
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
|
||||
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
|
||||
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
|
||||
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
|
||||
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
|
||||
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
|
||||
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
|
||||
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
|
||||
#undef LOAD_UV
|
||||
#undef UPSAMPLE_FUNC
|
||||
|
@ -217,17 +220,19 @@ UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
|
|||
extern void WebPInitUpsamplersMIPSdspR2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
|
||||
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
|
||||
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
|
||||
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
|
||||
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
|
||||
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
|
||||
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
|
||||
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
|
||||
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
|
||||
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
|
||||
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
|
||||
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
}
|
||||
|
||||
#endif // FANCY_UPSAMPLING
|
||||
|
@ -242,13 +247,15 @@ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
|
|||
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
|
||||
}
|
||||
|
||||
YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
|
||||
YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
|
||||
YUV444_FUNC(Yuv444ToRgba, YuvToRgba, 4)
|
||||
YUV444_FUNC(Yuv444ToBgra, YuvToBgra, 4)
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
|
||||
YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
|
||||
YUV444_FUNC(Yuv444ToArgb, YuvToArgb, 4)
|
||||
YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
|
||||
YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
|
||||
#undef YUV444_FUNC
|
||||
|
||||
|
@ -258,17 +265,19 @@ YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
|
|||
extern void WebPInitYUV444ConvertersMIPSdspR2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
|
||||
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
|
||||
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
|
||||
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
|
||||
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
|
||||
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
|
||||
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
|
||||
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
|
||||
WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb;
|
||||
WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
|
||||
WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565;
|
||||
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
|
||||
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
|
||||
WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb;
|
||||
WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS_DSP_R2
|
|
@ -12,12 +12,12 @@
|
|||
// Author: Prashant Patil (prashant.patil@imgtec.com)
|
||||
|
||||
#include <string.h>
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MSA)
|
||||
|
||||
#include "./msa_macro.h"
|
||||
#include "./yuv.h"
|
||||
#include "src/dsp/msa_macro.h"
|
||||
#include "src/dsp/yuv.h"
|
||||
|
||||
#ifdef FANCY_UPSAMPLING
|
||||
|
||||
|
@ -274,7 +274,7 @@ static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
|
|||
const int b = Clip8(b1 >> 6);
|
||||
const int rg = (r & 0xf8) | (g >> 5);
|
||||
const int gb = ((g << 3) & 0xe0) | (b >> 3);
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
rgb[0] = gb;
|
||||
rgb[1] = rg;
|
||||
#else
|
||||
|
@ -293,7 +293,7 @@ static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
|
|||
const int b = Clip8(b1 >> 6);
|
||||
const int rg = (r & 0xf0) | (g >> 4);
|
||||
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
argb[0] = ba;
|
||||
argb[1] = rg;
|
||||
#else
|
||||
|
@ -374,7 +374,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
|
|||
static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
|
||||
const uint8_t* v, uint8_t* dst, int length) {
|
||||
v16u8 R, G, B;
|
||||
const v16u8 A = (v16u8)__msa_ldi_b(0xff);
|
||||
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
|
||||
while (length >= 16) {
|
||||
CALC_RGB16(y, u, v, R, G, B);
|
||||
STORE16_4(R, G, B, A, dst);
|
||||
|
@ -402,7 +402,7 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
|
|||
static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
|
||||
const uint8_t* v, uint8_t* dst, int length) {
|
||||
v16u8 R, G, B;
|
||||
const v16u8 A = (v16u8)__msa_ldi_b(0xff);
|
||||
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
|
||||
while (length >= 16) {
|
||||
CALC_RGB16(y, u, v, R, G, B);
|
||||
STORE16_4(B, G, R, A, dst);
|
||||
|
@ -430,7 +430,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
|
|||
static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
|
||||
const uint8_t* v, uint8_t* dst, int length) {
|
||||
v16u8 R, G, B;
|
||||
const v16u8 A = (v16u8)__msa_ldi_b(0xff);
|
||||
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
|
||||
while (length >= 16) {
|
||||
CALC_RGB16(y, u, v, R, G, B);
|
||||
STORE16_4(A, R, G, B, dst);
|
||||
|
@ -459,11 +459,11 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
|
|||
const uint8_t* v, uint8_t* dst, int length) {
|
||||
v16u8 R, G, B, RG, BA, tmp0, tmp1;
|
||||
while (length >= 16) {
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
|
||||
#else
|
||||
#else
|
||||
CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
|
||||
#endif
|
||||
#endif
|
||||
y += 16;
|
||||
u += 16;
|
||||
v += 16;
|
||||
|
@ -473,7 +473,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
|
|||
if (length > 8) {
|
||||
uint8_t temp[2 * 16] = { 0 };
|
||||
memcpy(temp, y, length * sizeof(*temp));
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
|
||||
#else
|
||||
CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
|
||||
|
@ -482,7 +482,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
|
|||
} else if (length > 0) {
|
||||
uint8_t temp[2 * 8] = { 0 };
|
||||
memcpy(temp, y, length * sizeof(*temp));
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
|
||||
#else
|
||||
CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
|
||||
|
@ -495,11 +495,11 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
|
|||
const uint8_t* v, uint8_t* dst, int length) {
|
||||
v16u8 R, G, B, RG, GB, tmp0, tmp1;
|
||||
while (length >= 16) {
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
CALC_RGB565(y, u, v, GB, RG, 16, dst);
|
||||
#else
|
||||
#else
|
||||
CALC_RGB565(y, u, v, RG, GB, 16, dst);
|
||||
#endif
|
||||
#endif
|
||||
y += 16;
|
||||
u += 16;
|
||||
v += 16;
|
||||
|
@ -509,7 +509,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
|
|||
if (length > 8) {
|
||||
uint8_t temp[2 * 16] = { 0 };
|
||||
memcpy(temp, y, length * sizeof(*temp));
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
CALC_RGB565(temp, u, v, GB, RG, 16, temp);
|
||||
#else
|
||||
CALC_RGB565(temp, u, v, RG, GB, 16, temp);
|
||||
|
@ -518,7 +518,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
|
|||
} else if (length > 0) {
|
||||
uint8_t temp[2 * 8] = { 0 };
|
||||
memcpy(temp, y, length * sizeof(*temp));
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
CALC_RGB565(temp, u, v, GB, RG, 8, temp);
|
||||
#else
|
||||
CALC_RGB565(temp, u, v, RG, GB, 8, temp);
|
||||
|
@ -640,13 +640,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
|
|||
} \
|
||||
}
|
||||
|
||||
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
|
||||
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
|
||||
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
|
||||
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
|
||||
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
|
||||
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
|
||||
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
|
||||
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entry point
|
||||
|
@ -656,17 +658,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
|
|||
extern void WebPInitUpsamplersMSA(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
|
||||
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
|
||||
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
|
||||
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
|
||||
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
|
||||
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
|
||||
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
|
||||
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
|
||||
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
|
||||
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
|
||||
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
}
|
||||
|
||||
#endif // FANCY_UPSAMPLING
|
|
@ -12,15 +12,15 @@
|
|||
// Author: mans@mansr.com (Mans Rullgard)
|
||||
// Based on SSE code by: somnath@google.com (Somnath Banerjee)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
|
||||
#include <assert.h>
|
||||
#include <arm_neon.h>
|
||||
#include <string.h>
|
||||
#include "./neon.h"
|
||||
#include "./yuv.h"
|
||||
#include "src/dsp/neon.h"
|
||||
#include "src/dsp/yuv.h"
|
||||
|
||||
#ifdef FANCY_UPSAMPLING
|
||||
|
||||
|
@ -58,8 +58,8 @@
|
|||
} while (0)
|
||||
|
||||
// Turn the macro into a function for reducing code-size when non-critical
|
||||
static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
|
||||
uint8_t *out) {
|
||||
static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2,
|
||||
uint8_t *out) {
|
||||
UPSAMPLE_16PIXELS(r1, r2, out);
|
||||
}
|
||||
|
||||
|
@ -70,7 +70,7 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
|
|||
/* replicate last byte */ \
|
||||
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \
|
||||
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \
|
||||
Upsample16Pixels(r1, r2, out); \
|
||||
Upsample16Pixels_NEON(r1, r2, out); \
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
@ -243,13 +243,15 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
|
|||
}
|
||||
|
||||
// NEON variants of the fancy upsampler.
|
||||
NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair, Argb, 4)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, Rgba4444, 2)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair, Rgb565, 2)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair_NEON, Rgba, 4)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair_NEON, Bgra, 4)
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair_NEON, Rgb, 3)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair_NEON, Bgr, 3)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair_NEON, Argb, 4)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_NEON, Rgba4444, 2)
|
||||
NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair_NEON, Rgb565, 2)
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entry point
|
||||
|
@ -259,17 +261,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
|
|||
extern void WebPInitUpsamplersNEON(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) {
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
|
||||
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
|
||||
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
|
||||
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
|
||||
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
|
||||
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
|
||||
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
|
||||
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
|
||||
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
|
||||
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
|
||||
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_NEON;
|
||||
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_NEON;
|
||||
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_NEON;
|
||||
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_NEON;
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_NEON;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_NEON;
|
||||
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_NEON;
|
||||
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_NEON;
|
||||
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_NEON;
|
||||
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_NEON;
|
||||
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_NEON;
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
}
|
||||
|
||||
#endif // FANCY_UPSAMPLING
|
|
@ -11,14 +11,14 @@
|
|||
//
|
||||
// Author: somnath@google.com (Somnath Banerjee)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
||||
#include <assert.h>
|
||||
#include <emmintrin.h>
|
||||
#include <string.h>
|
||||
#include "./yuv.h"
|
||||
#include "src/dsp/yuv.h"
|
||||
|
||||
#ifdef FANCY_UPSAMPLING
|
||||
|
||||
|
@ -83,13 +83,13 @@
|
|||
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
|
||||
\
|
||||
/* pack the alternate pixels */ \
|
||||
PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \
|
||||
PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \
|
||||
PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \
|
||||
PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \
|
||||
}
|
||||
|
||||
// Turn the macro into a function for reducing code-size when non-critical
|
||||
static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
|
||||
uint8_t* const out) {
|
||||
static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
|
||||
uint8_t* const out) {
|
||||
UPSAMPLE_32PIXELS(r1, r2, out);
|
||||
}
|
||||
|
||||
|
@ -101,30 +101,30 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
|
|||
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \
|
||||
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \
|
||||
/* using the shared function instead of the macro saves ~3k code size */ \
|
||||
Upsample32Pixels(r1, r2, out); \
|
||||
Upsample32Pixels_SSE2(r1, r2, out); \
|
||||
}
|
||||
|
||||
#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \
|
||||
top_dst, bottom_dst, cur_x, num_pixels) { \
|
||||
int n; \
|
||||
for (n = 0; n < (num_pixels); ++n) { \
|
||||
FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \
|
||||
top_dst + ((cur_x) + n) * XSTEP); \
|
||||
FUNC((top_y)[(cur_x) + n], r_u[n], r_v[n], \
|
||||
(top_dst) + ((cur_x) + n) * (XSTEP)); \
|
||||
} \
|
||||
if (bottom_y != NULL) { \
|
||||
if ((bottom_y) != NULL) { \
|
||||
for (n = 0; n < (num_pixels); ++n) { \
|
||||
FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
|
||||
bottom_dst + ((cur_x) + n) * XSTEP); \
|
||||
FUNC((bottom_y)[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
|
||||
(bottom_dst) + ((cur_x) + n) * (XSTEP)); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
|
||||
top_dst, bottom_dst, cur_x) do { \
|
||||
FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \
|
||||
if (bottom_y != NULL) { \
|
||||
FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \
|
||||
bottom_dst + (cur_x) * XSTEP); \
|
||||
FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \
|
||||
if ((bottom_y) != NULL) { \
|
||||
FUNC##32_SSE2((bottom_y) + (cur_x), r_u + 64, r_v + 64, \
|
||||
(bottom_dst) + (cur_x) * (XSTEP)); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
@ -169,13 +169,16 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
|
|||
}
|
||||
|
||||
// SSE2 variants of the fancy upsampler.
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair_SSE2, VP8YuvToRgba, 4)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair_SSE2, VP8YuvToBgra, 4)
|
||||
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE2, VP8YuvToRgb, 3)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE2, VP8YuvToBgr, 3)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair_SSE2, VP8YuvToArgb, 4)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_SSE2, VP8YuvToRgba4444, 2)
|
||||
SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair_SSE2, VP8YuvToRgb565, 2)
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
|
||||
#undef GET_M
|
||||
#undef PACK_AND_STORE
|
||||
|
@ -193,17 +196,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
|
|||
extern void WebPInitUpsamplersSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
|
||||
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
|
||||
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
|
||||
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
|
||||
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
|
||||
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
|
||||
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
|
||||
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
|
||||
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
|
||||
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
|
||||
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_SSE2;
|
||||
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_SSE2;
|
||||
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_SSE2;
|
||||
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_SSE2;
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE2;
|
||||
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE2;
|
||||
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_SSE2;
|
||||
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_SSE2;
|
||||
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_SSE2;
|
||||
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_SSE2;
|
||||
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_SSE2;
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
}
|
||||
|
||||
#endif // FANCY_UPSAMPLING
|
||||
|
@ -213,29 +218,46 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
|
|||
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
|
||||
extern void WebPInitYUV444ConvertersSSE2(void);
|
||||
|
||||
#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \
|
||||
extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u, \
|
||||
const uint8_t* v, uint8_t* dst, int len); \
|
||||
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
|
||||
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
|
||||
uint8_t* dst, int len); \
|
||||
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
|
||||
uint8_t* dst, int len) { \
|
||||
int i; \
|
||||
const int max_len = len & ~31; \
|
||||
for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\
|
||||
for (i = 0; i < max_len; i += 32) { \
|
||||
CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \
|
||||
} \
|
||||
if (i < len) { /* C-fallback */ \
|
||||
WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i); \
|
||||
CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \
|
||||
} \
|
||||
}
|
||||
|
||||
YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4);
|
||||
YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4);
|
||||
YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3);
|
||||
YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3);
|
||||
YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4);
|
||||
YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4);
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3);
|
||||
YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3);
|
||||
YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4)
|
||||
YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \
|
||||
WebPYuv444ToRgba4444_C, 2)
|
||||
YUV444_FUNC(Yuv444ToRgb565_SSE2, VP8YuvToRgb56532_SSE2, WebPYuv444ToRgb565_C, 2)
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) {
|
||||
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
|
||||
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
|
||||
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
|
||||
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
|
||||
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba_SSE2;
|
||||
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra_SSE2;
|
||||
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba_SSE2;
|
||||
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra_SSE2;
|
||||
#if !defined(WEBP_REDUCE_CSP)
|
||||
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE2;
|
||||
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE2;
|
||||
WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb_SSE2;
|
||||
WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444_SSE2;
|
||||
WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565_SSE2;
|
||||
WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb_SSE2;
|
||||
WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444_SSE2;
|
||||
#endif // WEBP_REDUCE_CSP
|
||||
}
|
||||
|
||||
#else
|
|
@ -11,63 +11,11 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./yuv.h"
|
||||
#include "src/dsp/yuv.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined(WEBP_YUV_USE_TABLE)
|
||||
|
||||
static int done = 0;
|
||||
|
||||
static WEBP_INLINE uint8_t clip(int v, int max_value) {
|
||||
return v < 0 ? 0 : v > max_value ? max_value : v;
|
||||
}
|
||||
|
||||
int16_t VP8kVToR[256], VP8kUToB[256];
|
||||
int32_t VP8kVToG[256], VP8kUToG[256];
|
||||
uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
|
||||
uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {
|
||||
int i;
|
||||
if (done) {
|
||||
return;
|
||||
}
|
||||
#ifndef USE_YUVj
|
||||
for (i = 0; i < 256; ++i) {
|
||||
VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
|
||||
VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
|
||||
VP8kVToG[i] = -45773 * (i - 128);
|
||||
VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX;
|
||||
}
|
||||
for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
|
||||
const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
|
||||
VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
|
||||
VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
|
||||
}
|
||||
#else
|
||||
for (i = 0; i < 256; ++i) {
|
||||
VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
|
||||
VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
|
||||
VP8kVToG[i] = -46802 * (i - 128);
|
||||
VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
|
||||
}
|
||||
for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
|
||||
const int k = i;
|
||||
VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
|
||||
VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
|
||||
}
|
||||
#endif
|
||||
|
||||
done = 1;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
|
||||
|
||||
#endif // WEBP_YUV_USE_TABLE
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Plain-C version
|
||||
|
||||
|
@ -75,14 +23,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
|
|||
static void FUNC_NAME(const uint8_t* y, \
|
||||
const uint8_t* u, const uint8_t* v, \
|
||||
uint8_t* dst, int len) { \
|
||||
const uint8_t* const end = dst + (len & ~1) * XSTEP; \
|
||||
const uint8_t* const end = dst + (len & ~1) * (XSTEP); \
|
||||
while (dst != end) { \
|
||||
FUNC(y[0], u[0], v[0], dst); \
|
||||
FUNC(y[1], u[0], v[0], dst + XSTEP); \
|
||||
FUNC(y[1], u[0], v[0], dst + (XSTEP)); \
|
||||
y += 2; \
|
||||
++u; \
|
||||
++v; \
|
||||
dst += 2 * XSTEP; \
|
||||
dst += 2 * (XSTEP); \
|
||||
} \
|
||||
if (len & 1) { \
|
||||
FUNC(y[0], u[0], v[0], dst); \
|
||||
|
@ -168,7 +116,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
|
|||
//-----------------------------------------------------------------------------
|
||||
// ARGB -> YUV converters
|
||||
|
||||
static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
|
||||
static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
const uint32_t p = argb[i];
|
||||
|
@ -220,14 +168,14 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
|||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i, rgb += 3) {
|
||||
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i, bgr += 3) {
|
||||
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
|
||||
|
@ -246,6 +194,7 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
|
|||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
|
||||
static uint16_t clip_y(int v) {
|
||||
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
|
||||
|
@ -283,6 +232,7 @@ static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
|
|||
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
|
||||
}
|
||||
}
|
||||
#endif // !WEBP_NEON_OMIT_C_CODE
|
||||
|
||||
#undef MAX_Y
|
||||
|
||||
|
@ -308,22 +258,26 @@ static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
|
|||
(VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
|
||||
|
||||
extern void WebPInitConvertARGBToYUVSSE2(void);
|
||||
extern void WebPInitConvertARGBToYUVNEON(void);
|
||||
extern void WebPInitSharpYUVSSE2(void);
|
||||
extern void WebPInitSharpYUVNEON(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
|
||||
if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
|
||||
|
||||
WebPConvertARGBToY = ConvertARGBToY;
|
||||
WebPConvertARGBToY = ConvertARGBToY_C;
|
||||
WebPConvertARGBToUV = WebPConvertARGBToUV_C;
|
||||
|
||||
WebPConvertRGB24ToY = ConvertRGB24ToY;
|
||||
WebPConvertBGR24ToY = ConvertBGR24ToY;
|
||||
WebPConvertRGB24ToY = ConvertRGB24ToY_C;
|
||||
WebPConvertBGR24ToY = ConvertBGR24ToY_C;
|
||||
|
||||
WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
|
||||
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
|
||||
WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
|
||||
#endif
|
||||
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
@ -333,5 +287,23 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
|
|||
}
|
||||
#endif // WEBP_USE_SSE2
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
WebPInitConvertARGBToYUVNEON();
|
||||
WebPInitSharpYUVNEON();
|
||||
}
|
||||
#endif // WEBP_USE_NEON
|
||||
|
||||
assert(WebPConvertARGBToY != NULL);
|
||||
assert(WebPConvertARGBToUV != NULL);
|
||||
assert(WebPConvertRGB24ToY != NULL);
|
||||
assert(WebPConvertBGR24ToY != NULL);
|
||||
assert(WebPConvertRGBA32ToUV != NULL);
|
||||
assert(WebPSharpYUVUpdateY != NULL);
|
||||
assert(WebPSharpYUVUpdateRGB != NULL);
|
||||
assert(WebPSharpYUVFilterRow != NULL);
|
||||
|
||||
rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
|
||||
}
|
|
@ -35,18 +35,8 @@
|
|||
#ifndef WEBP_DSP_YUV_H_
|
||||
#define WEBP_DSP_YUV_H_
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "../dec/vp8_dec.h"
|
||||
|
||||
#if defined(WEBP_EXPERIMENTAL_FEATURES)
|
||||
// Do NOT activate this feature for real compression. This is only experimental!
|
||||
// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
|
||||
// This colorspace is close to Rec.601's Y'CbCr model with the notable
|
||||
// difference of allowing larger range for luma/chroma.
|
||||
// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
|
||||
// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
|
||||
// #define USE_YUVj
|
||||
#endif
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/dec/vp8_dec.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// YUV -> RGB conversion
|
||||
|
@ -58,12 +48,8 @@ extern "C" {
|
|||
enum {
|
||||
YUV_FIX = 16, // fixed-point precision for RGB->YUV
|
||||
YUV_HALF = 1 << (YUV_FIX - 1),
|
||||
YUV_MASK = (256 << YUV_FIX) - 1,
|
||||
YUV_RANGE_MIN = -227, // min value of r/g/b output
|
||||
YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output
|
||||
|
||||
YUV_FIX2 = 6, // fixed-point precision for YUV->RGB
|
||||
YUV_HALF2 = 1 << YUV_FIX2 >> 1,
|
||||
YUV_MASK2 = (256 << YUV_FIX2) - 1
|
||||
};
|
||||
|
||||
|
@ -111,7 +97,7 @@ static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
|
|||
const int b = VP8YUVToB(y, u); // 5 usable bits
|
||||
const int rg = (r & 0xf8) | (g >> 5);
|
||||
const int gb = ((g << 3) & 0xe0) | (b >> 3);
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
rgb[0] = gb;
|
||||
rgb[1] = rg;
|
||||
#else
|
||||
|
@ -127,7 +113,7 @@ static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
|
|||
const int b = VP8YUVToB(y, u); // 4 usable bits
|
||||
const int rg = (r & 0xf0) | (g >> 4);
|
||||
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
|
||||
#ifdef WEBP_SWAP_16BIT_CSP
|
||||
#if (WEBP_SWAP_16BIT_CSP == 1)
|
||||
argb[0] = ba;
|
||||
argb[1] = rg;
|
||||
#else
|
||||
|
@ -157,29 +143,26 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
|
|||
rgba[3] = 0xff;
|
||||
}
|
||||
|
||||
// Must be called before everything, to initialize the tables.
|
||||
void VP8YUVInit(void);
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// SSE2 extra functions (mostly for upsampling_sse2.c)
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
||||
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
|
||||
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
|
||||
const uint8_t* v, uint8_t* dst);
|
||||
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst);
|
||||
|
||||
#endif // WEBP_USE_SSE2
|
||||
|
||||
|
@ -192,8 +175,6 @@ static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
|
|||
return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
|
||||
}
|
||||
|
||||
#ifndef USE_YUVj
|
||||
|
||||
static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
|
||||
const int luma = 16839 * r + 33059 * g + 6420 * b;
|
||||
return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX; // no need to clip
|
||||
|
@ -209,28 +190,6 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
|
|||
return VP8ClipUV(v, rounding);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// This JPEG-YUV colorspace, only for comparison!
|
||||
// These are also 16bit precision coefficients from Rec.601, but with full
|
||||
// [0..255] output range.
|
||||
static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
|
||||
const int luma = 19595 * r + 38470 * g + 7471 * b;
|
||||
return (luma + rounding) >> YUV_FIX; // no need to clip
|
||||
}
|
||||
|
||||
static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
|
||||
const int u = -11058 * r - 21710 * g + 32768 * b;
|
||||
return VP8ClipUV(u, rounding);
|
||||
}
|
||||
|
||||
static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
|
||||
const int v = 32768 * r - 27439 * g - 5329 * b;
|
||||
return VP8ClipUV(v, rounding);
|
||||
}
|
||||
|
||||
#endif // USE_YUVj
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
|
@ -12,11 +12,11 @@
|
|||
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS32)
|
||||
|
||||
#include "./yuv.h"
|
||||
#include "src/dsp/yuv.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// simple point-sampling
|
||||
|
@ -77,10 +77,10 @@ static void FUNC_NAME(const uint8_t* y, \
|
|||
} \
|
||||
}
|
||||
|
||||
ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
|
||||
ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
|
||||
ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
|
||||
ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
|
||||
ROW_FUNC(YuvToRgbRow_MIPS32, 3, 0, 1, 2, 0)
|
||||
ROW_FUNC(YuvToRgbaRow_MIPS32, 4, 0, 1, 2, 3)
|
||||
ROW_FUNC(YuvToBgrRow_MIPS32, 3, 2, 1, 0, 0)
|
||||
ROW_FUNC(YuvToBgraRow_MIPS32, 4, 2, 1, 0, 3)
|
||||
|
||||
#undef ROW_FUNC
|
||||
|
||||
|
@ -90,10 +90,10 @@ ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
|
|||
extern void WebPInitSamplersMIPS32(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
|
||||
WebPSamplers[MODE_RGB] = YuvToRgbRow;
|
||||
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
|
||||
WebPSamplers[MODE_BGR] = YuvToBgrRow;
|
||||
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
|
||||
WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPS32;
|
||||
WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPS32;
|
||||
WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPS32;
|
||||
WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPS32;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS32
|
|
@ -12,11 +12,11 @@
|
|||
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
|
||||
// Djordje Pesut (djordje.pesut@imgtec.com)
|
||||
|
||||
#include "./dsp.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
|
||||
#include "./yuv.h"
|
||||
#include "src/dsp/yuv.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// simple point-sampling
|
||||
|
@ -105,10 +105,10 @@ static void FUNC_NAME(const uint8_t* y, \
|
|||
} \
|
||||
}
|
||||
|
||||
ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
|
||||
ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
|
||||
ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
|
||||
ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
|
||||
ROW_FUNC(YuvToRgbRow_MIPSdspR2, 3, 0, 1, 2, 0)
|
||||
ROW_FUNC(YuvToRgbaRow_MIPSdspR2, 4, 0, 1, 2, 3)
|
||||
ROW_FUNC(YuvToBgrRow_MIPSdspR2, 3, 2, 1, 0, 0)
|
||||
ROW_FUNC(YuvToBgraRow_MIPSdspR2, 4, 2, 1, 0, 3)
|
||||
|
||||
#undef ROW_FUNC
|
||||
#undef ASM_CLOBBER_LIST
|
||||
|
@ -121,10 +121,10 @@ ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
|
|||
extern void WebPInitSamplersMIPSdspR2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
|
||||
WebPSamplers[MODE_RGB] = YuvToRgbRow;
|
||||
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
|
||||
WebPSamplers[MODE_BGR] = YuvToBgrRow;
|
||||
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
|
||||
WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPSdspR2;
|
||||
WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPSdspR2;
|
||||
WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPSdspR2;
|
||||
WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPSdspR2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_MIPS_DSP_R2
|
|
@ -0,0 +1,288 @@
|
|||
// Copyright 2017 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// YUV->RGB conversion functions
|
||||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "src/dsp/yuv.h"
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "src/dsp/neon.h"
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
|
||||
const uint8x8_t G,
|
||||
const uint8x8_t B) {
|
||||
const uint16x8_t r = vmovl_u8(R);
|
||||
const uint16x8_t g = vmovl_u8(G);
|
||||
const uint16x8_t b = vmovl_u8(B);
|
||||
const uint16x4_t r_lo = vget_low_u16(r);
|
||||
const uint16x4_t r_hi = vget_high_u16(r);
|
||||
const uint16x4_t g_lo = vget_low_u16(g);
|
||||
const uint16x4_t g_hi = vget_high_u16(g);
|
||||
const uint16x4_t b_lo = vget_low_u16(b);
|
||||
const uint16x4_t b_hi = vget_high_u16(b);
|
||||
const uint32x4_t tmp0_lo = vmull_n_u16( r_lo, 16839u);
|
||||
const uint32x4_t tmp0_hi = vmull_n_u16( r_hi, 16839u);
|
||||
const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);
|
||||
const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
|
||||
const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);
|
||||
const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
|
||||
const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),
|
||||
vrshrn_n_u32(tmp2_hi, 16));
|
||||
const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));
|
||||
return vqmovn_u16(Y2);
|
||||
}
|
||||
|
||||
static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
int i;
|
||||
for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
|
||||
const uint8x8x3_t RGB = vld3_u8(rgb);
|
||||
const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
|
||||
vst1_u8(y + i, Y);
|
||||
}
|
||||
for (; i < width; ++i, rgb += 3) { // left-over
|
||||
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
int i;
|
||||
for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
|
||||
const uint8x8x3_t BGR = vld3_u8(bgr);
|
||||
const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);
|
||||
vst1_u8(y + i, Y);
|
||||
}
|
||||
for (; i < width; ++i, bgr += 3) { // left-over
|
||||
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
|
||||
int i;
|
||||
for (i = 0; i + 8 <= width; i += 8) {
|
||||
const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
|
||||
const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);
|
||||
vst1_u8(y + i, Y);
|
||||
}
|
||||
for (; i < width; ++i) { // left-over
|
||||
const uint32_t p = argb[i];
|
||||
y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
|
||||
YUV_HALF);
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
|
||||
#define MULTIPLY_16b_PREAMBLE(r, g, b) \
|
||||
const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r)); \
|
||||
const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \
|
||||
const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g)); \
|
||||
const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \
|
||||
const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b)); \
|
||||
const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
|
||||
|
||||
#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do { \
|
||||
const int32x4_t tmp0_lo = vmull_n_s16( r_lo, C0); \
|
||||
const int32x4_t tmp0_hi = vmull_n_s16( r_hi, C0); \
|
||||
const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1); \
|
||||
const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1); \
|
||||
const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2); \
|
||||
const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2); \
|
||||
const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16), \
|
||||
vshrn_n_s32(tmp2_hi, 16)); \
|
||||
DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST)); \
|
||||
} while (0)
|
||||
|
||||
// This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
|
||||
#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do { \
|
||||
MULTIPLY_16b_PREAMBLE(r, g, b); \
|
||||
MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST); \
|
||||
MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \
|
||||
} while (0)
|
||||
|
||||
static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
|
||||
uint8_t* u, uint8_t* v, int width) {
|
||||
int i;
|
||||
for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
|
||||
const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
|
||||
int16x8_t U, V;
|
||||
CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V);
|
||||
vst1_u8(u + i, vqrshrun_n_s16(U, 2));
|
||||
vst1_u8(v + i, vqrshrun_n_s16(V, 2));
|
||||
}
|
||||
for (; i < width; i += 1, rgb += 4) {
|
||||
const int r = rgb[0], g = rgb[1], b = rgb[2];
|
||||
u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
|
||||
v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
||||
int src_width, int do_store) {
|
||||
int i;
|
||||
for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
|
||||
const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
|
||||
const uint16x8_t R = vpaddlq_u8(RGB.val[2]); // pair-wise adds
|
||||
const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
|
||||
const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
|
||||
int16x8_t U_tmp, V_tmp;
|
||||
CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
|
||||
{
|
||||
const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
|
||||
const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
|
||||
if (do_store) {
|
||||
vst1_u8(u, U);
|
||||
vst1_u8(v, V);
|
||||
} else {
|
||||
const uint8x8_t prev_u = vld1_u8(u);
|
||||
const uint8x8_t prev_v = vld1_u8(v);
|
||||
vst1_u8(u, vrhadd_u8(U, prev_u));
|
||||
vst1_u8(v, vrhadd_u8(V, prev_v));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (i < src_width) { // left-over
|
||||
WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
extern void WebPInitConvertARGBToYUVNEON(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
|
||||
WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;
|
||||
WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
|
||||
WebPConvertARGBToY = ConvertARGBToY_NEON;
|
||||
WebPConvertARGBToUV = ConvertARGBToUV_NEON;
|
||||
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
|
||||
static uint16_t clip_y_NEON(int v) {
|
||||
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
|
||||
}
|
||||
|
||||
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
|
||||
uint16_t* dst, int len) {
|
||||
int i;
|
||||
const int16x8_t zero = vdupq_n_s16(0);
|
||||
const int16x8_t max = vdupq_n_s16(MAX_Y);
|
||||
uint64x2_t sum = vdupq_n_u64(0);
|
||||
uint64_t diff;
|
||||
|
||||
for (i = 0; i + 8 <= len; i += 8) {
|
||||
const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
|
||||
const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
|
||||
const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
|
||||
const int16x8_t D = vsubq_s16(A, B); // diff_y
|
||||
const int16x8_t F = vaddq_s16(C, D); // new_y
|
||||
const uint16x8_t H =
|
||||
vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
|
||||
const int16x8_t I = vabsq_s16(D); // abs(diff_y)
|
||||
vst1q_u16(dst + i, H);
|
||||
sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
|
||||
}
|
||||
diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
|
||||
for (; i < len; ++i) {
|
||||
const int diff_y = ref[i] - src[i];
|
||||
const int new_y = (int)(dst[i]) + diff_y;
|
||||
dst[i] = clip_y_NEON(new_y);
|
||||
diff += (uint64_t)(abs(diff_y));
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
|
||||
int16_t* dst, int len) {
|
||||
int i;
|
||||
for (i = 0; i + 8 <= len; i += 8) {
|
||||
const int16x8_t A = vld1q_s16(ref + i);
|
||||
const int16x8_t B = vld1q_s16(src + i);
|
||||
const int16x8_t C = vld1q_s16(dst + i);
|
||||
const int16x8_t D = vsubq_s16(A, B); // diff_uv
|
||||
const int16x8_t E = vaddq_s16(C, D); // new_uv
|
||||
vst1q_s16(dst + i, E);
|
||||
}
|
||||
for (; i < len; ++i) {
|
||||
const int diff_uv = ref[i] - src[i];
|
||||
dst[i] += diff_uv;
|
||||
}
|
||||
}
|
||||
|
||||
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
|
||||
const uint16_t* best_y, uint16_t* out) {
|
||||
int i;
|
||||
const int16x8_t max = vdupq_n_s16(MAX_Y);
|
||||
const int16x8_t zero = vdupq_n_s16(0);
|
||||
for (i = 0; i + 8 <= len; i += 8) {
|
||||
const int16x8_t a0 = vld1q_s16(A + i + 0);
|
||||
const int16x8_t a1 = vld1q_s16(A + i + 1);
|
||||
const int16x8_t b0 = vld1q_s16(B + i + 0);
|
||||
const int16x8_t b1 = vld1q_s16(B + i + 1);
|
||||
const int16x8_t a0b1 = vaddq_s16(a0, b1);
|
||||
const int16x8_t a1b0 = vaddq_s16(a1, b0);
|
||||
const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
|
||||
const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
|
||||
const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
|
||||
const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
|
||||
const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
|
||||
const int16x8_t d0 = vaddq_s16(c1, a0);
|
||||
const int16x8_t d1 = vaddq_s16(c0, a1);
|
||||
const int16x8_t e0 = vrshrq_n_s16(d0, 1);
|
||||
const int16x8_t e1 = vrshrq_n_s16(d1, 1);
|
||||
const int16x8x2_t f = vzipq_s16(e0, e1);
|
||||
const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
|
||||
const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
|
||||
const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
|
||||
const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
|
||||
const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
|
||||
const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
|
||||
vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
|
||||
vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
|
||||
}
|
||||
for (; i < len; ++i) {
|
||||
const int a0b1 = A[i + 0] + B[i + 1];
|
||||
const int a1b0 = A[i + 1] + B[i + 0];
|
||||
const int a0a1b0b1 = a0b1 + a1b0 + 8;
|
||||
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
|
||||
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
|
||||
out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
|
||||
out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
|
||||
}
|
||||
}
|
||||
#undef MAX_Y
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
extern void WebPInitSharpYUVNEON(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
|
||||
WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
|
||||
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
|
||||
WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_NEON
|
||||
|
||||
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
|
||||
WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
|
||||
|
||||
#endif // WEBP_USE_NEON
|
|
@ -11,11 +11,11 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./yuv.h"
|
||||
#include "src/dsp/yuv.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
|
||||
#include "./common_sse2.h"
|
||||
#include "src/dsp/common_sse2.h"
|
||||
#include <stdlib.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
|
@ -26,12 +26,12 @@
|
|||
// R = (19077 * y + 26149 * v - 14234) >> 6
|
||||
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
|
||||
// B = (19077 * y + 33050 * u - 17685) >> 6
|
||||
static void ConvertYUV444ToRGB(const __m128i* const Y0,
|
||||
const __m128i* const U0,
|
||||
const __m128i* const V0,
|
||||
__m128i* const R,
|
||||
__m128i* const G,
|
||||
__m128i* const B) {
|
||||
static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
|
||||
const __m128i* const U0,
|
||||
const __m128i* const V0,
|
||||
__m128i* const R,
|
||||
__m128i* const G,
|
||||
__m128i* const B) {
|
||||
const __m128i k19077 = _mm_set1_epi16(19077);
|
||||
const __m128i k26149 = _mm_set1_epi16(26149);
|
||||
const __m128i k14234 = _mm_set1_epi16(14234);
|
||||
|
@ -66,13 +66,13 @@ static void ConvertYUV444ToRGB(const __m128i* const Y0,
|
|||
}
|
||||
|
||||
// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
|
||||
static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) {
|
||||
static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
|
||||
}
|
||||
|
||||
// Load and replicate the U/V samples
|
||||
static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
|
||||
static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
|
||||
const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
|
||||
|
@ -80,29 +80,33 @@ static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
|
|||
}
|
||||
|
||||
// Convert 32 samples of YUV444 to R/G/B
|
||||
static void YUV444ToRGB(const uint8_t* const y,
|
||||
const uint8_t* const u,
|
||||
const uint8_t* const v,
|
||||
__m128i* const R, __m128i* const G, __m128i* const B) {
|
||||
const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v);
|
||||
ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
|
||||
static void YUV444ToRGB_SSE2(const uint8_t* const y,
|
||||
const uint8_t* const u,
|
||||
const uint8_t* const v,
|
||||
__m128i* const R, __m128i* const G,
|
||||
__m128i* const B) {
|
||||
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
|
||||
V0 = Load_HI_16_SSE2(v);
|
||||
ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
|
||||
}
|
||||
|
||||
// Convert 32 samples of YUV420 to R/G/B
|
||||
static void YUV420ToRGB(const uint8_t* const y,
|
||||
const uint8_t* const u,
|
||||
const uint8_t* const v,
|
||||
__m128i* const R, __m128i* const G, __m128i* const B) {
|
||||
const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v);
|
||||
ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
|
||||
static void YUV420ToRGB_SSE2(const uint8_t* const y,
|
||||
const uint8_t* const u,
|
||||
const uint8_t* const v,
|
||||
__m128i* const R, __m128i* const G,
|
||||
__m128i* const B) {
|
||||
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
|
||||
V0 = Load_UV_HI_8_SSE2(v);
|
||||
ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
|
||||
}
|
||||
|
||||
// Pack R/G/B/A results into 32b output.
|
||||
static WEBP_INLINE void PackAndStore4(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
const __m128i* const A,
|
||||
uint8_t* const dst) {
|
||||
static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
const __m128i* const A,
|
||||
uint8_t* const dst) {
|
||||
const __m128i rb = _mm_packus_epi16(*R, *B);
|
||||
const __m128i ga = _mm_packus_epi16(*G, *A);
|
||||
const __m128i rg = _mm_unpacklo_epi8(rb, ga);
|
||||
|
@ -114,12 +118,12 @@ static WEBP_INLINE void PackAndStore4(const __m128i* const R,
|
|||
}
|
||||
|
||||
// Pack R/G/B/A results into 16b output.
|
||||
static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
const __m128i* const A,
|
||||
uint8_t* const dst) {
|
||||
#if !defined(WEBP_SWAP_16BIT_CSP)
|
||||
static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
const __m128i* const A,
|
||||
uint8_t* const dst) {
|
||||
#if (WEBP_SWAP_16BIT_CSP == 0)
|
||||
const __m128i rg0 = _mm_packus_epi16(*R, *G);
|
||||
const __m128i ba0 = _mm_packus_epi16(*B, *A);
|
||||
#else
|
||||
|
@ -136,10 +140,10 @@ static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
|
|||
}
|
||||
|
||||
// Pack R/G/B results into 16b output.
|
||||
static WEBP_INLINE void PackAndStore565(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
uint8_t* const dst) {
|
||||
static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
uint8_t* const dst) {
|
||||
const __m128i r0 = _mm_packus_epi16(*R, *R);
|
||||
const __m128i g0 = _mm_packus_epi16(*G, *G);
|
||||
const __m128i b0 = _mm_packus_epi16(*B, *B);
|
||||
|
@ -149,7 +153,7 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
|
|||
const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
|
||||
const __m128i rg = _mm_or_si128(r1, g1);
|
||||
const __m128i gb = _mm_or_si128(g2, b1);
|
||||
#if !defined(WEBP_SWAP_16BIT_CSP)
|
||||
#if (WEBP_SWAP_16BIT_CSP == 0)
|
||||
const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
|
||||
#else
|
||||
const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
|
||||
|
@ -160,10 +164,10 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
|
|||
// Pack the planar buffers
|
||||
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
|
||||
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
|
||||
static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
|
||||
__m128i* const in2, __m128i* const in3,
|
||||
__m128i* const in4, __m128i* const in5,
|
||||
uint8_t* const rgb) {
|
||||
static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
|
||||
__m128i* const in2, __m128i* const in3,
|
||||
__m128i* const in4, __m128i* const in5,
|
||||
uint8_t* const rgb) {
|
||||
// The input is 6 registers of sixteen 8b but for the sake of explanation,
|
||||
// let's take 6 registers of four 8b values.
|
||||
// To pack, we will keep taking one every two 8b integer and move it
|
||||
|
@ -186,69 +190,69 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
|
|||
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
|
||||
}
|
||||
|
||||
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 32) {
|
||||
__m128i R, G, B;
|
||||
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
|
||||
PackAndStore4(&R, &G, &B, &kAlpha, dst);
|
||||
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
|
||||
PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 32) {
|
||||
__m128i R, G, B;
|
||||
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
|
||||
PackAndStore4(&B, &G, &R, &kAlpha, dst);
|
||||
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
|
||||
PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 32) {
|
||||
__m128i R, G, B;
|
||||
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
|
||||
PackAndStore4(&kAlpha, &R, &G, &B, dst);
|
||||
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
|
||||
PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
|
||||
const uint8_t* v, uint8_t* dst) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 16) {
|
||||
__m128i R, G, B;
|
||||
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
|
||||
PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst);
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 16) {
|
||||
__m128i R, G, B;
|
||||
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
|
||||
PackAndStore565_SSE2(&R, &G, &B, dst);
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 16) {
|
||||
__m128i R, G, B;
|
||||
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
|
||||
PackAndStore4444(&R, &G, &B, &kAlpha, dst);
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
int n;
|
||||
for (n = 0; n < 32; n += 8, dst += 16) {
|
||||
__m128i R, G, B;
|
||||
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
|
||||
PackAndStore565(&R, &G, &B, dst);
|
||||
}
|
||||
}
|
||||
|
||||
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
|
||||
|
||||
YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
||||
YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
|
||||
YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
|
||||
YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
|
||||
YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
||||
YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
|
||||
YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
|
||||
YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
|
||||
|
||||
// Cast to 8b and store as RRRRGGGGBBBB.
|
||||
rgb0 = _mm_packus_epi16(R0, R1);
|
||||
|
@ -259,18 +263,18 @@ void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
|||
rgb5 = _mm_packus_epi16(B2, B3);
|
||||
|
||||
// Pack as RGBRGBRGBRGB.
|
||||
PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
|
||||
PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
|
||||
}
|
||||
|
||||
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
|
||||
|
||||
YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
||||
YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
|
||||
YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
|
||||
YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
|
||||
YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
||||
YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
|
||||
YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
|
||||
YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
|
||||
|
||||
// Cast to 8b and store as BBBBGGGGRRRR.
|
||||
bgr0 = _mm_packus_epi16(B0, B1);
|
||||
|
@ -281,20 +285,21 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
|||
bgr5= _mm_packus_epi16(R2, R3);
|
||||
|
||||
// Pack as BGRBGRBGRBGR.
|
||||
PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
|
||||
PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Arbitrary-length row conversion functions
|
||||
|
||||
static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToRgbaRow_SSE2(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
|
||||
__m128i R, G, B;
|
||||
YUV420ToRGB(y, u, v, &R, &G, &B);
|
||||
PackAndStore4(&R, &G, &B, &kAlpha, dst);
|
||||
YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
|
||||
PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
|
||||
y += 8;
|
||||
u += 4;
|
||||
v += 4;
|
||||
|
@ -308,14 +313,15 @@ static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
|||
}
|
||||
}
|
||||
|
||||
static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToBgraRow_SSE2(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
|
||||
__m128i R, G, B;
|
||||
YUV420ToRGB(y, u, v, &R, &G, &B);
|
||||
PackAndStore4(&B, &G, &R, &kAlpha, dst);
|
||||
YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
|
||||
PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
|
||||
y += 8;
|
||||
u += 4;
|
||||
v += 4;
|
||||
|
@ -329,14 +335,15 @@ static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
|||
}
|
||||
}
|
||||
|
||||
static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToArgbRow_SSE2(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
const __m128i kAlpha = _mm_set1_epi16(255);
|
||||
int n;
|
||||
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
|
||||
__m128i R, G, B;
|
||||
YUV420ToRGB(y, u, v, &R, &G, &B);
|
||||
PackAndStore4(&kAlpha, &R, &G, &B, dst);
|
||||
YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
|
||||
PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
|
||||
y += 8;
|
||||
u += 4;
|
||||
v += 4;
|
||||
|
@ -350,17 +357,18 @@ static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
|||
}
|
||||
}
|
||||
|
||||
static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToRgbRow_SSE2(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
int n;
|
||||
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
|
||||
|
||||
YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
||||
YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
|
||||
YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);
|
||||
YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
|
||||
YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
||||
YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
|
||||
YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
|
||||
YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
|
||||
|
||||
// Cast to 8b and store as RRRRGGGGBBBB.
|
||||
rgb0 = _mm_packus_epi16(R0, R1);
|
||||
|
@ -371,7 +379,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
|||
rgb5 = _mm_packus_epi16(B2, B3);
|
||||
|
||||
// Pack as RGBRGBRGBRGB.
|
||||
PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
|
||||
PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
|
||||
|
||||
y += 32;
|
||||
u += 16;
|
||||
|
@ -386,17 +394,18 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
|||
}
|
||||
}
|
||||
|
||||
static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
static void YuvToBgrRow_SSE2(const uint8_t* y,
|
||||
const uint8_t* u, const uint8_t* v,
|
||||
uint8_t* dst, int len) {
|
||||
int n;
|
||||
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
|
||||
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
|
||||
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
|
||||
|
||||
YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
||||
YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
|
||||
YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);
|
||||
YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
|
||||
YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
|
||||
YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
|
||||
YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
|
||||
YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
|
||||
|
||||
// Cast to 8b and store as BBBBGGGGRRRR.
|
||||
bgr0 = _mm_packus_epi16(B0, B1);
|
||||
|
@ -407,7 +416,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
|||
bgr5 = _mm_packus_epi16(R2, R3);
|
||||
|
||||
// Pack as BGRBGRBGRBGR.
|
||||
PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
|
||||
PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
|
||||
|
||||
y += 32;
|
||||
u += 16;
|
||||
|
@ -428,11 +437,11 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
|
|||
extern void WebPInitSamplersSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
|
||||
WebPSamplers[MODE_RGB] = YuvToRgbRow;
|
||||
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
|
||||
WebPSamplers[MODE_BGR] = YuvToBgrRow;
|
||||
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
|
||||
WebPSamplers[MODE_ARGB] = YuvToArgbRow;
|
||||
WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE2;
|
||||
WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
|
||||
WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE2;
|
||||
WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
|
||||
WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
@ -445,7 +454,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
|
|||
|
||||
// Function that inserts a value of the second half of the in buffer in between
|
||||
// every two char of the first half.
|
||||
static WEBP_INLINE void RGB24PackedToPlanarHelper(
|
||||
static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
|
||||
const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
|
||||
out[0] = _mm_unpacklo_epi8(in[0], in[3]);
|
||||
out[1] = _mm_unpackhi_epi8(in[0], in[3]);
|
||||
|
@ -458,8 +467,8 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper(
|
|||
// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
|
||||
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
|
||||
// Similar to PlanarTo24bHelper(), but in reverse order.
|
||||
static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
|
||||
__m128i* const out /*out[6]*/) {
|
||||
static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
|
||||
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
|
||||
__m128i tmp[6];
|
||||
tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
|
||||
tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
|
||||
|
@ -468,16 +477,16 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
|
|||
tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
|
||||
tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
|
||||
|
||||
RGB24PackedToPlanarHelper(tmp, out);
|
||||
RGB24PackedToPlanarHelper(out, tmp);
|
||||
RGB24PackedToPlanarHelper(tmp, out);
|
||||
RGB24PackedToPlanarHelper(out, tmp);
|
||||
RGB24PackedToPlanarHelper(tmp, out);
|
||||
RGB24PackedToPlanarHelper_SSE2(tmp, out);
|
||||
RGB24PackedToPlanarHelper_SSE2(out, tmp);
|
||||
RGB24PackedToPlanarHelper_SSE2(tmp, out);
|
||||
RGB24PackedToPlanarHelper_SSE2(out, tmp);
|
||||
RGB24PackedToPlanarHelper_SSE2(tmp, out);
|
||||
}
|
||||
|
||||
// Convert 8 packed ARGB to r[], g[], b[]
|
||||
static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
|
||||
__m128i* const rgb /*in[6]*/) {
|
||||
static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
|
||||
__m128i* const rgb /*in[6]*/) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128i a0 = LOAD_16(argb + 0);
|
||||
__m128i a1 = LOAD_16(argb + 4);
|
||||
|
@ -511,10 +520,10 @@ static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
|
|||
} while (0)
|
||||
|
||||
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
|
||||
static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
__m128i* const Y) {
|
||||
static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
__m128i* const Y) {
|
||||
const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
|
||||
const __m128i kGB_y = MK_CST_16(16384, 6420);
|
||||
const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
|
||||
|
@ -526,10 +535,11 @@ static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
|
|||
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
__m128i* const U, __m128i* const V) {
|
||||
static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
|
||||
const __m128i* const G,
|
||||
const __m128i* const B,
|
||||
__m128i* const U,
|
||||
__m128i* const V) {
|
||||
const __m128i kRG_u = MK_CST_16(-9719, -19081);
|
||||
const __m128i kGB_u = MK_CST_16(0, 28800);
|
||||
const __m128i kRG_v = MK_CST_16(28800, 0);
|
||||
|
@ -549,14 +559,14 @@ static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
|
|||
#undef MK_CST_16
|
||||
#undef TRANSFORM
|
||||
|
||||
static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
|
||||
const int max_width = width & ~31;
|
||||
int i;
|
||||
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
|
||||
__m128i rgb_plane[6];
|
||||
int j;
|
||||
|
||||
RGB24PackedToPlanar(rgb, rgb_plane);
|
||||
RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
|
||||
|
||||
for (j = 0; j < 2; ++j, i += 16) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
@ -566,13 +576,13 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
|
|||
r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
|
||||
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
|
||||
b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
|
||||
ConvertRGBToY(&r, &g, &b, &Y0);
|
||||
ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
|
||||
|
||||
// Convert to 16-bit Y.
|
||||
r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
|
||||
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
|
||||
b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
|
||||
ConvertRGBToY(&r, &g, &b, &Y1);
|
||||
ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
|
||||
|
||||
// Cast to 8-bit and store.
|
||||
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
|
||||
|
@ -583,14 +593,14 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
|
|||
}
|
||||
}
|
||||
|
||||
static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
|
||||
const int max_width = width & ~31;
|
||||
int i;
|
||||
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
|
||||
__m128i bgr_plane[6];
|
||||
int j;
|
||||
|
||||
RGB24PackedToPlanar(bgr, bgr_plane);
|
||||
RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
|
||||
|
||||
for (j = 0; j < 2; ++j, i += 16) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
@ -600,13 +610,13 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
|
|||
b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
|
||||
g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
|
||||
r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
|
||||
ConvertRGBToY(&r, &g, &b, &Y0);
|
||||
ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
|
||||
|
||||
// Convert to 16-bit Y.
|
||||
b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
|
||||
g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
|
||||
r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
|
||||
ConvertRGBToY(&r, &g, &b, &Y1);
|
||||
ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
|
||||
|
||||
// Cast to 8-bit and store.
|
||||
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
|
||||
|
@ -617,14 +627,14 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
|
|||
}
|
||||
}
|
||||
|
||||
static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
|
||||
static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
|
||||
const int max_width = width & ~15;
|
||||
int i;
|
||||
for (i = 0; i < max_width; i += 16) {
|
||||
__m128i Y0, Y1, rgb[6];
|
||||
RGB32PackedToPlanar(&argb[i], rgb);
|
||||
ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);
|
||||
ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);
|
||||
RGB32PackedToPlanar_SSE2(&argb[i], rgb);
|
||||
ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
|
||||
ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
|
||||
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
|
||||
}
|
||||
for (; i < width; ++i) { // left-over
|
||||
|
@ -636,31 +646,33 @@ static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
|
|||
|
||||
// Horizontal add (doubled) of two 16b values, result is 16b.
|
||||
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
|
||||
static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,
|
||||
__m128i* const out) {
|
||||
static void HorizontalAddPack_SSE2(const __m128i* const A,
|
||||
const __m128i* const B,
|
||||
__m128i* const out) {
|
||||
const __m128i k2 = _mm_set1_epi16(2);
|
||||
const __m128i C = _mm_madd_epi16(*A, k2);
|
||||
const __m128i D = _mm_madd_epi16(*B, k2);
|
||||
*out = _mm_packs_epi32(C, D);
|
||||
}
|
||||
|
||||
static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
||||
int src_width, int do_store) {
|
||||
static void ConvertARGBToUV_SSE2(const uint32_t* argb,
|
||||
uint8_t* u, uint8_t* v,
|
||||
int src_width, int do_store) {
|
||||
const int max_width = src_width & ~31;
|
||||
int i;
|
||||
for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
|
||||
__m128i rgb[6], U0, V0, U1, V1;
|
||||
RGB32PackedToPlanar(&argb[i], rgb);
|
||||
HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
|
||||
HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
|
||||
HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
|
||||
ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
|
||||
RGB32PackedToPlanar_SSE2(&argb[i], rgb);
|
||||
HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
|
||||
HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
|
||||
HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
|
||||
ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
|
||||
|
||||
RGB32PackedToPlanar(&argb[i + 16], rgb);
|
||||
HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
|
||||
HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
|
||||
HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
|
||||
ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
|
||||
RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
|
||||
HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
|
||||
HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
|
||||
HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
|
||||
ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
|
||||
|
||||
U0 = _mm_packus_epi16(U0, U1);
|
||||
V0 = _mm_packus_epi16(V0, V1);
|
||||
|
@ -679,10 +691,9 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
|
|||
}
|
||||
|
||||
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
|
||||
static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
|
||||
__m128i* const r,
|
||||
__m128i* const g,
|
||||
__m128i* const b) {
|
||||
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
|
||||
const uint16_t* const rgbx,
|
||||
__m128i* const r, __m128i* const g, __m128i* const b) {
|
||||
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
|
||||
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
|
||||
const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
|
||||
|
@ -701,16 +712,16 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
|
|||
*b = _mm_unpacklo_epi64(B1, B3);
|
||||
}
|
||||
|
||||
static void ConvertRGBA32ToUV(const uint16_t* rgb,
|
||||
uint8_t* u, uint8_t* v, int width) {
|
||||
static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
|
||||
uint8_t* u, uint8_t* v, int width) {
|
||||
const int max_width = width & ~15;
|
||||
const uint16_t* const last_rgb = rgb + 4 * max_width;
|
||||
while (rgb < last_rgb) {
|
||||
__m128i r, g, b, U0, V0, U1, V1;
|
||||
RGBA32PackedToPlanar_16b(rgb + 0, &r, &g, &b);
|
||||
ConvertRGBToUV(&r, &g, &b, &U0, &V0);
|
||||
RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b);
|
||||
ConvertRGBToUV(&r, &g, &b, &U1, &V1);
|
||||
RGBA32PackedToPlanar_16b_SSE2(rgb + 0, &r, &g, &b);
|
||||
ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
|
||||
RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
|
||||
ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
|
||||
STORE_16(_mm_packus_epi16(U0, U1), u);
|
||||
STORE_16(_mm_packus_epi16(V0, V1), v);
|
||||
u += 16;
|
||||
|
@ -727,13 +738,13 @@ static void ConvertRGBA32ToUV(const uint16_t* rgb,
|
|||
extern void WebPInitConvertARGBToYUVSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
|
||||
WebPConvertARGBToY = ConvertARGBToY;
|
||||
WebPConvertARGBToUV = ConvertARGBToUV;
|
||||
WebPConvertARGBToY = ConvertARGBToY_SSE2;
|
||||
WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
|
||||
|
||||
WebPConvertRGB24ToY = ConvertRGB24ToY;
|
||||
WebPConvertBGR24ToY = ConvertBGR24ToY;
|
||||
WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
|
||||
WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
|
||||
|
||||
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
|
||||
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
|
@ -14,12 +14,12 @@
|
|||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "./vp8i_enc.h"
|
||||
#include "../dsp/dsp.h"
|
||||
#include "../utils/filters_utils.h"
|
||||
#include "../utils/quant_levels_utils.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "../webp/format_constants.h"
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/utils/filters_utils.h"
|
||||
#include "src/utils/quant_levels_utils.h"
|
||||
#include "src/utils/utils.h"
|
||||
#include "src/webp/format_constants.h"
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Encodes the given alpha data via specified compression method 'method'.
|
||||
|
@ -44,11 +44,11 @@
|
|||
// invalid quality or method, or
|
||||
// memory allocation for the compressed data fails.
|
||||
|
||||
#include "../enc/vp8li_enc.h"
|
||||
#include "src/enc/vp8li_enc.h"
|
||||
|
||||
static int EncodeLossless(const uint8_t* const data, int width, int height,
|
||||
int effort_level, // in [0..6] range
|
||||
VP8LBitWriter* const bw,
|
||||
int use_quality_100, VP8LBitWriter* const bw,
|
||||
WebPAuxStats* const stats) {
|
||||
int ok = 0;
|
||||
WebPConfig config;
|
||||
|
@ -76,7 +76,10 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
|
|||
// Set a low default quality for encoding alpha. Ensure that Alpha quality at
|
||||
// lower methods (3 and below) is less than the threshold for triggering
|
||||
// costly 'BackwardReferencesTraceBackwards'.
|
||||
config.quality = 8.f * effort_level;
|
||||
// If the alpha quality is set to 100 and the method to 6, allow for a high
|
||||
// lossless quality to trigger the cruncher.
|
||||
config.quality =
|
||||
(use_quality_100 && effort_level == 6) ? 100 : 8.f * effort_level;
|
||||
assert(config.quality >= 0 && config.quality <= 100.f);
|
||||
|
||||
// TODO(urvang): Temporary fix to avoid generating images that trigger
|
||||
|
@ -134,7 +137,7 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
|
|||
if (method != ALPHA_NO_COMPRESSION) {
|
||||
ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
|
||||
ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
|
||||
&tmp_bw, &result->stats);
|
||||
!reduce_levels, &tmp_bw, &result->stats);
|
||||
if (ok) {
|
||||
output = VP8LBitWriterFinish(&tmp_bw);
|
||||
output_size = VP8LBitWriterNumBytes(&tmp_bw);
|
||||
|
@ -264,6 +267,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
|
|||
reduce_levels, effort_level, NULL, &best);
|
||||
}
|
||||
if (ok) {
|
||||
#if !defined(WEBP_DISABLE_STATS)
|
||||
if (stats != NULL) {
|
||||
stats->lossless_features = best.stats.lossless_features;
|
||||
stats->histogram_bits = best.stats.histogram_bits;
|
||||
|
@ -274,6 +278,9 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
|
|||
stats->lossless_hdr_size = best.stats.lossless_hdr_size;
|
||||
stats->lossless_data_size = best.stats.lossless_data_size;
|
||||
}
|
||||
#else
|
||||
(void)stats;
|
||||
#endif
|
||||
*output_size = VP8BitWriterSize(&best.bw);
|
||||
*output = VP8BitWriterBuf(&best.bw);
|
||||
} else {
|
||||
|
@ -339,10 +346,12 @@ static int EncodeAlpha(VP8Encoder* const enc,
|
|||
ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
|
||||
filter, reduce_levels, effort_level, output,
|
||||
output_size, pic->stats);
|
||||
#if !defined(WEBP_DISABLE_STATS)
|
||||
if (pic->stats != NULL) { // need stats?
|
||||
pic->stats->coded_size += (int)(*output_size);
|
||||
enc->sse_[3] = sse;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
WebPSafeFree(quant_alpha);
|
|
@ -15,9 +15,9 @@
|
|||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "./vp8i_enc.h"
|
||||
#include "./cost_enc.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "src/enc/vp8i_enc.h"
|
||||
#include "src/enc/cost_enc.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
#define MAX_ITERS_K_MEANS 6
|
||||
|
|
@ -0,0 +1,790 @@
|
|||
// Copyright 2017 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// Improves a given set of backward references by analyzing its bit cost.
|
||||
// The algorithm is similar to the Zopfli compression algorithm but tailored to
|
||||
// images.
|
||||
//
|
||||
// Author: Vincent Rabaud (vrabaud@google.com)
|
||||
//
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "src/enc/backward_references_enc.h"
|
||||
#include "src/enc/histogram_enc.h"
|
||||
#include "src/dsp/lossless_common.h"
|
||||
#include "src/utils/color_cache_utils.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
#define VALUES_IN_BYTE 256
|
||||
|
||||
extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
|
||||
extern int VP8LDistanceToPlaneCode(int xsize, int dist);
|
||||
extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
|
||||
const PixOrCopy v);
|
||||
|
||||
typedef struct {
|
||||
double alpha_[VALUES_IN_BYTE];
|
||||
double red_[VALUES_IN_BYTE];
|
||||
double blue_[VALUES_IN_BYTE];
|
||||
double distance_[NUM_DISTANCE_CODES];
|
||||
double* literal_;
|
||||
} CostModel;
|
||||
|
||||
static void ConvertPopulationCountTableToBitEstimates(
|
||||
int num_symbols, const uint32_t population_counts[], double output[]) {
|
||||
uint32_t sum = 0;
|
||||
int nonzeros = 0;
|
||||
int i;
|
||||
for (i = 0; i < num_symbols; ++i) {
|
||||
sum += population_counts[i];
|
||||
if (population_counts[i] > 0) {
|
||||
++nonzeros;
|
||||
}
|
||||
}
|
||||
if (nonzeros <= 1) {
|
||||
memset(output, 0, num_symbols * sizeof(*output));
|
||||
} else {
|
||||
const double logsum = VP8LFastLog2(sum);
|
||||
for (i = 0; i < num_symbols; ++i) {
|
||||
output[i] = logsum - VP8LFastLog2(population_counts[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
|
||||
const VP8LBackwardRefs* const refs) {
|
||||
int ok = 0;
|
||||
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
|
||||
VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
|
||||
if (histo == NULL) goto Error;
|
||||
|
||||
// The following code is similar to VP8LHistogramCreate but converts the
|
||||
// distance to plane code.
|
||||
VP8LHistogramInit(histo, cache_bits);
|
||||
while (VP8LRefsCursorOk(&c)) {
|
||||
VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode,
|
||||
xsize);
|
||||
VP8LRefsCursorNext(&c);
|
||||
}
|
||||
|
||||
ConvertPopulationCountTableToBitEstimates(
|
||||
VP8LHistogramNumCodes(histo->palette_code_bits_),
|
||||
histo->literal_, m->literal_);
|
||||
ConvertPopulationCountTableToBitEstimates(
|
||||
VALUES_IN_BYTE, histo->red_, m->red_);
|
||||
ConvertPopulationCountTableToBitEstimates(
|
||||
VALUES_IN_BYTE, histo->blue_, m->blue_);
|
||||
ConvertPopulationCountTableToBitEstimates(
|
||||
VALUES_IN_BYTE, histo->alpha_, m->alpha_);
|
||||
ConvertPopulationCountTableToBitEstimates(
|
||||
NUM_DISTANCE_CODES, histo->distance_, m->distance_);
|
||||
ok = 1;
|
||||
|
||||
Error:
|
||||
VP8LFreeHistogram(histo);
|
||||
return ok;
|
||||
}
|
||||
|
||||
static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
|
||||
return m->alpha_[v >> 24] +
|
||||
m->red_[(v >> 16) & 0xff] +
|
||||
m->literal_[(v >> 8) & 0xff] +
|
||||
m->blue_[v & 0xff];
|
||||
}
|
||||
|
||||
static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
|
||||
const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
|
||||
return m->literal_[literal_idx];
|
||||
}
|
||||
|
||||
static WEBP_INLINE double GetLengthCost(const CostModel* const m,
|
||||
uint32_t length) {
|
||||
int code, extra_bits;
|
||||
VP8LPrefixEncodeBits(length, &code, &extra_bits);
|
||||
return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
|
||||
}
|
||||
|
||||
static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
|
||||
uint32_t distance) {
|
||||
int code, extra_bits;
|
||||
VP8LPrefixEncodeBits(distance, &code, &extra_bits);
|
||||
return m->distance_[code] + extra_bits;
|
||||
}
|
||||
|
||||
static WEBP_INLINE void AddSingleLiteralWithCostModel(
|
||||
const uint32_t* const argb, VP8LColorCache* const hashers,
|
||||
const CostModel* const cost_model, int idx, int use_color_cache,
|
||||
float prev_cost, float* const cost, uint16_t* const dist_array) {
|
||||
double cost_val = prev_cost;
|
||||
const uint32_t color = argb[idx];
|
||||
const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
|
||||
if (ix >= 0) {
|
||||
// use_color_cache is true and hashers contains color
|
||||
const double mul0 = 0.68;
|
||||
cost_val += GetCacheCost(cost_model, ix) * mul0;
|
||||
} else {
|
||||
const double mul1 = 0.82;
|
||||
if (use_color_cache) VP8LColorCacheInsert(hashers, color);
|
||||
cost_val += GetLiteralCost(cost_model, color) * mul1;
|
||||
}
|
||||
if (cost[idx] > cost_val) {
|
||||
cost[idx] = (float)cost_val;
|
||||
dist_array[idx] = 1; // only one is inserted.
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// CostManager and interval handling
|
||||
|
||||
// Empirical value to avoid high memory consumption but good for performance.
|
||||
#define COST_CACHE_INTERVAL_SIZE_MAX 500
|
||||
|
||||
// To perform backward reference every pixel at index index_ is considered and
|
||||
// the cost for the MAX_LENGTH following pixels computed. Those following pixels
|
||||
// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of:
|
||||
// cost_ = distance cost at index + GetLengthCost(cost_model, k)
|
||||
// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
|
||||
// array of size MAX_LENGTH.
|
||||
// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
|
||||
// minimal values using intervals of constant cost.
|
||||
// An interval is defined by the index_ of the pixel that generated it and
|
||||
// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
|
||||
// it contains the minimum value for pixels between start_ and end_.
|
||||
// Intervals are stored in a linked list and ordered by start_. When a new
|
||||
// interval has a better value, old intervals are split or removed. There are
|
||||
// therefore no overlapping intervals.
|
||||
typedef struct CostInterval CostInterval;
|
||||
struct CostInterval {
|
||||
float cost_;
|
||||
int start_;
|
||||
int end_;
|
||||
int index_;
|
||||
CostInterval* previous_;
|
||||
CostInterval* next_;
|
||||
};
|
||||
|
||||
// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
|
||||
typedef struct {
|
||||
double cost_;
|
||||
int start_;
|
||||
int end_; // Exclusive.
|
||||
} CostCacheInterval;
|
||||
|
||||
// This structure is in charge of managing intervals and costs.
|
||||
// It caches the different CostCacheInterval, caches the different
|
||||
// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose
|
||||
// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX).
|
||||
#define COST_MANAGER_MAX_FREE_LIST 10
|
||||
typedef struct {
|
||||
CostInterval* head_;
|
||||
int count_; // The number of stored intervals.
|
||||
CostCacheInterval* cache_intervals_;
|
||||
size_t cache_intervals_size_;
|
||||
double cost_cache_[MAX_LENGTH]; // Contains the GetLengthCost(cost_model, k).
|
||||
float* costs_;
|
||||
uint16_t* dist_array_;
|
||||
// Most of the time, we only need few intervals -> use a free-list, to avoid
|
||||
// fragmentation with small allocs in most common cases.
|
||||
CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
|
||||
CostInterval* free_intervals_;
|
||||
// These are regularly malloc'd remains. This list can't grow larger than than
|
||||
// size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note.
|
||||
CostInterval* recycled_intervals_;
|
||||
} CostManager;
|
||||
|
||||
static void CostIntervalAddToFreeList(CostManager* const manager,
|
||||
CostInterval* const interval) {
|
||||
interval->next_ = manager->free_intervals_;
|
||||
manager->free_intervals_ = interval;
|
||||
}
|
||||
|
||||
static int CostIntervalIsInFreeList(const CostManager* const manager,
|
||||
const CostInterval* const interval) {
|
||||
return (interval >= &manager->intervals_[0] &&
|
||||
interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
|
||||
}
|
||||
|
||||
static void CostManagerInitFreeList(CostManager* const manager) {
|
||||
int i;
|
||||
manager->free_intervals_ = NULL;
|
||||
for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
|
||||
CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void DeleteIntervalList(CostManager* const manager,
|
||||
const CostInterval* interval) {
|
||||
while (interval != NULL) {
|
||||
const CostInterval* const next = interval->next_;
|
||||
if (!CostIntervalIsInFreeList(manager, interval)) {
|
||||
WebPSafeFree((void*)interval);
|
||||
} // else: do nothing
|
||||
interval = next;
|
||||
}
|
||||
}
|
||||
|
||||
static void CostManagerClear(CostManager* const manager) {
|
||||
if (manager == NULL) return;
|
||||
|
||||
WebPSafeFree(manager->costs_);
|
||||
WebPSafeFree(manager->cache_intervals_);
|
||||
|
||||
// Clear the interval lists.
|
||||
DeleteIntervalList(manager, manager->head_);
|
||||
manager->head_ = NULL;
|
||||
DeleteIntervalList(manager, manager->recycled_intervals_);
|
||||
manager->recycled_intervals_ = NULL;
|
||||
|
||||
// Reset pointers, count_ and cache_intervals_size_.
|
||||
memset(manager, 0, sizeof(*manager));
|
||||
CostManagerInitFreeList(manager);
|
||||
}
|
||||
|
||||
static int CostManagerInit(CostManager* const manager,
|
||||
uint16_t* const dist_array, int pix_count,
|
||||
const CostModel* const cost_model) {
|
||||
int i;
|
||||
const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
|
||||
|
||||
manager->costs_ = NULL;
|
||||
manager->cache_intervals_ = NULL;
|
||||
manager->head_ = NULL;
|
||||
manager->recycled_intervals_ = NULL;
|
||||
manager->count_ = 0;
|
||||
manager->dist_array_ = dist_array;
|
||||
CostManagerInitFreeList(manager);
|
||||
|
||||
// Fill in the cost_cache_.
|
||||
manager->cache_intervals_size_ = 1;
|
||||
manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
|
||||
for (i = 1; i < cost_cache_size; ++i) {
|
||||
manager->cost_cache_[i] = GetLengthCost(cost_model, i);
|
||||
// Get the number of bound intervals.
|
||||
if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
|
||||
++manager->cache_intervals_size_;
|
||||
}
|
||||
}
|
||||
|
||||
// With the current cost model, we usually have below 20 intervals.
|
||||
// The worst case scenario with a cost model would be if every length has a
|
||||
// different cost, hence MAX_LENGTH but that is impossible with the current
|
||||
// implementation that spirals around a pixel.
|
||||
assert(manager->cache_intervals_size_ <= MAX_LENGTH);
|
||||
manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
|
||||
manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
|
||||
if (manager->cache_intervals_ == NULL) {
|
||||
CostManagerClear(manager);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Fill in the cache_intervals_.
|
||||
{
|
||||
CostCacheInterval* cur = manager->cache_intervals_;
|
||||
|
||||
// Consecutive values in cost_cache_ are compared and if a big enough
|
||||
// difference is found, a new interval is created and bounded.
|
||||
cur->start_ = 0;
|
||||
cur->end_ = 1;
|
||||
cur->cost_ = manager->cost_cache_[0];
|
||||
for (i = 1; i < cost_cache_size; ++i) {
|
||||
const double cost_val = manager->cost_cache_[i];
|
||||
if (cost_val != cur->cost_) {
|
||||
++cur;
|
||||
// Initialize an interval.
|
||||
cur->start_ = i;
|
||||
cur->cost_ = cost_val;
|
||||
}
|
||||
cur->end_ = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
|
||||
if (manager->costs_ == NULL) {
|
||||
CostManagerClear(manager);
|
||||
return 0;
|
||||
}
|
||||
// Set the initial costs_ high for every pixel as we will keep the minimum.
|
||||
for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Given the cost and the position that define an interval, update the cost at
|
||||
// pixel 'i' if it is smaller than the previously computed value.
|
||||
static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
|
||||
int position, float cost) {
|
||||
const int k = i - position;
|
||||
assert(k >= 0 && k < MAX_LENGTH);
|
||||
|
||||
if (manager->costs_[i] > cost) {
|
||||
manager->costs_[i] = cost;
|
||||
manager->dist_array_[i] = k + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Given the cost and the position that define an interval, update the cost for
|
||||
// all the pixels between 'start' and 'end' excluded.
|
||||
static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
|
||||
int start, int end, int position,
|
||||
float cost) {
|
||||
int i;
|
||||
for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
|
||||
}
|
||||
|
||||
// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'.
|
||||
static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
|
||||
CostInterval* const prev,
|
||||
CostInterval* const next) {
|
||||
if (prev != NULL) {
|
||||
prev->next_ = next;
|
||||
} else {
|
||||
manager->head_ = next;
|
||||
}
|
||||
|
||||
if (next != NULL) next->previous_ = prev;
|
||||
}
|
||||
|
||||
// Pop an interval in the manager.
|
||||
static WEBP_INLINE void PopInterval(CostManager* const manager,
|
||||
CostInterval* const interval) {
|
||||
if (interval == NULL) return;
|
||||
|
||||
ConnectIntervals(manager, interval->previous_, interval->next_);
|
||||
if (CostIntervalIsInFreeList(manager, interval)) {
|
||||
CostIntervalAddToFreeList(manager, interval);
|
||||
} else { // recycle regularly malloc'd intervals too
|
||||
interval->next_ = manager->recycled_intervals_;
|
||||
manager->recycled_intervals_ = interval;
|
||||
}
|
||||
--manager->count_;
|
||||
assert(manager->count_ >= 0);
|
||||
}
|
||||
|
||||
// Update the cost at index i by going over all the stored intervals that
|
||||
// overlap with i.
|
||||
// If 'do_clean_intervals' is set to something different than 0, intervals that
|
||||
// end before 'i' will be popped.
|
||||
static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i,
|
||||
int do_clean_intervals) {
|
||||
CostInterval* current = manager->head_;
|
||||
|
||||
while (current != NULL && current->start_ <= i) {
|
||||
CostInterval* const next = current->next_;
|
||||
if (current->end_ <= i) {
|
||||
if (do_clean_intervals) {
|
||||
// We have an outdated interval, remove it.
|
||||
PopInterval(manager, current);
|
||||
}
|
||||
} else {
|
||||
UpdateCost(manager, i, current->index_, current->cost_);
|
||||
}
|
||||
current = next;
|
||||
}
|
||||
}
|
||||
|
||||
// Given a current orphan interval and its previous interval, before
|
||||
// it was orphaned (which can be NULL), set it at the right place in the list
|
||||
// of intervals using the start_ ordering and the previous interval as a hint.
|
||||
static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
|
||||
CostInterval* const current,
|
||||
CostInterval* previous) {
|
||||
assert(current != NULL);
|
||||
|
||||
if (previous == NULL) previous = manager->head_;
|
||||
while (previous != NULL && current->start_ < previous->start_) {
|
||||
previous = previous->previous_;
|
||||
}
|
||||
while (previous != NULL && previous->next_ != NULL &&
|
||||
previous->next_->start_ < current->start_) {
|
||||
previous = previous->next_;
|
||||
}
|
||||
|
||||
if (previous != NULL) {
|
||||
ConnectIntervals(manager, current, previous->next_);
|
||||
} else {
|
||||
ConnectIntervals(manager, current, manager->head_);
|
||||
}
|
||||
ConnectIntervals(manager, previous, current);
|
||||
}
|
||||
|
||||
// Insert an interval in the list contained in the manager by starting at
|
||||
// interval_in as a hint. The intervals are sorted by start_ value.
|
||||
static WEBP_INLINE void InsertInterval(CostManager* const manager,
|
||||
CostInterval* const interval_in,
|
||||
float cost, int position, int start,
|
||||
int end) {
|
||||
CostInterval* interval_new;
|
||||
|
||||
if (start >= end) return;
|
||||
if (manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
|
||||
// Serialize the interval if we cannot store it.
|
||||
UpdateCostPerInterval(manager, start, end, position, cost);
|
||||
return;
|
||||
}
|
||||
if (manager->free_intervals_ != NULL) {
|
||||
interval_new = manager->free_intervals_;
|
||||
manager->free_intervals_ = interval_new->next_;
|
||||
} else if (manager->recycled_intervals_ != NULL) {
|
||||
interval_new = manager->recycled_intervals_;
|
||||
manager->recycled_intervals_ = interval_new->next_;
|
||||
} else { // malloc for good
|
||||
interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
|
||||
if (interval_new == NULL) {
|
||||
// Write down the interval if we cannot create it.
|
||||
UpdateCostPerInterval(manager, start, end, position, cost);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
interval_new->cost_ = cost;
|
||||
interval_new->index_ = position;
|
||||
interval_new->start_ = start;
|
||||
interval_new->end_ = end;
|
||||
PositionOrphanInterval(manager, interval_new, interval_in);
|
||||
|
||||
++manager->count_;
|
||||
}
|
||||
|
||||
// Given a new cost interval defined by its start at position, its length value
|
||||
// and distance_cost, add its contributions to the previous intervals and costs.
|
||||
// If handling the interval or one of its subintervals becomes to heavy, its
|
||||
// contribution is added to the costs right away.
|
||||
static WEBP_INLINE void PushInterval(CostManager* const manager,
|
||||
double distance_cost, int position,
|
||||
int len) {
|
||||
size_t i;
|
||||
CostInterval* interval = manager->head_;
|
||||
CostInterval* interval_next;
|
||||
const CostCacheInterval* const cost_cache_intervals =
|
||||
manager->cache_intervals_;
|
||||
// If the interval is small enough, no need to deal with the heavy
|
||||
// interval logic, just serialize it right away. This constant is empirical.
|
||||
const int kSkipDistance = 10;
|
||||
|
||||
if (len < kSkipDistance) {
|
||||
int j;
|
||||
for (j = position; j < position + len; ++j) {
|
||||
const int k = j - position;
|
||||
float cost_tmp;
|
||||
assert(k >= 0 && k < MAX_LENGTH);
|
||||
cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
|
||||
|
||||
if (manager->costs_[j] > cost_tmp) {
|
||||
manager->costs_[j] = cost_tmp;
|
||||
manager->dist_array_[j] = k + 1;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 0; i < manager->cache_intervals_size_ &&
|
||||
cost_cache_intervals[i].start_ < len;
|
||||
++i) {
|
||||
// Define the intersection of the ith interval with the new one.
|
||||
int start = position + cost_cache_intervals[i].start_;
|
||||
const int end = position + (cost_cache_intervals[i].end_ > len
|
||||
? len
|
||||
: cost_cache_intervals[i].end_);
|
||||
const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
|
||||
|
||||
for (; interval != NULL && interval->start_ < end;
|
||||
interval = interval_next) {
|
||||
interval_next = interval->next_;
|
||||
|
||||
// Make sure we have some overlap
|
||||
if (start >= interval->end_) continue;
|
||||
|
||||
if (cost >= interval->cost_) {
|
||||
// When intervals are represented, the lower, the better.
|
||||
// [**********************************************************[
|
||||
// start end
|
||||
// [----------------------------------[
|
||||
// interval->start_ interval->end_
|
||||
// If we are worse than what we already have, add whatever we have so
|
||||
// far up to interval.
|
||||
const int start_new = interval->end_;
|
||||
InsertInterval(manager, interval, cost, position, start,
|
||||
interval->start_);
|
||||
start = start_new;
|
||||
if (start >= end) break;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (start <= interval->start_) {
|
||||
if (interval->end_ <= end) {
|
||||
// [----------------------------------[
|
||||
// interval->start_ interval->end_
|
||||
// [**************************************************************[
|
||||
// start end
|
||||
// We can safely remove the old interval as it is fully included.
|
||||
PopInterval(manager, interval);
|
||||
} else {
|
||||
// [------------------------------------[
|
||||
// interval->start_ interval->end_
|
||||
// [*****************************[
|
||||
// start end
|
||||
interval->start_ = end;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (end < interval->end_) {
|
||||
// [--------------------------------------------------------------[
|
||||
// interval->start_ interval->end_
|
||||
// [*****************************[
|
||||
// start end
|
||||
// We have to split the old interval as it fully contains the new one.
|
||||
const int end_original = interval->end_;
|
||||
interval->end_ = start;
|
||||
InsertInterval(manager, interval, interval->cost_, interval->index_,
|
||||
end, end_original);
|
||||
interval = interval->next_;
|
||||
break;
|
||||
} else {
|
||||
// [------------------------------------[
|
||||
// interval->start_ interval->end_
|
||||
// [*****************************[
|
||||
// start end
|
||||
interval->end_ = start;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Insert the remaining interval from start to end.
|
||||
InsertInterval(manager, interval, cost, position, start, end);
|
||||
}
|
||||
}
|
||||
|
||||
static int BackwardReferencesHashChainDistanceOnly(
|
||||
int xsize, int ysize, const uint32_t* const argb, int cache_bits,
|
||||
const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs,
|
||||
uint16_t* const dist_array) {
|
||||
int i;
|
||||
int ok = 0;
|
||||
int cc_init = 0;
|
||||
const int pix_count = xsize * ysize;
|
||||
const int use_color_cache = (cache_bits > 0);
|
||||
const size_t literal_array_size =
|
||||
sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
|
||||
((cache_bits > 0) ? (1 << cache_bits) : 0));
|
||||
const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
|
||||
CostModel* const cost_model =
|
||||
(CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
|
||||
VP8LColorCache hashers;
|
||||
CostManager* cost_manager =
|
||||
(CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
|
||||
int offset_prev = -1, len_prev = -1;
|
||||
double offset_cost = -1;
|
||||
int first_offset_is_constant = -1; // initialized with 'impossible' value
|
||||
int reach = 0;
|
||||
|
||||
if (cost_model == NULL || cost_manager == NULL) goto Error;
|
||||
|
||||
cost_model->literal_ = (double*)(cost_model + 1);
|
||||
if (use_color_cache) {
|
||||
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
|
||||
if (!cc_init) goto Error;
|
||||
}
|
||||
|
||||
if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) {
|
||||
goto Error;
|
||||
}
|
||||
|
||||
if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
|
||||
goto Error;
|
||||
}
|
||||
|
||||
// We loop one pixel at a time, but store all currently best points to
|
||||
// non-processed locations from this point.
|
||||
dist_array[0] = 0;
|
||||
// Add first pixel as literal.
|
||||
AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
|
||||
0.f, cost_manager->costs_, dist_array);
|
||||
|
||||
for (i = 1; i < pix_count; ++i) {
|
||||
const float prev_cost = cost_manager->costs_[i - 1];
|
||||
int offset, len;
|
||||
VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
|
||||
|
||||
// Try adding the pixel as a literal.
|
||||
AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i,
|
||||
use_color_cache, prev_cost,
|
||||
cost_manager->costs_, dist_array);
|
||||
|
||||
// If we are dealing with a non-literal.
|
||||
if (len >= 2) {
|
||||
if (offset != offset_prev) {
|
||||
const int code = VP8LDistanceToPlaneCode(xsize, offset);
|
||||
offset_cost = GetDistanceCost(cost_model, code);
|
||||
first_offset_is_constant = 1;
|
||||
PushInterval(cost_manager, prev_cost + offset_cost, i, len);
|
||||
} else {
|
||||
assert(offset_cost >= 0);
|
||||
assert(len_prev >= 0);
|
||||
assert(first_offset_is_constant == 0 || first_offset_is_constant == 1);
|
||||
// Instead of considering all contributions from a pixel i by calling:
|
||||
// PushInterval(cost_manager, prev_cost + offset_cost, i, len);
|
||||
// we optimize these contributions in case offset_cost stays the same
|
||||
// for consecutive pixels. This describes a set of pixels similar to a
|
||||
// previous set (e.g. constant color regions).
|
||||
if (first_offset_is_constant) {
|
||||
reach = i - 1 + len_prev - 1;
|
||||
first_offset_is_constant = 0;
|
||||
}
|
||||
|
||||
if (i + len - 1 > reach) {
|
||||
// We can only be go further with the same offset if the previous
|
||||
// length was maxed, hence len_prev == len == MAX_LENGTH.
|
||||
// TODO(vrabaud), bump i to the end right away (insert cache and
|
||||
// update cost).
|
||||
// TODO(vrabaud), check if one of the points in between does not have
|
||||
// a lower cost.
|
||||
// Already consider the pixel at "reach" to add intervals that are
|
||||
// better than whatever we add.
|
||||
int offset_j, len_j = 0;
|
||||
int j;
|
||||
assert(len == MAX_LENGTH || len == pix_count - i);
|
||||
// Figure out the last consecutive pixel within [i, reach + 1] with
|
||||
// the same offset.
|
||||
for (j = i; j <= reach; ++j) {
|
||||
VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j);
|
||||
if (offset_j != offset) {
|
||||
VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Update the cost at j - 1 and j.
|
||||
UpdateCostAtIndex(cost_manager, j - 1, 0);
|
||||
UpdateCostAtIndex(cost_manager, j, 0);
|
||||
|
||||
PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost,
|
||||
j, len_j);
|
||||
reach = j + len_j - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
UpdateCostAtIndex(cost_manager, i, 1);
|
||||
offset_prev = offset;
|
||||
len_prev = len;
|
||||
}
|
||||
|
||||
ok = !refs->error_;
|
||||
Error:
|
||||
if (cc_init) VP8LColorCacheClear(&hashers);
|
||||
CostManagerClear(cost_manager);
|
||||
WebPSafeFree(cost_model);
|
||||
WebPSafeFree(cost_manager);
|
||||
return ok;
|
||||
}
|
||||
|
||||
// We pack the path at the end of *dist_array and return
|
||||
// a pointer to this part of the array. Example:
|
||||
// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
|
||||
static void TraceBackwards(uint16_t* const dist_array,
|
||||
int dist_array_size,
|
||||
uint16_t** const chosen_path,
|
||||
int* const chosen_path_size) {
|
||||
uint16_t* path = dist_array + dist_array_size;
|
||||
uint16_t* cur = dist_array + dist_array_size - 1;
|
||||
while (cur >= dist_array) {
|
||||
const int k = *cur;
|
||||
--path;
|
||||
*path = k;
|
||||
cur -= k;
|
||||
}
|
||||
*chosen_path = path;
|
||||
*chosen_path_size = (int)(dist_array + dist_array_size - path);
|
||||
}
|
||||
|
||||
static int BackwardReferencesHashChainFollowChosenPath(
|
||||
const uint32_t* const argb, int cache_bits,
|
||||
const uint16_t* const chosen_path, int chosen_path_size,
|
||||
const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
|
||||
const int use_color_cache = (cache_bits > 0);
|
||||
int ix;
|
||||
int i = 0;
|
||||
int ok = 0;
|
||||
int cc_init = 0;
|
||||
VP8LColorCache hashers;
|
||||
|
||||
if (use_color_cache) {
|
||||
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
|
||||
if (!cc_init) goto Error;
|
||||
}
|
||||
|
||||
VP8LClearBackwardRefs(refs);
|
||||
for (ix = 0; ix < chosen_path_size; ++ix) {
|
||||
const int len = chosen_path[ix];
|
||||
if (len != 1) {
|
||||
int k;
|
||||
const int offset = VP8LHashChainFindOffset(hash_chain, i);
|
||||
VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
|
||||
if (use_color_cache) {
|
||||
for (k = 0; k < len; ++k) {
|
||||
VP8LColorCacheInsert(&hashers, argb[i + k]);
|
||||
}
|
||||
}
|
||||
i += len;
|
||||
} else {
|
||||
PixOrCopy v;
|
||||
const int idx =
|
||||
use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
|
||||
if (idx >= 0) {
|
||||
// use_color_cache is true and hashers contains argb[i]
|
||||
// push pixel as a color cache index
|
||||
v = PixOrCopyCreateCacheIdx(idx);
|
||||
} else {
|
||||
if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
|
||||
v = PixOrCopyCreateLiteral(argb[i]);
|
||||
}
|
||||
VP8LBackwardRefsCursorAdd(refs, v);
|
||||
++i;
|
||||
}
|
||||
}
|
||||
ok = !refs->error_;
|
||||
Error:
|
||||
if (cc_init) VP8LColorCacheClear(&hashers);
|
||||
return ok;
|
||||
}
|
||||
|
||||
// Returns 1 on success.
|
||||
extern int VP8LBackwardReferencesTraceBackwards(
|
||||
int xsize, int ysize, const uint32_t* const argb, int cache_bits,
|
||||
const VP8LHashChain* const hash_chain,
|
||||
const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
|
||||
int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize,
|
||||
const uint32_t* const argb,
|
||||
int cache_bits,
|
||||
const VP8LHashChain* const hash_chain,
|
||||
const VP8LBackwardRefs* const refs_src,
|
||||
VP8LBackwardRefs* const refs_dst) {
|
||||
int ok = 0;
|
||||
const int dist_array_size = xsize * ysize;
|
||||
uint16_t* chosen_path = NULL;
|
||||
int chosen_path_size = 0;
|
||||
uint16_t* dist_array =
|
||||
(uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
|
||||
|
||||
if (dist_array == NULL) goto Error;
|
||||
|
||||
if (!BackwardReferencesHashChainDistanceOnly(
|
||||
xsize, ysize, argb, cache_bits, hash_chain, refs_src, dist_array)) {
|
||||
goto Error;
|
||||
}
|
||||
TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
|
||||
if (!BackwardReferencesHashChainFollowChosenPath(
|
||||
argb, cache_bits, chosen_path, chosen_path_size, hash_chain,
|
||||
refs_dst)) {
|
||||
goto Error;
|
||||
}
|
||||
ok = 1;
|
||||
Error:
|
||||
WebPSafeFree(dist_array);
|
||||
return ok;
|
||||
}
|
|
@ -0,0 +1,943 @@
|
|||
// Copyright 2012 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// Author: Jyrki Alakuijala (jyrki@google.com)
|
||||
//
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "src/enc/backward_references_enc.h"
|
||||
#include "src/enc/histogram_enc.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/lossless_common.h"
|
||||
#include "src/dsp/dsp.h"
|
||||
#include "src/utils/color_cache_utils.h"
|
||||
#include "src/utils/utils.h"
|
||||
|
||||
#define MIN_BLOCK_SIZE 256 // minimum block size for backward references
|
||||
|
||||
#define MAX_ENTROPY (1e30f)
|
||||
|
||||
// 1M window (4M bytes) minus 120 special codes for short distances.
|
||||
#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
|
||||
|
||||
// Minimum number of pixels for which it is cheaper to encode a
|
||||
// distance + length instead of each pixel as a literal.
|
||||
#define MIN_LENGTH 4
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
static const uint8_t plane_to_code_lut[128] = {
|
||||
96, 73, 55, 39, 23, 13, 5, 1, 255, 255, 255, 255, 255, 255, 255, 255,
|
||||
101, 78, 58, 42, 26, 16, 8, 2, 0, 3, 9, 17, 27, 43, 59, 79,
|
||||
102, 86, 62, 46, 32, 20, 10, 6, 4, 7, 11, 21, 33, 47, 63, 87,
|
||||
105, 90, 70, 52, 37, 28, 18, 14, 12, 15, 19, 29, 38, 53, 71, 91,
|
||||
110, 99, 82, 66, 48, 35, 30, 24, 22, 25, 31, 36, 49, 67, 83, 100,
|
||||
115, 108, 94, 76, 64, 50, 44, 40, 34, 41, 45, 51, 65, 77, 95, 109,
|
||||
118, 113, 103, 92, 80, 68, 60, 56, 54, 57, 61, 69, 81, 93, 104, 114,
|
||||
119, 116, 111, 106, 97, 88, 84, 74, 72, 75, 85, 89, 98, 107, 112, 117
|
||||
};
|
||||
|
||||
extern int VP8LDistanceToPlaneCode(int xsize, int dist);
|
||||
int VP8LDistanceToPlaneCode(int xsize, int dist) {
|
||||
const int yoffset = dist / xsize;
|
||||
const int xoffset = dist - yoffset * xsize;
|
||||
if (xoffset <= 8 && yoffset < 8) {
|
||||
return plane_to_code_lut[yoffset * 16 + 8 - xoffset] + 1;
|
||||
} else if (xoffset > xsize - 8 && yoffset < 7) {
|
||||
return plane_to_code_lut[(yoffset + 1) * 16 + 8 + (xsize - xoffset)] + 1;
|
||||
}
|
||||
return dist + 120;
|
||||
}
|
||||
|
||||
// Returns the exact index where array1 and array2 are different. For an index
|
||||
// inferior or equal to best_len_match, the return value just has to be strictly
|
||||
// inferior to best_len_match. The current behavior is to return 0 if this index
|
||||
// is best_len_match, and the index itself otherwise.
|
||||
// If no two elements are the same, it returns max_limit.
|
||||
static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
|
||||
const uint32_t* const array2,
|
||||
int best_len_match, int max_limit) {
|
||||
// Before 'expensive' linear match, check if the two arrays match at the
|
||||
// current best length index.
|
||||
if (array1[best_len_match] != array2[best_len_match]) return 0;
|
||||
|
||||
return VP8LVectorMismatch(array1, array2, max_limit);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// VP8LBackwardRefs
|
||||
|
||||
struct PixOrCopyBlock {
|
||||
PixOrCopyBlock* next_; // next block (or NULL)
|
||||
PixOrCopy* start_; // data start
|
||||
int size_; // currently used size
|
||||
};
|
||||
|
||||
extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
|
||||
void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) {
|
||||
assert(refs != NULL);
|
||||
if (refs->tail_ != NULL) {
|
||||
*refs->tail_ = refs->free_blocks_; // recycle all blocks at once
|
||||
}
|
||||
refs->free_blocks_ = refs->refs_;
|
||||
refs->tail_ = &refs->refs_;
|
||||
refs->last_block_ = NULL;
|
||||
refs->refs_ = NULL;
|
||||
}
|
||||
|
||||
void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
|
||||
assert(refs != NULL);
|
||||
VP8LClearBackwardRefs(refs);
|
||||
while (refs->free_blocks_ != NULL) {
|
||||
PixOrCopyBlock* const next = refs->free_blocks_->next_;
|
||||
WebPSafeFree(refs->free_blocks_);
|
||||
refs->free_blocks_ = next;
|
||||
}
|
||||
}
|
||||
|
||||
void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
|
||||
assert(refs != NULL);
|
||||
memset(refs, 0, sizeof(*refs));
|
||||
refs->tail_ = &refs->refs_;
|
||||
refs->block_size_ =
|
||||
(block_size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : block_size;
|
||||
}
|
||||
|
||||
VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs) {
|
||||
VP8LRefsCursor c;
|
||||
c.cur_block_ = refs->refs_;
|
||||
if (refs->refs_ != NULL) {
|
||||
c.cur_pos = c.cur_block_->start_;
|
||||
c.last_pos_ = c.cur_pos + c.cur_block_->size_;
|
||||
} else {
|
||||
c.cur_pos = NULL;
|
||||
c.last_pos_ = NULL;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c) {
|
||||
PixOrCopyBlock* const b = c->cur_block_->next_;
|
||||
c->cur_pos = (b == NULL) ? NULL : b->start_;
|
||||
c->last_pos_ = (b == NULL) ? NULL : b->start_ + b->size_;
|
||||
c->cur_block_ = b;
|
||||
}
|
||||
|
||||
// Create a new block, either from the free list or allocated
|
||||
static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
|
||||
PixOrCopyBlock* b = refs->free_blocks_;
|
||||
if (b == NULL) { // allocate new memory chunk
|
||||
const size_t total_size =
|
||||
sizeof(*b) + refs->block_size_ * sizeof(*b->start_);
|
||||
b = (PixOrCopyBlock*)WebPSafeMalloc(1ULL, total_size);
|
||||
if (b == NULL) {
|
||||
refs->error_ |= 1;
|
||||
return NULL;
|
||||
}
|
||||
b->start_ = (PixOrCopy*)((uint8_t*)b + sizeof(*b)); // not always aligned
|
||||
} else { // recycle from free-list
|
||||
refs->free_blocks_ = b->next_;
|
||||
}
|
||||
*refs->tail_ = b;
|
||||
refs->tail_ = &b->next_;
|
||||
refs->last_block_ = b;
|
||||
b->next_ = NULL;
|
||||
b->size_ = 0;
|
||||
return b;
|
||||
}
|
||||
|
||||
extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
|
||||
const PixOrCopy v);
|
||||
void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
|
||||
const PixOrCopy v) {
|
||||
PixOrCopyBlock* b = refs->last_block_;
|
||||
if (b == NULL || b->size_ == refs->block_size_) {
|
||||
b = BackwardRefsNewBlock(refs);
|
||||
if (b == NULL) return; // refs->error_ is set
|
||||
}
|
||||
b->start_[b->size_++] = v;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Hash chains
|
||||
|
||||
int VP8LHashChainInit(VP8LHashChain* const p, int size) {
|
||||
assert(p->size_ == 0);
|
||||
assert(p->offset_length_ == NULL);
|
||||
assert(size > 0);
|
||||
p->offset_length_ =
|
||||
(uint32_t*)WebPSafeMalloc(size, sizeof(*p->offset_length_));
|
||||
if (p->offset_length_ == NULL) return 0;
|
||||
p->size_ = size;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
void VP8LHashChainClear(VP8LHashChain* const p) {
|
||||
assert(p != NULL);
|
||||
WebPSafeFree(p->offset_length_);
|
||||
|
||||
p->size_ = 0;
|
||||
p->offset_length_ = NULL;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
#define HASH_MULTIPLIER_HI (0xc6a4a793ULL)
|
||||
#define HASH_MULTIPLIER_LO (0x5bd1e996ULL)
|
||||
|
||||
static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) {
|
||||
uint32_t key;
|
||||
key = (argb[1] * HASH_MULTIPLIER_HI) & 0xffffffffu;
|
||||
key += (argb[0] * HASH_MULTIPLIER_LO) & 0xffffffffu;
|
||||
key = key >> (32 - HASH_BITS);
|
||||
return key;
|
||||
}
|
||||
|
||||
// Returns the maximum number of hash chain lookups to do for a
|
||||
// given compression quality. Return value in range [8, 86].
|
||||
static int GetMaxItersForQuality(int quality) {
|
||||
return 8 + (quality * quality) / 128;
|
||||
}
|
||||
|
||||
static int GetWindowSizeForHashChain(int quality, int xsize) {
|
||||
const int max_window_size = (quality > 75) ? WINDOW_SIZE
|
||||
: (quality > 50) ? (xsize << 8)
|
||||
: (quality > 25) ? (xsize << 6)
|
||||
: (xsize << 4);
|
||||
assert(xsize > 0);
|
||||
return (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE : max_window_size;
|
||||
}
|
||||
|
||||
static WEBP_INLINE int MaxFindCopyLength(int len) {
|
||||
return (len < MAX_LENGTH) ? len : MAX_LENGTH;
|
||||
}
|
||||
|
||||
int VP8LHashChainFill(VP8LHashChain* const p, int quality,
|
||||
const uint32_t* const argb, int xsize, int ysize,
|
||||
int low_effort) {
|
||||
const int size = xsize * ysize;
|
||||
const int iter_max = GetMaxItersForQuality(quality);
|
||||
const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
|
||||
int pos;
|
||||
int argb_comp;
|
||||
uint32_t base_position;
|
||||
int32_t* hash_to_first_index;
|
||||
// Temporarily use the p->offset_length_ as a hash chain.
|
||||
int32_t* chain = (int32_t*)p->offset_length_;
|
||||
assert(size > 0);
|
||||
assert(p->size_ != 0);
|
||||
assert(p->offset_length_ != NULL);
|
||||
|
||||
if (size <= 2) {
|
||||
p->offset_length_[0] = p->offset_length_[size - 1] = 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
hash_to_first_index =
|
||||
(int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
|
||||
if (hash_to_first_index == NULL) return 0;
|
||||
|
||||
// Set the int32_t array to -1.
|
||||
memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
|
||||
// Fill the chain linking pixels with the same hash.
|
||||
argb_comp = (argb[0] == argb[1]);
|
||||
for (pos = 0; pos < size - 2;) {
|
||||
uint32_t hash_code;
|
||||
const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
|
||||
if (argb_comp && argb_comp_next) {
|
||||
// Consecutive pixels with the same color will share the same hash.
|
||||
// We therefore use a different hash: the color and its repetition
|
||||
// length.
|
||||
uint32_t tmp[2];
|
||||
uint32_t len = 1;
|
||||
tmp[0] = argb[pos];
|
||||
// Figure out how far the pixels are the same.
|
||||
// The last pixel has a different 64 bit hash, as its next pixel does
|
||||
// not have the same color, so we just need to get to the last pixel equal
|
||||
// to its follower.
|
||||
while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
|
||||
++len;
|
||||
}
|
||||
if (len > MAX_LENGTH) {
|
||||
// Skip the pixels that match for distance=1 and length>MAX_LENGTH
|
||||
// because they are linked to their predecessor and we automatically
|
||||
// check that in the main for loop below. Skipping means setting no
|
||||
// predecessor in the chain, hence -1.
|
||||
memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
|
||||
pos += len - MAX_LENGTH;
|
||||
len = MAX_LENGTH;
|
||||
}
|
||||
// Process the rest of the hash chain.
|
||||
while (len) {
|
||||
tmp[1] = len--;
|
||||
hash_code = GetPixPairHash64(tmp);
|
||||
chain[pos] = hash_to_first_index[hash_code];
|
||||
hash_to_first_index[hash_code] = pos++;
|
||||
}
|
||||
argb_comp = 0;
|
||||
} else {
|
||||
// Just move one pixel forward.
|
||||
hash_code = GetPixPairHash64(argb + pos);
|
||||
chain[pos] = hash_to_first_index[hash_code];
|
||||
hash_to_first_index[hash_code] = pos++;
|
||||
argb_comp = argb_comp_next;
|
||||
}
|
||||
}
|
||||
// Process the penultimate pixel.
|
||||
chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
|
||||
|
||||
WebPSafeFree(hash_to_first_index);
|
||||
|
||||
// Find the best match interval at each pixel, defined by an offset to the
|
||||
// pixel and a length. The right-most pixel cannot match anything to the right
|
||||
// (hence a best length of 0) and the left-most pixel nothing to the left
|
||||
// (hence an offset of 0).
|
||||
assert(size > 2);
|
||||
p->offset_length_[0] = p->offset_length_[size - 1] = 0;
|
||||
for (base_position = size - 2; base_position > 0;) {
|
||||
const int max_len = MaxFindCopyLength(size - 1 - base_position);
|
||||
const uint32_t* const argb_start = argb + base_position;
|
||||
int iter = iter_max;
|
||||
int best_length = 0;
|
||||
uint32_t best_distance = 0;
|
||||
uint32_t best_argb;
|
||||
const int min_pos =
|
||||
(base_position > window_size) ? base_position - window_size : 0;
|
||||
const int length_max = (max_len < 256) ? max_len : 256;
|
||||
uint32_t max_base_position;
|
||||
|
||||
pos = chain[base_position];
|
||||
if (!low_effort) {
|
||||
int curr_length;
|
||||
// Heuristic: use the comparison with the above line as an initialization.
|
||||
if (base_position >= (uint32_t)xsize) {
|
||||
curr_length = FindMatchLength(argb_start - xsize, argb_start,
|
||||
best_length, max_len);
|
||||
if (curr_length > best_length) {
|
||||
best_length = curr_length;
|
||||
best_distance = xsize;
|
||||
}
|
||||
--iter;
|
||||
}
|
||||
// Heuristic: compare to the previous pixel.
|
||||
curr_length =
|
||||
FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
|
||||
if (curr_length > best_length) {
|
||||
best_length = curr_length;
|
||||
best_distance = 1;
|
||||
}
|
||||
--iter;
|
||||
// Skip the for loop if we already have the maximum.
|
||||
if (best_length == MAX_LENGTH) pos = min_pos - 1;
|
||||
}
|
||||
best_argb = argb_start[best_length];
|
||||
|
||||
for (; pos >= min_pos && --iter; pos = chain[pos]) {
|
||||
int curr_length;
|
||||
assert(base_position > (uint32_t)pos);
|
||||
|
||||
if (argb[pos + best_length] != best_argb) continue;
|
||||
|
||||
curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
|
||||
if (best_length < curr_length) {
|
||||
best_length = curr_length;
|
||||
best_distance = base_position - pos;
|
||||
best_argb = argb_start[best_length];
|
||||
// Stop if we have reached a good enough length.
|
||||
if (best_length >= length_max) break;
|
||||
}
|
||||
}
|
||||
// We have the best match but in case the two intervals continue matching
|
||||
// to the left, we have the best matches for the left-extended pixels.
|
||||
max_base_position = base_position;
|
||||
while (1) {
|
||||
assert(best_length <= MAX_LENGTH);
|
||||
assert(best_distance <= WINDOW_SIZE);
|
||||
p->offset_length_[base_position] =
|
||||
(best_distance << MAX_LENGTH_BITS) | (uint32_t)best_length;
|
||||
--base_position;
|
||||
// Stop if we don't have a match or if we are out of bounds.
|
||||
if (best_distance == 0 || base_position == 0) break;
|
||||
// Stop if we cannot extend the matching intervals to the left.
|
||||
if (base_position < best_distance ||
|
||||
argb[base_position - best_distance] != argb[base_position]) {
|
||||
break;
|
||||
}
|
||||
// Stop if we are matching at its limit because there could be a closer
|
||||
// matching interval with the same maximum length. Then again, if the
|
||||
// matching interval is as close as possible (best_distance == 1), we will
|
||||
// never find anything better so let's continue.
|
||||
if (best_length == MAX_LENGTH && best_distance != 1 &&
|
||||
base_position + MAX_LENGTH < max_base_position) {
|
||||
break;
|
||||
}
|
||||
if (best_length < MAX_LENGTH) {
|
||||
++best_length;
|
||||
max_base_position = base_position;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
|
||||
VP8LColorCache* const hashers,
|
||||
VP8LBackwardRefs* const refs) {
|
||||
PixOrCopy v;
|
||||
if (use_color_cache) {
|
||||
const uint32_t key = VP8LColorCacheGetIndex(hashers, pixel);
|
||||
if (VP8LColorCacheLookup(hashers, key) == pixel) {
|
||||
v = PixOrCopyCreateCacheIdx(key);
|
||||
} else {
|
||||
v = PixOrCopyCreateLiteral(pixel);
|
||||
VP8LColorCacheSet(hashers, key, pixel);
|
||||
}
|
||||
} else {
|
||||
v = PixOrCopyCreateLiteral(pixel);
|
||||
}
|
||||
VP8LBackwardRefsCursorAdd(refs, v);
|
||||
}
|
||||
|
||||
static int BackwardReferencesRle(int xsize, int ysize,
|
||||
const uint32_t* const argb,
|
||||
int cache_bits, VP8LBackwardRefs* const refs) {
|
||||
const int pix_count = xsize * ysize;
|
||||
int i, k;
|
||||
const int use_color_cache = (cache_bits > 0);
|
||||
VP8LColorCache hashers;
|
||||
|
||||
if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) {
|
||||
return 0;
|
||||
}
|
||||
VP8LClearBackwardRefs(refs);
|
||||
// Add first pixel as literal.
|
||||
AddSingleLiteral(argb[0], use_color_cache, &hashers, refs);
|
||||
i = 1;
|
||||
while (i < pix_count) {
|
||||
const int max_len = MaxFindCopyLength(pix_count - i);
|
||||
const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
|
||||
const int prev_row_len = (i < xsize) ? 0 :
|
||||
FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
|
||||
if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) {
|
||||
VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
|
||||
// We don't need to update the color cache here since it is always the
|
||||
// same pixel being copied, and that does not change the color cache
|
||||
// state.
|
||||
i += rle_len;
|
||||
} else if (prev_row_len >= MIN_LENGTH) {
|
||||
VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
|
||||
if (use_color_cache) {
|
||||
for (k = 0; k < prev_row_len; ++k) {
|
||||
VP8LColorCacheInsert(&hashers, argb[i + k]);
|
||||
}
|
||||
}
|
||||
i += prev_row_len;
|
||||
} else {
|
||||
AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
if (use_color_cache) VP8LColorCacheClear(&hashers);
|
||||
return !refs->error_;
|
||||
}
|
||||
|
||||
static int BackwardReferencesLz77(int xsize, int ysize,
|
||||
const uint32_t* const argb, int cache_bits,
|
||||
const VP8LHashChain* const hash_chain,
|
||||
VP8LBackwardRefs* const refs) {
|
||||
int i;
|
||||
int i_last_check = -1;
|
||||
int ok = 0;
|
||||
int cc_init = 0;
|
||||
const int use_color_cache = (cache_bits > 0);
|
||||
const int pix_count = xsize * ysize;
|
||||
VP8LColorCache hashers;
|
||||
|
||||
if (use_color_cache) {
|
||||
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
|
||||
if (!cc_init) goto Error;
|
||||
}
|
||||
VP8LClearBackwardRefs(refs);
|
||||
for (i = 0; i < pix_count;) {
|
||||
// Alternative#1: Code the pixels starting at 'i' using backward reference.
|
||||
int offset = 0;
|
||||
int len = 0;
|
||||
int j;
|
||||
VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
|
||||
if (len >= MIN_LENGTH) {
|
||||
const int len_ini = len;
|
||||
int max_reach = 0;
|
||||
const int j_max =
|
||||
(i + len_ini >= pix_count) ? pix_count - 1 : i + len_ini;
|
||||
// Only start from what we have not checked already.
|
||||
i_last_check = (i > i_last_check) ? i : i_last_check;
|
||||
// We know the best match for the current pixel but we try to find the
|
||||
// best matches for the current pixel AND the next one combined.
|
||||
// The naive method would use the intervals:
|
||||
// [i,i+len) + [i+len, length of best match at i+len)
|
||||
// while we check if we can use:
|
||||
// [i,j) (where j<=i+len) + [j, length of best match at j)
|
||||
for (j = i_last_check + 1; j <= j_max; ++j) {
|
||||
const int len_j = VP8LHashChainFindLength(hash_chain, j);
|
||||
const int reach =
|
||||
j + (len_j >= MIN_LENGTH ? len_j : 1); // 1 for single literal.
|
||||
if (reach > max_reach) {
|
||||
len = j - i;
|
||||
max_reach = reach;
|
||||
if (max_reach >= pix_count) break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
len = 1;
|
||||
}
|
||||
// Go with literal or backward reference.
|
||||
assert(len > 0);
|
||||
if (len == 1) {
|
||||
AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
|
||||
} else {
|
||||
VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
|
||||
if (use_color_cache) {
|
||||
for (j = i; j < i + len; ++j) VP8LColorCacheInsert(&hashers, argb[j]);
|
||||
}
|
||||
}
|
||||
i += len;
|
||||
}
|
||||
|
||||
ok = !refs->error_;
|
||||
Error:
|
||||
if (cc_init) VP8LColorCacheClear(&hashers);
|
||||
return ok;
|
||||
}
|
||||
|
||||
// Compute an LZ77 by forcing matches to happen within a given distance cost.
|
||||
// We therefore limit the algorithm to the lowest 32 values in the PlaneCode
|
||||
// definition.
|
||||
#define WINDOW_OFFSETS_SIZE_MAX 32
|
||||
static int BackwardReferencesLz77Box(int xsize, int ysize,
|
||||
const uint32_t* const argb, int cache_bits,
|
||||
const VP8LHashChain* const hash_chain_best,
|
||||
VP8LHashChain* hash_chain,
|
||||
VP8LBackwardRefs* const refs) {
|
||||
int i;
|
||||
const int pix_count = xsize * ysize;
|
||||
uint16_t* counts;
|
||||
int window_offsets[WINDOW_OFFSETS_SIZE_MAX] = {0};
|
||||
int window_offsets_new[WINDOW_OFFSETS_SIZE_MAX] = {0};
|
||||
int window_offsets_size = 0;
|
||||
int window_offsets_new_size = 0;
|
||||
uint16_t* const counts_ini =
|
||||
(uint16_t*)WebPSafeMalloc(xsize * ysize, sizeof(*counts_ini));
|
||||
int best_offset_prev = -1, best_length_prev = -1;
|
||||
if (counts_ini == NULL) return 0;
|
||||
|
||||
// counts[i] counts how many times a pixel is repeated starting at position i.
|
||||
i = pix_count - 2;
|
||||
counts = counts_ini + i;
|
||||
counts[1] = 1;
|
||||
for (; i >= 0; --i, --counts) {
|
||||
if (argb[i] == argb[i + 1]) {
|
||||
// Max out the counts to MAX_LENGTH.
|
||||
counts[0] = counts[1] + (counts[1] != MAX_LENGTH);
|
||||
} else {
|
||||
counts[0] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Figure out the window offsets around a pixel. They are stored in a
|
||||
// spiraling order around the pixel as defined by VP8LDistanceToPlaneCode.
|
||||
{
|
||||
int x, y;
|
||||
for (y = 0; y <= 6; ++y) {
|
||||
for (x = -6; x <= 6; ++x) {
|
||||
const int offset = y * xsize + x;
|
||||
int plane_code;
|
||||
// Ignore offsets that bring us after the pixel.
|
||||
if (offset <= 0) continue;
|
||||
plane_code = VP8LDistanceToPlaneCode(xsize, offset) - 1;
|
||||
if (plane_code >= WINDOW_OFFSETS_SIZE_MAX) continue;
|
||||
window_offsets[plane_code] = offset;
|
||||
}
|
||||
}
|
||||
// For narrow images, not all plane codes are reached, so remove those.
|
||||
for (i = 0; i < WINDOW_OFFSETS_SIZE_MAX; ++i) {
|
||||
if (window_offsets[i] == 0) continue;
|
||||
window_offsets[window_offsets_size++] = window_offsets[i];
|
||||
}
|
||||
// Given a pixel P, find the offsets that reach pixels unreachable from P-1
|
||||
// with any of the offsets in window_offsets[].
|
||||
for (i = 0; i < window_offsets_size; ++i) {
|
||||
int j;
|
||||
int is_reachable = 0;
|
||||
for (j = 0; j < window_offsets_size && !is_reachable; ++j) {
|
||||
is_reachable |= (window_offsets[i] == window_offsets[j] + 1);
|
||||
}
|
||||
if (!is_reachable) {
|
||||
window_offsets_new[window_offsets_new_size] = window_offsets[i];
|
||||
++window_offsets_new_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
hash_chain->offset_length_[0] = 0;
|
||||
for (i = 1; i < pix_count; ++i) {
|
||||
int ind;
|
||||
int best_length = VP8LHashChainFindLength(hash_chain_best, i);
|
||||
int best_offset;
|
||||
int do_compute = 1;
|
||||
|
||||
if (best_length >= MAX_LENGTH) {
|
||||
// Do not recompute the best match if we already have a maximal one in the
|
||||
// window.
|
||||
best_offset = VP8LHashChainFindOffset(hash_chain_best, i);
|
||||
for (ind = 0; ind < window_offsets_size; ++ind) {
|
||||
if (best_offset == window_offsets[ind]) {
|
||||
do_compute = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (do_compute) {
|
||||
// Figure out if we should use the offset/length from the previous pixel
|
||||
// as an initial guess and therefore only inspect the offsets in
|
||||
// window_offsets_new[].
|
||||
const int use_prev =
|
||||
(best_length_prev > 1) && (best_length_prev < MAX_LENGTH);
|
||||
const int num_ind =
|
||||
use_prev ? window_offsets_new_size : window_offsets_size;
|
||||
best_length = use_prev ? best_length_prev - 1 : 0;
|
||||
best_offset = use_prev ? best_offset_prev : 0;
|
||||
// Find the longest match in a window around the pixel.
|
||||
for (ind = 0; ind < num_ind; ++ind) {
|
||||
int curr_length = 0;
|
||||
int j = i;
|
||||
int j_offset =
|
||||
use_prev ? i - window_offsets_new[ind] : i - window_offsets[ind];
|
||||
if (j_offset < 0 || argb[j_offset] != argb[i]) continue;
|
||||
// The longest match is the sum of how many times each pixel is
|
||||
// repeated.
|
||||
do {
|
||||
const int counts_j_offset = counts_ini[j_offset];
|
||||
const int counts_j = counts_ini[j];
|
||||
if (counts_j_offset != counts_j) {
|
||||
curr_length +=
|
||||
(counts_j_offset < counts_j) ? counts_j_offset : counts_j;
|
||||
break;
|
||||
}
|
||||
// The same color is repeated counts_pos times at j_offset and j.
|
||||
curr_length += counts_j_offset;
|
||||
j_offset += counts_j_offset;
|
||||
j += counts_j_offset;
|
||||
} while (curr_length <= MAX_LENGTH && j < pix_count &&
|
||||
argb[j_offset] == argb[j]);
|
||||
if (best_length < curr_length) {
|
||||
best_offset =
|
||||
use_prev ? window_offsets_new[ind] : window_offsets[ind];
|
||||
if (curr_length >= MAX_LENGTH) {
|
||||
best_length = MAX_LENGTH;
|
||||
break;
|
||||
} else {
|
||||
best_length = curr_length;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert(i + best_length <= pix_count);
|
||||
assert(best_length <= MAX_LENGTH);
|
||||
if (best_length <= MIN_LENGTH) {
|
||||
hash_chain->offset_length_[i] = 0;
|
||||
best_offset_prev = 0;
|
||||
best_length_prev = 0;
|
||||
} else {
|
||||
hash_chain->offset_length_[i] =
|
||||
(best_offset << MAX_LENGTH_BITS) | (uint32_t)best_length;
|
||||
best_offset_prev = best_offset;
|
||||
best_length_prev = best_length;
|
||||
}
|
||||
}
|
||||
hash_chain->offset_length_[0] = 0;
|
||||
WebPSafeFree(counts_ini);
|
||||
|
||||
return BackwardReferencesLz77(xsize, ysize, argb, cache_bits, hash_chain,
|
||||
refs);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
static void BackwardReferences2DLocality(int xsize,
|
||||
const VP8LBackwardRefs* const refs) {
|
||||
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
|
||||
while (VP8LRefsCursorOk(&c)) {
|
||||
if (PixOrCopyIsCopy(c.cur_pos)) {
|
||||
const int dist = c.cur_pos->argb_or_distance;
|
||||
const int transformed_dist = VP8LDistanceToPlaneCode(xsize, dist);
|
||||
c.cur_pos->argb_or_distance = transformed_dist;
|
||||
}
|
||||
VP8LRefsCursorNext(&c);
|
||||
}
|
||||
}
|
||||
|
||||
// Evaluate optimal cache bits for the local color cache.
|
||||
// The input *best_cache_bits sets the maximum cache bits to use (passing 0
|
||||
// implies disabling the local color cache). The local color cache is also
|
||||
// disabled for the lower (<= 25) quality.
|
||||
// Returns 0 in case of memory error.
|
||||
static int CalculateBestCacheSize(const uint32_t* argb, int quality,
|
||||
const VP8LBackwardRefs* const refs,
|
||||
int* const best_cache_bits) {
|
||||
int i;
|
||||
const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
|
||||
double entropy_min = MAX_ENTROPY;
|
||||
int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
|
||||
VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
|
||||
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
|
||||
VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
|
||||
int ok = 0;
|
||||
|
||||
assert(cache_bits_max >= 0 && cache_bits_max <= MAX_COLOR_CACHE_BITS);
|
||||
|
||||
if (cache_bits_max == 0) {
|
||||
*best_cache_bits = 0;
|
||||
// Local color cache is disabled.
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Allocate data.
|
||||
for (i = 0; i <= cache_bits_max; ++i) {
|
||||
histos[i] = VP8LAllocateHistogram(i);
|
||||
if (histos[i] == NULL) goto Error;
|
||||
if (i == 0) continue;
|
||||
cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
|
||||
if (!cc_init[i]) goto Error;
|
||||
}
|
||||
|
||||
// Find the cache_bits giving the lowest entropy. The search is done in a
|
||||
// brute-force way as the function (entropy w.r.t cache_bits) can be
|
||||
// anything in practice.
|
||||
while (VP8LRefsCursorOk(&c)) {
|
||||
const PixOrCopy* const v = c.cur_pos;
|
||||
if (PixOrCopyIsLiteral(v)) {
|
||||
const uint32_t pix = *argb++;
|
||||
const uint32_t a = (pix >> 24) & 0xff;
|
||||
const uint32_t r = (pix >> 16) & 0xff;
|
||||
const uint32_t g = (pix >> 8) & 0xff;
|
||||
const uint32_t b = (pix >> 0) & 0xff;
|
||||
// The keys of the caches can be derived from the longest one.
|
||||
int key = VP8LHashPix(pix, 32 - cache_bits_max);
|
||||
// Do not use the color cache for cache_bits = 0.
|
||||
++histos[0]->blue_[b];
|
||||
++histos[0]->literal_[g];
|
||||
++histos[0]->red_[r];
|
||||
++histos[0]->alpha_[a];
|
||||
// Deal with cache_bits > 0.
|
||||
for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
|
||||
if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
|
||||
++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
|
||||
} else {
|
||||
VP8LColorCacheSet(&hashers[i], key, pix);
|
||||
++histos[i]->blue_[b];
|
||||
++histos[i]->literal_[g];
|
||||
++histos[i]->red_[r];
|
||||
++histos[i]->alpha_[a];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// We should compute the contribution of the (distance,length)
|
||||
// histograms but those are the same independently from the cache size.
|
||||
// As those constant contributions are in the end added to the other
|
||||
// histogram contributions, we can safely ignore them.
|
||||
int len = PixOrCopyLength(v);
|
||||
uint32_t argb_prev = *argb ^ 0xffffffffu;
|
||||
// Update the color caches.
|
||||
do {
|
||||
if (*argb != argb_prev) {
|
||||
// Efficiency: insert only if the color changes.
|
||||
int key = VP8LHashPix(*argb, 32 - cache_bits_max);
|
||||
for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
|
||||
hashers[i].colors_[key] = *argb;
|
||||
}
|
||||
argb_prev = *argb;
|
||||
}
|
||||
argb++;
|
||||
} while (--len != 0);
|
||||
}
|
||||
VP8LRefsCursorNext(&c);
|
||||
}
|
||||
|
||||
for (i = 0; i <= cache_bits_max; ++i) {
|
||||
const double entropy = VP8LHistogramEstimateBits(histos[i]);
|
||||
if (i == 0 || entropy < entropy_min) {
|
||||
entropy_min = entropy;
|
||||
*best_cache_bits = i;
|
||||
}
|
||||
}
|
||||
ok = 1;
|
||||
Error:
|
||||
for (i = 0; i <= cache_bits_max; ++i) {
|
||||
if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
|
||||
VP8LFreeHistogram(histos[i]);
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
// Update (in-place) backward references for specified cache_bits.
|
||||
static int BackwardRefsWithLocalCache(const uint32_t* const argb,
|
||||
int cache_bits,
|
||||
VP8LBackwardRefs* const refs) {
|
||||
int pixel_index = 0;
|
||||
VP8LColorCache hashers;
|
||||
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
|
||||
if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
|
||||
|
||||
while (VP8LRefsCursorOk(&c)) {
|
||||
PixOrCopy* const v = c.cur_pos;
|
||||
if (PixOrCopyIsLiteral(v)) {
|
||||
const uint32_t argb_literal = v->argb_or_distance;
|
||||
const int ix = VP8LColorCacheContains(&hashers, argb_literal);
|
||||
if (ix >= 0) {
|
||||
// hashers contains argb_literal
|
||||
*v = PixOrCopyCreateCacheIdx(ix);
|
||||
} else {
|
||||
VP8LColorCacheInsert(&hashers, argb_literal);
|
||||
}
|
||||
++pixel_index;
|
||||
} else {
|
||||
// refs was created without local cache, so it can not have cache indexes.
|
||||
int k;
|
||||
assert(PixOrCopyIsCopy(v));
|
||||
for (k = 0; k < v->len; ++k) {
|
||||
VP8LColorCacheInsert(&hashers, argb[pixel_index++]);
|
||||
}
|
||||
}
|
||||
VP8LRefsCursorNext(&c);
|
||||
}
|
||||
VP8LColorCacheClear(&hashers);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static VP8LBackwardRefs* GetBackwardReferencesLowEffort(
|
||||
int width, int height, const uint32_t* const argb,
|
||||
int* const cache_bits, const VP8LHashChain* const hash_chain,
|
||||
VP8LBackwardRefs* const refs_lz77) {
|
||||
*cache_bits = 0;
|
||||
if (!BackwardReferencesLz77(width, height, argb, 0, hash_chain, refs_lz77)) {
|
||||
return NULL;
|
||||
}
|
||||
BackwardReferences2DLocality(width, refs_lz77);
|
||||
return refs_lz77;
|
||||
}
|
||||
|
||||
extern int VP8LBackwardReferencesTraceBackwards(
|
||||
int xsize, int ysize, const uint32_t* const argb, int cache_bits,
|
||||
const VP8LHashChain* const hash_chain,
|
||||
const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
|
||||
static VP8LBackwardRefs* GetBackwardReferences(
|
||||
int width, int height, const uint32_t* const argb, int quality,
|
||||
int lz77_types_to_try, int* const cache_bits,
|
||||
const VP8LHashChain* const hash_chain, VP8LBackwardRefs* best,
|
||||
VP8LBackwardRefs* worst) {
|
||||
const int cache_bits_initial = *cache_bits;
|
||||
double bit_cost_best = -1;
|
||||
VP8LHistogram* histo = NULL;
|
||||
int lz77_type, lz77_type_best = 0;
|
||||
VP8LHashChain hash_chain_box;
|
||||
memset(&hash_chain_box, 0, sizeof(hash_chain_box));
|
||||
|
||||
histo = VP8LAllocateHistogram(MAX_COLOR_CACHE_BITS);
|
||||
if (histo == NULL) goto Error;
|
||||
|
||||
for (lz77_type = 1; lz77_types_to_try;
|
||||
lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
|
||||
int res = 0;
|
||||
double bit_cost;
|
||||
int cache_bits_tmp = cache_bits_initial;
|
||||
if ((lz77_types_to_try & lz77_type) == 0) continue;
|
||||
switch (lz77_type) {
|
||||
case kLZ77RLE:
|
||||
res = BackwardReferencesRle(width, height, argb, 0, worst);
|
||||
break;
|
||||
case kLZ77Standard:
|
||||
// Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color
|
||||
// cache is not that different in practice.
|
||||
res = BackwardReferencesLz77(width, height, argb, 0, hash_chain, worst);
|
||||
break;
|
||||
case kLZ77Box:
|
||||
if (!VP8LHashChainInit(&hash_chain_box, width * height)) goto Error;
|
||||
res = BackwardReferencesLz77Box(width, height, argb, 0, hash_chain,
|
||||
&hash_chain_box, worst);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
if (!res) goto Error;
|
||||
|
||||
// Next, try with a color cache and update the references.
|
||||
if (!CalculateBestCacheSize(argb, quality, worst, &cache_bits_tmp)) {
|
||||
goto Error;
|
||||
}
|
||||
if (cache_bits_tmp > 0) {
|
||||
if (!BackwardRefsWithLocalCache(argb, cache_bits_tmp, worst)) {
|
||||
goto Error;
|
||||
}
|
||||
}
|
||||
|
||||
// Keep the best backward references.
|
||||
VP8LHistogramCreate(histo, worst, cache_bits_tmp);
|
||||
bit_cost = VP8LHistogramEstimateBits(histo);
|
||||
if (lz77_type_best == 0 || bit_cost < bit_cost_best) {
|
||||
VP8LBackwardRefs* const tmp = worst;
|
||||
worst = best;
|
||||
best = tmp;
|
||||
bit_cost_best = bit_cost;
|
||||
*cache_bits = cache_bits_tmp;
|
||||
lz77_type_best = lz77_type;
|
||||
}
|
||||
}
|
||||
assert(lz77_type_best > 0);
|
||||
|
||||
// Improve on simple LZ77 but only for high quality (TraceBackwards is
|
||||
// costly).
|
||||
if ((lz77_type_best == kLZ77Standard || lz77_type_best == kLZ77Box) &&
|
||||
quality >= 25) {
|
||||
const VP8LHashChain* const hash_chain_tmp =
|
||||
(lz77_type_best == kLZ77Standard) ? hash_chain : &hash_chain_box;
|
||||
if (VP8LBackwardReferencesTraceBackwards(width, height, argb, *cache_bits,
|
||||
hash_chain_tmp, best, worst)) {
|
||||
double bit_cost_trace;
|
||||
VP8LHistogramCreate(histo, worst, *cache_bits);
|
||||
bit_cost_trace = VP8LHistogramEstimateBits(histo);
|
||||
if (bit_cost_trace < bit_cost_best) best = worst;
|
||||
}
|
||||
}
|
||||
|
||||
BackwardReferences2DLocality(width, best);
|
||||
|
||||
Error:
|
||||
VP8LHashChainClear(&hash_chain_box);
|
||||
VP8LFreeHistogram(histo);
|
||||
return best;
|
||||
}
|
||||
|
||||
VP8LBackwardRefs* VP8LGetBackwardReferences(
|
||||
int width, int height, const uint32_t* const argb, int quality,
|
||||
int low_effort, int lz77_types_to_try, int* const cache_bits,
|
||||
const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
|
||||
VP8LBackwardRefs* const refs_tmp2) {
|
||||
if (low_effort) {
|
||||
return GetBackwardReferencesLowEffort(width, height, argb, cache_bits,
|
||||
hash_chain, refs_tmp1);
|
||||
} else {
|
||||
return GetBackwardReferences(width, height, argb, quality,
|
||||
lz77_types_to_try, cache_bits, hash_chain,
|
||||
refs_tmp1, refs_tmp2);
|
||||
}
|
||||
}
|
|
@ -10,13 +10,13 @@
|
|||
// Author: Jyrki Alakuijala (jyrki@google.com)
|
||||
//
|
||||
|
||||
#ifndef WEBP_ENC_BACKWARD_REFERENCES_H_
|
||||
#define WEBP_ENC_BACKWARD_REFERENCES_H_
|
||||
#ifndef WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
|
||||
#define WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include "../webp/types.h"
|
||||
#include "../webp/format_constants.h"
|
||||
#include "src/webp/types.h"
|
||||
#include "src/webp/format_constants.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -91,11 +91,6 @@ static WEBP_INLINE uint32_t PixOrCopyLength(const PixOrCopy* const p) {
|
|||
return p->len;
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t PixOrCopyArgb(const PixOrCopy* const p) {
|
||||
assert(p->mode == kLiteral);
|
||||
return p->argb_or_distance;
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t PixOrCopyCacheIdx(const PixOrCopy* const p) {
|
||||
assert(p->mode == kCacheIdx);
|
||||
assert(p->argb_or_distance < (1U << MAX_COLOR_CACHE_BITS));
|
||||
|
@ -113,6 +108,16 @@ static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) {
|
|||
#define HASH_BITS 18
|
||||
#define HASH_SIZE (1 << HASH_BITS)
|
||||
|
||||
// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
|
||||
// is used in VP8LHashChain.
|
||||
#define MAX_LENGTH_BITS 12
|
||||
#define WINDOW_SIZE_BITS 20
|
||||
// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits.
|
||||
#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1)
|
||||
#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32
|
||||
#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32"
|
||||
#endif
|
||||
|
||||
typedef struct VP8LHashChain VP8LHashChain;
|
||||
struct VP8LHashChain {
|
||||
// The 20 most significant bits contain the offset at which the best match
|
||||
|
@ -134,6 +139,24 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
|
|||
int low_effort);
|
||||
void VP8LHashChainClear(VP8LHashChain* const p); // release memory
|
||||
|
||||
static WEBP_INLINE int VP8LHashChainFindOffset(const VP8LHashChain* const p,
|
||||
const int base_position) {
|
||||
return p->offset_length_[base_position] >> MAX_LENGTH_BITS;
|
||||
}
|
||||
|
||||
static WEBP_INLINE int VP8LHashChainFindLength(const VP8LHashChain* const p,
|
||||
const int base_position) {
|
||||
return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1);
|
||||
}
|
||||
|
||||
static WEBP_INLINE void VP8LHashChainFindCopy(const VP8LHashChain* const p,
|
||||
int base_position,
|
||||
int* const offset_ptr,
|
||||
int* const length_ptr) {
|
||||
*offset_ptr = VP8LHashChainFindOffset(p, base_position);
|
||||
*length_ptr = VP8LHashChainFindLength(p, base_position);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// VP8LBackwardRefs (block-based backward-references storage)
|
||||
|
||||
|
@ -158,9 +181,6 @@ struct VP8LBackwardRefs {
|
|||
void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size);
|
||||
// Release memory for backward references.
|
||||
void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs);
|
||||
// Copies the 'src' backward refs to the 'dst'. Returns 0 in case of error.
|
||||
int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
|
||||
VP8LBackwardRefs* const dst);
|
||||
|
||||
// Cursor for iterating on references content
|
||||
typedef struct {
|
||||
|
@ -189,6 +209,12 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
|
|||
// -----------------------------------------------------------------------------
|
||||
// Main entry points
|
||||
|
||||
enum VP8LLZ77Type {
|
||||
kLZ77Standard = 1,
|
||||
kLZ77RLE = 2,
|
||||
kLZ77Box = 4
|
||||
};
|
||||
|
||||
// Evaluates best possible backward references for specified quality.
|
||||
// The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
|
||||
// bits to use (passing 0 implies disabling the local color cache).
|
||||
|
@ -197,11 +223,12 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
|
|||
// refs[0] or refs[1].
|
||||
VP8LBackwardRefs* VP8LGetBackwardReferences(
|
||||
int width, int height, const uint32_t* const argb, int quality,
|
||||
int low_effort, int* const cache_bits,
|
||||
const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs[2]);
|
||||
int low_effort, int lz77_types_to_try, int* const cache_bits,
|
||||
const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
|
||||
VP8LBackwardRefs* const refs_tmp2);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // WEBP_ENC_BACKWARD_REFERENCES_H_
|
||||
#endif // WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
|
|
@ -12,10 +12,10 @@
|
|||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "../webp/config.h"
|
||||
#include "src/webp/config.h"
|
||||
#endif
|
||||
|
||||
#include "../webp/encode.h"
|
||||
#include "src/webp/encode.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// WebPConfig
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// Author: Skal (pascal.massimino@gmail.com)
|
||||
|
||||
#include "./cost_enc.h"
|
||||
#include "src/enc/cost_enc.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Level cost tables
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue