Framework for using real FFT in ARMv7 and Neon platforms.

Review URL: https://webrtc-codereview.appspot.com/785004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@2803 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@webrtc.org 2012-09-21 18:51:12 +00:00
parent cf1375a1f1
commit f9e6cc2e27
11 changed files with 399 additions and 246 deletions

View File

@ -35,6 +35,7 @@ LOCAL_SRC_FILES := \
lpc_to_refl_coef.c \
min_max_operations.c \
randomization_functions.c \
real_fft.c \
refl_coef_to_lpc.c \
resample.c \
resample_48khz.c \

View File

@ -19,16 +19,65 @@ struct RealFFT;
extern "C" {
#endif
// TODO(andrew): documentation.
typedef int (*RealForwardFFT)(struct RealFFT* self,
const int16_t* data_in,
int16_t* data_out);
typedef int (*RealInverseFFT)(struct RealFFT* self,
const int16_t* data_in,
int16_t* data_out);
extern RealForwardFFT WebRtcSpl_RealForwardFFT;
extern RealInverseFFT WebRtcSpl_RealInverseFFT;
struct RealFFT* WebRtcSpl_CreateRealFFT(int order);
void WebRtcSpl_FreeRealFFT(struct RealFFT* self);
// TODO(andrew): This currently functions exactly the same as ComplexFFT().
// Manage the surrounding operations (ComplexBitReverse etc) here instead.
// TODO(kma): Implement FFT functions for real signals.
// Compute the forward FFT for a complex signal of length 2^order.
// Input Arguments:
// self - pointer to preallocated and initialized FFT specification structure.
// data_in - the input signal.
//
// data must be of length 2^(order + 1) to hold the complex output.
int WebRtcSpl_RealForwardFFT(struct RealFFT* self, int16_t* data);
int WebRtcSpl_RealInverseFFT(struct RealFFT* self, int16_t* data);
// Output Arguments:
// data_out - the output signal; must be different to data_in.
//
// Return Value:
// 0 - FFT calculation is successful.
// -1 - Error
//
int WebRtcSpl_RealForwardFFTC(struct RealFFT* self,
const int16_t* data_in,
int16_t* data_out);
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
int WebRtcSpl_RealForwardFFTNeon(struct RealFFT* self,
const int16_t* data_in,
int16_t* data_out);
#endif
// Compute the inverse FFT for a complex signal of length 2^order.
// Input Arguments:
// self - pointer to preallocated and initialized FFT specification structure.
// data_in - the input signal.
//
// Output Arguments:
// data_out - the output signal; must be different to data_in.
//
// Return Value:
// 0 or a positive number - a value that the elements in the |data_out| should
// be shifted left with in order to get correct
// physical values.
// -1 - Error
int WebRtcSpl_RealInverseFFTC(struct RealFFT* self,
const int16_t* data_in,
int16_t* data_out);
#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON)
int WebRtcSpl_RealInverseFFTNeon(struct RealFFT* self,
const int16_t* data_in,
int16_t* data_out);
#endif
#ifdef __cplusplus
}

View File

@ -20,12 +20,15 @@ struct RealFFT {
struct RealFFT* WebRtcSpl_CreateRealFFT(int order) {
struct RealFFT* self = NULL;
// This constraint comes from ComplexFFT().
if (order > 10 || order < 0) {
return NULL;
}
self = malloc(sizeof(struct RealFFT));
self->order = order;
return self;
}
@ -33,10 +36,37 @@ void WebRtcSpl_FreeRealFFT(struct RealFFT* self) {
free(self);
}
int WebRtcSpl_RealForwardFFT(struct RealFFT* self, int16_t* data) {
return WebRtcSpl_ComplexFFT(data, self->order, 1);
// WebRtcSpl_ComplexFFT and WebRtcSpl_ComplexIFFT use in-place algorithm,
// so copy data from data_in to data_out in the next two functions.
int WebRtcSpl_RealForwardFFTC(struct RealFFT* self,
const int16_t* data_in,
int16_t* data_out) {
memcpy(data_out, data_in, sizeof(int16_t) * (1 << (self->order + 1)));
WebRtcSpl_ComplexBitReverse(data_out, self->order);
return WebRtcSpl_ComplexFFT(data_out, self->order, 1);
}
int WebRtcSpl_RealInverseFFT(struct RealFFT* self, int16_t* data) {
return WebRtcSpl_ComplexIFFT(data, self->order, 1);
int WebRtcSpl_RealInverseFFTC(struct RealFFT* self,
const int16_t* data_in,
int16_t* data_out) {
memcpy(data_out, data_in, sizeof(int16_t) * (1 << (self->order + 1)));
WebRtcSpl_ComplexBitReverse(data_out, self->order);
return WebRtcSpl_ComplexIFFT(data_out, self->order, 1);
}
#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON)
// TODO(kma): Replace the following function bodies into optimized functions
// for ARM Neon.
int WebRtcSpl_RealForwardFFTNeon(struct RealFFT* self,
const int16_t* data_in,
int16_t* data_out) {
return WebRtcSpl_RealForwardFFTC(self, data_in, data_out);
}
int WebRtcSpl_RealInverseFFTNeon(struct RealFFT* self,
const int16_t* data_in,
int16_t* data_out) {
return WebRtcSpl_RealInverseFFTC(self, data_in, data_out);
}
#endif

View File

@ -17,11 +17,13 @@
namespace webrtc {
namespace {
const int kOrder = 3;
const int kOrder = 4;
const int kLength = 1 << (kOrder + 1); // +1 to hold complex data.
const int16_t kRefData[kLength] = {
11739, -6848, -8688, 31980, -30295, 25242, 27085, 19410, -26299, -15607,
-10791, 11778, -23819, 14498, -25772, 10076
11739, 6848, -8688, 31980, -30295, 25242, 27085, 19410,
-26299, 15607, -10791, 11778, -23819, 14498, -25772, 10076,
1173, 6848, -8688, 31980, -30295, 2522, 27085, 19410,
-2629, 5607, -3, 1178, -23819, 1498, -25772, 10076
};
class RealFFTTest : public ::testing::Test {
@ -38,49 +40,35 @@ TEST_F(RealFFTTest, CreateFailsOnBadInput) {
EXPECT_TRUE(fft == NULL);
}
// TODO(andrew): Look more into why this was failing.
TEST_F(RealFFTTest, DISABLED_TransformIsInvertible) {
int16_t data[kLength] = {0};
memcpy(data, kRefData, sizeof(kRefData));
RealFFT* fft = NULL;
fft = WebRtcSpl_CreateRealFFT(kOrder);
EXPECT_TRUE(fft != NULL);
EXPECT_EQ(0, WebRtcSpl_RealForwardFFT(fft, data));
int scale = WebRtcSpl_RealInverseFFT(fft, data);
EXPECT_GE(scale, 0);
for (int i = 0; i < kLength; i++) {
EXPECT_EQ(data[i] << scale, kRefData[i]);
}
WebRtcSpl_FreeRealFFT(fft);
}
// TODO(andrew): This won't always be the case, but verifies the current code
// at least.
TEST_F(RealFFTTest, RealAndComplexAreIdentical) {
int16_t real_data[kLength] = {0};
int16_t real_data_out[kLength] = {0};
int16_t complex_data[kLength] = {0};
memcpy(real_data, kRefData, sizeof(kRefData));
memcpy(complex_data, kRefData, sizeof(kRefData));
RealFFT* fft = NULL;
fft = WebRtcSpl_CreateRealFFT(kOrder);
RealFFT* fft = WebRtcSpl_CreateRealFFT(kOrder);
EXPECT_TRUE(fft != NULL);
EXPECT_EQ(0, WebRtcSpl_RealForwardFFT(fft, real_data));
EXPECT_EQ(0, WebRtcSpl_RealForwardFFT(fft, real_data, real_data_out));
WebRtcSpl_ComplexBitReverse(complex_data, kOrder);
EXPECT_EQ(0, WebRtcSpl_ComplexFFT(complex_data, kOrder, 1));
for (int i = 0; i < kLength; i++) {
EXPECT_EQ(real_data[i], complex_data[i]);
EXPECT_EQ(real_data_out[i], complex_data[i]);
}
int real_scale = WebRtcSpl_RealInverseFFT(fft, real_data);
int complex_scale = WebRtcSpl_ComplexIFFT(complex_data, kOrder, 1);
memcpy(complex_data, kRefData, sizeof(kRefData));
int real_scale = WebRtcSpl_RealInverseFFT(fft, real_data, real_data_out);
EXPECT_GE(real_scale, 0);
WebRtcSpl_ComplexBitReverse(complex_data, kOrder);
int complex_scale = WebRtcSpl_ComplexIFFT(complex_data, kOrder, 1);
EXPECT_EQ(real_scale, complex_scale);
for (int i = 0; i < kLength; i++) {
EXPECT_EQ(real_data[i], complex_data[i]);
EXPECT_EQ(real_data_out[i], complex_data[i]);
}
WebRtcSpl_FreeRealFFT(fft);
}

View File

@ -18,6 +18,7 @@
* (AEC, NS, codecs etc.).
*/
#include "common_audio/signal_processing/include/real_fft.h"
#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "system_wrappers/interface/cpu_features_wrapper.h"
@ -31,6 +32,8 @@ MinValueW32 WebRtcSpl_MinValueW32;
CrossCorrelation WebRtcSpl_CrossCorrelation;
DownsampleFast WebRtcSpl_DownsampleFast;
ScaleAndAddVectorsWithRound WebRtcSpl_ScaleAndAddVectorsWithRound;
RealForwardFFT WebRtcSpl_RealForwardFFT;
RealInverseFFT WebRtcSpl_RealInverseFFT;
/* Initialize function pointers to the generic C version. */
static void InitPointersToC() {
@ -44,6 +47,8 @@ static void InitPointersToC() {
WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFastC;
WebRtcSpl_ScaleAndAddVectorsWithRound =
WebRtcSpl_ScaleAndAddVectorsWithRoundC;
WebRtcSpl_RealForwardFFT = WebRtcSpl_RealForwardFFTC;
WebRtcSpl_RealInverseFFT = WebRtcSpl_RealInverseFFTC;
}
#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON)
@ -59,6 +64,8 @@ static void InitPointersToNeon() {
WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFastNeon;
WebRtcSpl_ScaleAndAddVectorsWithRound =
WebRtcSpl_ScaleAndAddVectorsWithRoundNeon;
WebRtcSpl_RealForwardFFT = WebRtcSpl_RealForwardFFTNeon;
WebRtcSpl_RealInverseFFT = WebRtcSpl_RealInverseFFTNeon;
}
#endif
@ -76,7 +83,6 @@ static void InitFunctionPointers(void) {
#endif /* WEBRTC_DETECT_ARM_NEON */
}
#if defined(WEBRTC_POSIX)
#include <pthread.h>

View File

@ -14,6 +14,7 @@
#include <stddef.h>
#include <stdlib.h>
#include "common_audio/signal_processing/include/real_fft.h"
#include "cpu_features_wrapper.h"
#include "delay_estimator_wrapper.h"
#include "echo_control_mobile.h"
@ -313,6 +314,13 @@ int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
return -1;
}
aecm->real_fft = WebRtcSpl_CreateRealFFT(PART_LEN_SHIFT);
if (aecm->real_fft == NULL) {
WebRtcAecm_FreeCore(aecm);
aecm = NULL;
return -1;
}
// Init some aecm pointers. 16 and 32 byte alignment is only necessary
// for Neon code currently.
aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 31) & ~ 31);
@ -350,7 +358,8 @@ void WebRtcAecm_InitEchoPathCore(AecmCore_t* aecm, const WebRtc_Word16* echo_pat
aecm->mseChannelCount = 0;
}
static void WindowAndFFTC(WebRtc_Word16* fft,
static void WindowAndFFTC(AecmCore_t* aecm,
WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
int time_signal_scaling)
@ -375,31 +384,14 @@ static void WindowAndFFTC(WebRtc_Word16* fft,
// initialized the array with all zeros
}
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
// Take only the first PART_LEN2 samples
for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
{
freq_signal[i].real = fft[j];
// The imaginary part has to switch sign
freq_signal[i].imag = - fft[j+1];
// Do forward FFT, then take only the first PART_LEN complex samples,
// and change signs of the imaginary parts.
WebRtcSpl_RealForwardFFT(aecm->real_fft, fft, (int16_t*)freq_signal);
for (i = 0; i < PART_LEN; i++) {
freq_signal[i].imag = -freq_signal[i].imag;
}
}
// Initialize function pointers for ARM Neon platform.
#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
static void WebRtcAecm_InitNeon(void)
{
WebRtcAecm_WindowAndFFT = WebRtcAecm_WindowAndFFTNeon;
WebRtcAecm_InverseFFTAndWindow = WebRtcAecm_InverseFFTAndWindowNeon;
WebRtcAecm_CalcLinearEnergies = WebRtcAecm_CalcLinearEnergiesNeon;
WebRtcAecm_StoreAdaptiveChannel = WebRtcAecm_StoreAdaptiveChannelNeon;
WebRtcAecm_ResetAdaptiveChannel = WebRtcAecm_ResetAdaptiveChannelNeon;
}
#endif
static void InverseFFTAndWindowC(AecmCore_t* aecm,
WebRtc_Word16* fft,
complex16_t* efw,
@ -428,32 +420,23 @@ static void InverseFFTAndWindowC(AecmCore_t* aecm,
fft[PART_LEN2] = efw[PART_LEN].real;
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
// inverse FFT, result should be scaled with outCFFT
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
//take only the real values and scale with outCFFT
for (i = 0; i < PART_LEN2; i++)
{
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
fft[i] = fft[j];
}
for (i = 0; i < PART_LEN; i++)
{
fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
fft[i],
// Inverse FFT. Then take only the real values, and keep outCFFT
// to scale the samples in the next block.
outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw);
for (i = 0; i < PART_LEN; i++) {
efw[i].real = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
efw[i].real,
WebRtcAecm_kSqrtHanning[i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)efw[i].real,
outCFFT - aecm->dfaCleanQDomain);
fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
efw[i].real = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
tmp32no1 + aecm->outBuf[i],
WEBRTC_SPL_WORD16_MIN);
output[i] = fft[i];
output[i] = efw[i].real;
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
fft[PART_LEN + i],
efw[PART_LEN + i].real,
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
@ -542,6 +525,19 @@ static void ResetAdaptiveChannelC(AecmCore_t* aecm)
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
}
// Initialize function pointers for ARM Neon platform.
#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
static void WebRtcAecm_InitNeon(void)
{
// TODO(kma): Check why WebRtcAecm_InverseFFTAndWindowNeon() doesn't work.
WebRtcAecm_WindowAndFFT = WebRtcAecm_WindowAndFFTNeon;
WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC;
WebRtcAecm_StoreAdaptiveChannel = WebRtcAecm_StoreAdaptiveChannelNeon;
WebRtcAecm_ResetAdaptiveChannel = WebRtcAecm_ResetAdaptiveChannelNeon;
WebRtcAecm_CalcLinearEnergies = WebRtcAecm_CalcLinearEnergiesNeon;
}
#endif
// WebRtcAecm_InitCore(...)
//
// This function initializes the AECM instant created with WebRtcAecm_CreateCore(...)
@ -704,6 +700,8 @@ int WebRtcAecm_FreeCore(AecmCore_t *aecm)
WebRtc_FreeBuffer(aecm->outFrameBuf);
WebRtc_FreeDelayEstimator(aecm->delay_estimator);
WebRtcSpl_FreeRealFFT(aecm->real_fft);
free(aecm);
return 0;
@ -1376,7 +1374,8 @@ static WebRtc_Word16 CalcSuppressionGain(AecmCore_t * const aecm)
// the frequency domain array
// return value The Q-domain of current frequency values
//
static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
static int TimeToFrequencyDomain(AecmCore_t* aecm,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
WebRtc_UWord16* freq_signal_abs,
WebRtc_UWord32* freq_signal_sum_abs)
@ -1407,12 +1406,11 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
#endif
WebRtcAecm_WindowAndFFT(fft, time_signal, freq_signal, time_signal_scaling);
WebRtcAecm_WindowAndFFT(aecm, fft, time_signal, freq_signal, time_signal_scaling);
// Extract imaginary and real part, calculate the magnitude for all frequency bins
freq_signal[0].imag = 0;
freq_signal[PART_LEN].imag = 0;
freq_signal[PART_LEN].real = fft[PART_LEN2];
freq_signal_abs[0] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16(
freq_signal[0].real);
freq_signal_abs[PART_LEN] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16(
@ -1530,8 +1528,8 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm,
// TODO (kma): define fft with complex16_t.
WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 16]; // +2 to make a loop safe.
WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8];
WebRtc_Word32 dfw_buf[PART_LEN1 + 8];
WebRtc_Word32 efw_buf[PART_LEN1 + 8];
WebRtc_Word32 dfw_buf[PART_LEN2 + 8];
WebRtc_Word32 efw_buf[PART_LEN2 + 8];
WebRtc_Word16* fft = (WebRtc_Word16*) (((uintptr_t) fft_buf + 31) & ~ 31);
WebRtc_Word32* echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31);
@ -1575,13 +1573,15 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm,
}
// Transform far end signal from time domain to frequency domain.
far_q = TimeToFrequencyDomain(aecm->xBuf,
far_q = TimeToFrequencyDomain(aecm,
aecm->xBuf,
dfw,
xfa,
&xfaSum);
// Transform noisy near end signal from time domain to frequency domain.
zerosDBufNoisy = TimeToFrequencyDomain(aecm->dBufNoisy,
zerosDBufNoisy = TimeToFrequencyDomain(aecm,
aecm->dBufNoisy,
dfw,
dfaNoisy,
&dfaNoisySum);
@ -1598,7 +1598,8 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm,
} else
{
// Transform clean near end signal from time domain to frequency domain.
zerosDBufClean = TimeToFrequencyDomain(aecm->dBufClean,
zerosDBufClean = TimeToFrequencyDomain(aecm,
aecm->dBufClean,
dfw,
dfaClean,
&dfaCleanSum);

View File

@ -117,6 +117,8 @@ typedef struct
WebRtc_Word16 supGainErrParamDiffAB;
WebRtc_Word16 supGainErrParamDiffBD;
struct RealFFT* real_fft;
#ifdef AEC_DEBUG
FILE *farFile;
FILE *nearFile;
@ -276,6 +278,7 @@ typedef void (*ResetAdaptiveChannel)(AecmCore_t* aecm);
extern ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel;
typedef void (*WindowAndFFT)(
AecmCore_t* aecm,
WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
@ -293,7 +296,8 @@ extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
// and defined as static in file aecm_core.c, while those for ARM Neon platforms
// are declared below and defined in file aecm_core_neon.s.
#if (defined WEBRTC_DETECT_ARM_NEON) || defined (WEBRTC_ARCH_ARM_NEON)
void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft,
void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm,
WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
int time_signal_scaling);

View File

@ -13,6 +13,10 @@
#include <arm_neon.h>
#include <assert.h>
#include "common_audio/signal_processing/include/real_fft.h"
// TODO(kma): Re-write the corresponding assembly file, the offset
// generating script and makefile, to replace these C functions.
// Square root of Hanning window in Q14.
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = {
@ -34,55 +38,74 @@ static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) =
1594, 1196, 798, 399
};
void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft,
void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm,
WebRtc_Word16* fft,
const WebRtc_Word16* time_signal,
complex16_t* freq_signal,
int time_signal_scaling) {
int i, j;
int i = 0;
const int16_t* p_time_signal = time_signal;
const int16_t* p_time_signal_offset = &time_signal[PART_LEN];
const int16_t* p_hanning = WebRtcAecm_kSqrtHanning;
const int16_t* p_hanning_reversed = kSqrtHanningReversed;
int16_t* p_fft = fft;
int16_t* p_fft_offset = &fft[PART_LEN2];
int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
__asm__("vmov.i16 d21, #0" ::: "d21");
assert((uintptr_t)p_time_signal % 8 == 0);
assert((uintptr_t)freq_signal % 32 == 0);
assert((uintptr_t)p_hanning % 8 == 0);
assert((uintptr_t)p_fft % 16 == 0);
for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
int16x4_t tmp16x4_0;
int16x4_t tmp16x4_1;
int32x4_t tmp32x4_0;
__asm __volatile(
"vdup.16 d16, %0\n\t"
"vmov.i16 d21, #0\n\t"
"vmov.i16 d27, #0\n\t"
:
:"r"(time_signal_scaling)
: "d16", "d21", "d27"
);
/* Window near end */
// fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
// << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
// fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
// (time_signal[PART_LEN + i] << time_signal_scaling),
// WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN]));
tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
__asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
__asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
for (i = 0; i < PART_LEN; i += 4) {
__asm __volatile(
"vld1.16 d0, [%[p_time_signal], :64]!\n\t"
"vld1.16 d22, [%[p_time_signal_offset], :64]!\n\t"
"vld1.16 d17, [%[p_hanning], :64]!\n\t"
"vld1.16 d23, [%[p_hanning_reversed], :64]!\n\t"
"vshl.s16 d18, d0, d16\n\t"
"vshl.s16 d22, d22, d16\n\t"
"vmull.s16 q9, d18, d17\n\t"
"vmull.s16 q12, d22, d23\n\t"
"vshrn.i32 d20, q9, #14\n\t"
"vshrn.i32 d26, q12, #14\n\t"
"vst2.16 {d20, d21}, [%[p_fft], :128]!\n\t"
"vst2.16 {d26, d27}, [%[p_fft_offset], :128]!\n\t"
:[p_time_signal]"+r"(p_time_signal),
[p_time_signal_offset]"+r"(p_time_signal_offset),
[p_hanning]"+r"(p_hanning),
[p_hanning_reversed]"+r"(p_hanning_reversed),
[p_fft]"+r"(p_fft),
[p_fft_offset]"+r"(p_fft_offset)
:
:"d0", "d16", "d17", "d18", "d19", "d20", "d21",
"d22", "d23", "d24", "d25", "d26", "d27"
);
}
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
// Do forward FFT, then take only the first PART_LEN complex samples,
// and change signs of the imaginary parts.
WebRtcSpl_RealForwardFFT(aecm->real_fft, (int16_t*)fft,
(int16_t*)freq_signal);
// Take only the first PART_LEN2 samples, and switch the sign of the imaginary part.
for (i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) {
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
__asm__("vneg.s16 d22, d22" : : : "q10");
__asm__("vneg.s16 d23, d23" : : : "q11");
__asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&freq_signal[i].real): "q10", "q11");
for (i = 0; i < PART_LEN; i += 8) {
__asm __volatile(
"vld2.16 {d20, d21, d22, d23}, [%[freq_signal], :256]\n\t"
"vneg.s16 d22, d22\n\t"
"vneg.s16 d23, d23\n\t"
"vst2.16 {d20, d21, d22, d23}, [%[freq_signal], :256]!\n\t"
:[freq_signal]"+r"(freq_signal)
:
: "d20", "d21", "d22", "d23"
);
}
}
@ -93,34 +116,47 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
const WebRtc_Word16* nearendClean) {
int i, j, outCFFT;
assert((uintptr_t)efw % 32 == 0);
assert((uintptr_t)fft % 16 == 0);
assert((uintptr_t)output% 8 == 0);
assert((uintptr_t)WebRtcAecm_kSqrtHanning % 8 == 0);
assert((uintptr_t)kSqrtHanningReversed % 8 == 0);
assert((uintptr_t)(aecm->outBuf) % 8 == 0);
assert((uintptr_t)(aecm->xBuf) % 32 == 0);
assert((uintptr_t)(aecm->dBufNoisy) % 32 == 0);
assert((uintptr_t)(aecm->dBufClean) % 32 == 0);
// Synthesis
complex16_t* p_efw = efw;
int16_t* p_fft = fft;
int16_t* p_fft_offset = &fft[PART_LEN4 - 6];
for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
// We overwrite two more elements in fft[], but it's ok.
__asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10");
__asm__("vmov q11, q10" : : : "q10", "q11");
__asm__("vneg.s16 d23, d23" : : : "q11");
__asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11");
__asm__("vrev64.16 q10, q10" : : : "q10");
__asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10");
__asm __volatile(
"vld2.16 {q10}, [%[p_efw], :128]!\n\t"
"vmov q11, q10\n\t"
"vneg.s16 d23, d23\n\t"
"vst2.16 {d22, d23}, [%[p_fft], :128]!\n\t"
"vrev64.16 q10, q10\n\t"
"vst2.16 {q10}, [%[p_fft_offset], %[offset]]\n\t"
:[p_efw]"+r"(p_efw),
[p_fft]"+r"(p_fft),
[p_fft_offset]"+r"(p_fft_offset)
:[offset]"r"(-16)
:"d20", "d21", "d22", "d23"
);
}
fft[PART_LEN2] = efw[PART_LEN].real;
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
// Inverse FFT, result should be scaled with outCFFT.
WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
// Take only the real values and scale with outCFFT.
for (i = 0, j = 0; i < PART_LEN2; i += 8, j += 16) {
__asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
__asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10");
}
// Inverse FFT. Then take only the real values, and keep outCFFT
// to scale the samples.
outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw);
int32x4_t tmp32x4_2;
__asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
__asm __volatile("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
(outCFFT - aecm->dfaCleanQDomain)));
for (i = 0; i < PART_LEN; i += 4) {
int16x4_t tmp16x4_0;
@ -128,59 +164,59 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
int32x4_t tmp32x4_0;
int32x4_t tmp32x4_1;
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
// fft[i], WebRtcAecm_kSqrtHanning[i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i]));
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
__asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
//efw[i].real = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
// efw[i].real, WebRtcAecm_kSqrtHanning[i], 14);
__asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&efw[i].real));
__asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
__asm __volatile("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
__asm __volatile("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
// tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
//tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)efw[i].real,
// outCFFT - aecm->dfaCleanQDomain);
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
__asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
// fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
// tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN);
// output[i] = fft[i];
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
__asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
__asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i]));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
//efw[i].real = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
// tmp32no1 + aecm->outBuf[i], WEBRTC_SPL_WORD16_MIN);
// output[i] = efw[i].real;
__asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
__asm __volatile("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
__asm __volatile("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
__asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&efw[i].real));
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
// tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
// fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i]));
__asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
__asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
__asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
// efw[PART_LEN + i].real, WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
__asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&efw[PART_LEN + i].real));
__asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
__asm __volatile("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
__asm __volatile("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
// tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain);
__asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
// outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
// WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
__asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
__asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
// aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
// WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
__asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
__asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
}
// Copy the current block to the old position (outBuf is shifted elsewhere).
for (i = 0; i < PART_LEN; i += 16) {
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
__asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->xBuf[i + PART_LEN]) : "q10");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
__asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
}
for (i = 0; i < PART_LEN; i += 16) {
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
__asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
__asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufNoisy[i]): "q10");
}
if (nearendClean != NULL) {
for (i = 0; i < PART_LEN; i += 16) {
__asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
__asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
__asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->dBufClean[i]): "q10");
}
}
@ -198,48 +234,54 @@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
register WebRtc_UWord32 echo_energy_stored_r;
register WebRtc_UWord32 echo_energy_adapt_r;
__asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy
__asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored
__asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt
assert((uintptr_t)echo_est % 32 == 0);
assert((uintptr_t)(aecm->channelStored) % 16 == 0);
assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
assert((uintptr_t)(aecm->channelStored) % 16 == 0);
assert((uintptr_t)(aecm->channelStored) % 16 == 0);
__asm __volatile("vmov.i32 q14, #0" : : : "q14"); // far_energy
__asm __volatile("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored
__asm __volatile("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt
for (i = 0; i < PART_LEN - 7; i += 8) {
// far_energy += (WebRtc_UWord32)(far_spectrum[i]);
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
__asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
__asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm __volatile("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
__asm __volatile("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
// Get estimated echo energies for adaptive channel and stored channel.
// echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
__asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm __volatile("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
"q10", "q11");
// echo_energy_stored += (WebRtc_UWord32)echoEst[i];
__asm__("vadd.u32 q8, q10" : : : "q10", "q8");
__asm__("vadd.u32 q8, q11" : : : "q11", "q8");
__asm __volatile("vadd.u32 q8, q10" : : : "q10", "q8");
__asm __volatile("vadd.u32 q8, q11" : : : "q11", "q8");
// echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(
// aecm->channelAdapt16[i], far_spectrum[i]);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vadd.u32 q9, q10" : : : "q9", "q15");
__asm__("vadd.u32 q9, q11" : : : "q9", "q11");
__asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm __volatile("vadd.u32 q9, q10" : : : "q9", "q15");
__asm __volatile("vadd.u32 q9, q11" : : : "q9", "q11");
}
__asm__("vadd.u32 d28, d29" : : : "q14");
__asm__("vpadd.u32 d28, d28" : : : "q14");
__asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
__asm __volatile("vadd.u32 d28, d29" : : : "q14");
__asm __volatile("vpadd.u32 d28, d28" : : : "q14");
__asm __volatile("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14");
__asm__("vadd.u32 d18, d19" : : : "q9");
__asm__("vpadd.u32 d18, d18" : : : "q9");
__asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
__asm __volatile("vadd.u32 d18, d19" : : : "q9");
__asm __volatile("vpadd.u32 d18, d18" : : : "q9");
__asm __volatile("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9");
__asm__("vadd.u32 d16, d17" : : : "q8");
__asm__("vpadd.u32 d16, d16" : : : "q8");
__asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
__asm __volatile("vadd.u32 d16, d17" : : : "q8");
__asm __volatile("vpadd.u32 d16, d16" : : : "q8");
__asm __volatile("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
// Get estimated echo energies for adaptive channel and stored channel.
echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
@ -254,17 +296,21 @@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
WebRtc_Word32* echo_est) {
int i;
assert((uintptr_t)echo_est % 32 == 0);
assert((uintptr_t)(aecm->channelStored) % 16 == 0);
assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
// During startup we store the channel every block.
// Recalculate echo estimate.
for (i = 0; i < PART_LEN - 7; i += 8) {
// aecm->channelStored[i] = acem->channelAdapt16[i];
// echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
__asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
__asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
__asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
__asm __volatile("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
__asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
__asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
__asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&echo_est[i]) : "q10", "q11");
}
aecm->channelStored[i] = aecm->channelAdapt16[i];
@ -274,21 +320,24 @@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
int i;
assert((uintptr_t)(aecm->channelStored) % 16 == 0);
assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
assert((uintptr_t)(aecm->channelAdapt32) % 32 == 0);
for (i = 0; i < PART_LEN - 7; i += 8) {
// aecm->channelAdapt16[i] = aecm->channelStored[i];
// aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
// aecm->channelStored[i], 16);
__asm__("vld1.16 {d24, d25}, [%0, :128]" : :
__asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : :
"r"(&aecm->channelStored[i]) : "q12");
__asm__("vst1.16 {d24, d25}, [%0, :128]" : :
__asm __volatile("vst1.16 {d24, d25}, [%0, :128]" : :
"r"(&aecm->channelAdapt16[i]) : "q12");
__asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
__asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
__asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
__asm __volatile("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
__asm __volatile("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
__asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
"r"(&aecm->channelAdapt32[i]): "q10", "q11");
}
aecm->channelAdapt16[i] = aecm->channelStored[i];
aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
(WebRtc_Word32)aecm->channelStored[i], 16);
}

View File

@ -8,18 +8,22 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include <string.h>
#include "noise_suppression_x.h"
#include <stdlib.h>
#include "common_audio/signal_processing/include/real_fft.h"
#include "nsx_core.h"
#include "nsx_defines.h"
int WebRtcNsx_Create(NsxHandle** nsxInst) {
*nsxInst = (NsxHandle*)malloc(sizeof(NsxInst_t));
if (*nsxInst != NULL) {
NsxInst_t* self = malloc(sizeof(NsxInst_t));
*nsxInst = (NsxHandle*)self;
if (self != NULL) {
WebRtcSpl_Init();
(*(NsxInst_t**)nsxInst)->initFlag = 0;
self->real_fft = NULL;
self->initFlag = 0;
return 0;
} else {
return -1;
@ -28,6 +32,7 @@ int WebRtcNsx_Create(NsxHandle** nsxInst) {
}
int WebRtcNsx_Free(NsxHandle* nsxInst) {
WebRtcSpl_FreeRealFFT(((NsxInst_t*)nsxInst)->real_fft);
free(nsxInst);
return 0;
}

View File

@ -16,6 +16,7 @@
#include <stdlib.h>
#include <stdio.h>
#include "common_audio/signal_processing/include/real_fft.h"
#include "cpu_features_wrapper.h"
#include "nsx_core.h"
@ -794,6 +795,14 @@ WebRtc_Word32 WebRtcNsx_InitCore(NsxInst_t* inst, WebRtc_UWord32 fs) {
inst->anaLen2 = WEBRTC_SPL_RSHIFT_W16(inst->anaLen, 1);
inst->magnLen = inst->anaLen2 + 1;
if (inst->real_fft != NULL) {
WebRtcSpl_FreeRealFFT(inst->real_fft);
}
inst->real_fft = WebRtcSpl_CreateRealFFT(inst->stages);
if (inst->real_fft == NULL) {
return -1;
}
WebRtcSpl_ZerosArrayW16(inst->analysisBuffer, ANAL_BLOCKL_MAX);
WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer, ANAL_BLOCKL_MAX);
@ -1548,8 +1557,7 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16*
WebRtc_Word16 frac = 0;
WebRtc_Word16 log2 = 0;
WebRtc_Word16 matrix_determinant = 0;
WebRtc_Word16 winData[ANAL_BLOCKL_MAX], maxWinData;
WebRtc_Word16 realImag[ANAL_BLOCKL_MAX << 1];
WebRtc_Word16 maxWinData;
int i, j;
int zeros;
@ -1557,6 +1565,13 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16*
int right_shifts_in_magnU16 = 0;
int right_shifts_in_initMagnEst = 0;
int16_t winData_buff[ANAL_BLOCKL_MAX * 2 + 16];
int16_t realImag_buff[ANAL_BLOCKL_MAX * 2 + 16];
// Align the structures to 32-byte boundary for the FFT function.
int16_t* winData = (int16_t*) (((uintptr_t)winData_buff + 31) & ~31);
int16_t* realImag = (int16_t*) (((uintptr_t) realImag_buff + 31) & ~31);
// Update analysis buffer for lower band, and window data before FFT.
WebRtcNsx_AnalysisUpdate(inst, winData, speechFrame);
@ -1585,14 +1600,13 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16*
// create realImag as winData interleaved with zeros (= imag. part), normalize it
WebRtcNsx_CreateComplexBuffer(inst, winData, realImag);
// bit-reverse position of elements in array and FFT the array
WebRtcSpl_ComplexBitReverse(realImag, inst->stages); // Q(normData-stages)
WebRtcSpl_ComplexFFT(realImag, inst->stages, 1);
// FFT output will be in winData[].
WebRtcSpl_RealForwardFFT(inst->real_fft, realImag, winData);
inst->imag[0] = 0; // Q(normData-stages)
inst->imag[inst->anaLen2] = 0;
inst->real[0] = realImag[0]; // Q(normData-stages)
inst->real[inst->anaLen2] = realImag[inst->anaLen];
inst->real[0] = winData[0]; // Q(normData-stages)
inst->real[inst->anaLen2] = winData[inst->anaLen];
// Q(2*(normData-stages))
inst->magnEnergy = (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(inst->real[0], inst->real[0]);
inst->magnEnergy += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(inst->real[inst->anaLen2],
@ -1604,12 +1618,12 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16*
if (inst->blockIndex >= END_STARTUP_SHORT) {
for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
inst->real[i] = realImag[j];
inst->imag[i] = -realImag[j + 1];
inst->real[i] = winData[j];
inst->imag[i] = -winData[j + 1];
// magnitude spectrum
// energy in Q(2*(normData-stages))
tmpU32no1 = (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(realImag[j], realImag[j]);
tmpU32no1 += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(realImag[j + 1], realImag[j + 1]);
tmpU32no1 = (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(winData[j], winData[j]);
tmpU32no1 += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(winData[j + 1], winData[j + 1]);
inst->magnEnergy += tmpU32no1; // Q(2*(normData-stages))
magnU16[i] = (WebRtc_UWord16)WebRtcSpl_SqrtFloor(tmpU32no1); // Q(normData-stages)
@ -1653,12 +1667,12 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16*
sum_log_i_log_magn = (WEBRTC_SPL_MUL_16_16(kLogIndex[inst->anaLen2], log2) >> 3);
for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
inst->real[i] = realImag[j];
inst->imag[i] = -realImag[j + 1];
inst->real[i] = winData[j];
inst->imag[i] = -winData[j + 1];
// magnitude spectrum
// energy in Q(2*(normData-stages))
tmpU32no1 = (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(realImag[j], realImag[j]);
tmpU32no1 += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(realImag[j + 1], realImag[j + 1]);
tmpU32no1 = (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(winData[j], winData[j]);
tmpU32no1 += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(winData[j + 1], winData[j + 1]);
inst->magnEnergy += tmpU32no1; // Q(2*(normData-stages))
magnU16[i] = (WebRtc_UWord16)WebRtcSpl_SqrtFloor(tmpU32no1); // Q(normData-stages)
@ -1780,7 +1794,13 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16*
void WebRtcNsx_DataSynthesis(NsxInst_t* inst, short* outFrame) {
WebRtc_Word32 energyOut;
WebRtc_Word16 realImag[ANAL_BLOCKL_MAX << 1];
int16_t realImag_buff[ANAL_BLOCKL_MAX * 2 + 16];
int16_t rfft_out_buff[ANAL_BLOCKL_MAX * 2 + 16];
// Align the structures to 32-byte boundary for the FFT function.
int16_t* realImag = (int16_t*) (((uintptr_t)realImag_buff + 31) & ~31);
int16_t* rfft_out = (int16_t*) (((uintptr_t) rfft_out_buff + 31) & ~31);
WebRtc_Word16 tmp16no1, tmp16no2;
WebRtc_Word16 energyRatio;
WebRtc_Word16 gainFactor, gainFactor1, gainFactor2;
@ -1807,12 +1827,11 @@ void WebRtcNsx_DataSynthesis(NsxInst_t* inst, short* outFrame) {
// Filter the data in the frequency domain, and create spectrum.
WebRtcNsx_PrepareSpectrum(inst, realImag);
// bit-reverse position of elements in array and IFFT it
WebRtcSpl_ComplexBitReverse(realImag, inst->stages);
outCIFFT = WebRtcSpl_ComplexIFFT(realImag, inst->stages, 1);
// Inverse FFT output will be in rfft_out[].
outCIFFT = WebRtcSpl_RealInverseFFT(inst->real_fft, realImag, rfft_out);
// Denormalize.
WebRtcNsx_Denormalize(inst, realImag, outCIFFT);
WebRtcNsx_Denormalize(inst, rfft_out, outCIFFT);
//scale factor: only do it after END_STARTUP_LONG time
gainFactor = 8192; // 8192 = Q13(1.0)

View File

@ -99,6 +99,7 @@ typedef struct NsxInst_t_ {
int scaleEnergyIn;
int normData;
struct RealFFT* real_fft;
} NsxInst_t;
#ifdef __cplusplus