NetEq: Change NetEq's ramp-up behavior after expansions

NetEq tapers down the audio produced through loss concealment when the
expansion has been going on for some time. When the audio packets start
coming in again, there is a ramp-up that happens. Before this change,
this ramp-up could extend over more than one 10 ms block, which made
keeping track of the scaling factor necessary. With this change, we make
this ramp-up quicker in the rare cases when it lasted more than 10 ms,
so that it always ramps up to 100% within one block. This way, we can
remove the mute_factor_array.

This change breaks bit-exactness, but careful listening could not reveal
an audible difference.

This change is a part of a larger refactoring of NetEq's PLC code.

Bug: webrtc:9180
Change-Id: I4c513ce3ed8d66f9beec2abfb1f0c7ffaac7a21e
Reviewed-on: https://webrtc-review.googlesource.com/77180
Commit-Queue: Henrik Lundin <henrik.lundin@webrtc.org>
Reviewed-by: Minyue Li <minyue@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#23342}
This commit is contained in:
Henrik Lundin 2018-05-22 10:40:23 +02:00 committed by Commit Bot
parent 7a84fcf47a
commit 6dc82e8f8b
9 changed files with 81 additions and 150 deletions

View File

@ -984,35 +984,35 @@ class AcmReceiverBitExactnessOldApi : public ::testing::Test {
#if (defined(WEBRTC_CODEC_ISAC) || defined(WEBRTC_CODEC_ISACFX)) && \
defined(WEBRTC_CODEC_ILBC)
TEST_F(AcmReceiverBitExactnessOldApi, 8kHzOutput) {
Run(8000, PlatformChecksum("2adede965c6f87de7142c51552111d08",
"028c0fc414b1c9ab7e582dccdf381e98",
"36c95170c1393d4b765d1c17b61ef977",
Run(8000, PlatformChecksum("7294941b62293e143d6d6c84955923fd",
"f26b8c9aa8257c7185925fa5b102f46a",
"65e5ef5c3de9c2abf3c8d0989772e9fc",
"4598140b5e4f7ee66c5adad609e65a3e",
"bac5db6dff44323be401060f1279a532"));
"04a1d3e735730b6d7dbd5df10f717d27"));
}
TEST_F(AcmReceiverBitExactnessOldApi, 16kHzOutput) {
Run(16000, PlatformChecksum("c2550a3db7632de409e8db0093df1c12",
"edd31f4b6665cd5b9041fb93f2316594",
"22128bca51650cb61c80bed63b595603",
Run(16000, PlatformChecksum("de8143dd3cc23241f1e1d5cf14e04b8a",
"eada3f321120d8d5afffbe4170a55d2f",
"135d8c3c7b92aa13d45cad7c91068e3e",
"f2aad418af974a3b1694d5ae5cc2c3c7",
"61c3cb9386b9503feebcb829c9be54bd"));
"728b69068332efade35b1a9c32029e1b"));
}
TEST_F(AcmReceiverBitExactnessOldApi, 32kHzOutput) {
Run(32000, PlatformChecksum("85e28d7950132d56f90b099c90f82153",
"7b903f5c89997f271b405e63c245ef45",
"8b8fc6c6fd1dcdcfb3dd90e1ce597f10",
Run(32000, PlatformChecksum("521d336237bdcc9ab44050e9da8917fc",
"73d44a7bedb6dfa7c70cf997223d8c10",
"f3ee2f14b03fb8e98f526f82583f84d9",
"100869c8dcde51346c2073e52a272d98",
"fdec5301dc649a47d407382b587e14da"));
"5f338b4bc38707d0a14d75a357e1546e"));
}
TEST_F(AcmReceiverBitExactnessOldApi, 48kHzOutput) {
Run(48000, PlatformChecksum("ab611510e8fd6d5210a23cc04d3f0e8e",
"d8609bc9b495d81f29779344c68bcc47",
"ec5ebb90cda0ea5bb89e79d698af65de",
Run(48000, PlatformChecksum("5955e31373828969de7fb308fb58a84e",
"83c0eca235b1a806426ff6ca8655cdf7",
"1126a8c03d1ebc6aa7348b9c541e2082",
"bd44bf97e7899186532f91235cef444d",
"0baae2972cca142027d4af44f95f0bd5"));
"9d092dbc96e7ef6870b78c1056e87315"));
}
TEST_F(AcmReceiverBitExactnessOldApi, 48kHzOutputExternalDecoder) {
@ -1094,11 +1094,11 @@ TEST_F(AcmReceiverBitExactnessOldApi, 48kHzOutputExternalDecoder) {
rtc::scoped_refptr<rtc::RefCountedObject<ADFactory>> factory(
new rtc::RefCountedObject<ADFactory>);
Run(48000, PlatformChecksum("ab611510e8fd6d5210a23cc04d3f0e8e",
"d8609bc9b495d81f29779344c68bcc47",
"ec5ebb90cda0ea5bb89e79d698af65de",
Run(48000, PlatformChecksum("5955e31373828969de7fb308fb58a84e",
"83c0eca235b1a806426ff6ca8655cdf7",
"1126a8c03d1ebc6aa7348b9c541e2082",
"bd44bf97e7899186532f91235cef444d",
"0baae2972cca142027d4af44f95f0bd5"),
"9d092dbc96e7ef6870b78c1056e87315"),
factory, [](AudioCodingModule* acm) {
acm->RegisterReceiveCodec(0, {"MockPCMu", 8000, 1});
});

View File

@ -44,7 +44,6 @@ Merge::Merge(int fs_hz,
Merge::~Merge() = default;
size_t Merge::Process(int16_t* input, size_t input_length,
int16_t* external_mute_factor_array,
AudioMultiVector* output) {
// TODO(hlundin): Change to an enumerator and skip assert.
assert(fs_hz_ == 8000 || fs_hz_ == 16000 || fs_hz_ == 32000 ||
@ -73,20 +72,9 @@ size_t Merge::Process(int16_t* input, size_t input_length,
input_length_per_channel, 0, input_channel.get());
expanded_[channel].CopyTo(expanded_length, 0, expanded_channel.get());
int16_t new_mute_factor = SignalScaling(
input_channel.get(), input_length_per_channel, expanded_channel.get());
// Adjust muting factor (product of "main" muting factor and expand muting
// factor).
int16_t* external_mute_factor = &external_mute_factor_array[channel];
*external_mute_factor =
(*external_mute_factor * expand_->MuteFactor(channel)) >> 14;
// Update |external_mute_factor| if it is lower than |new_mute_factor|.
if (new_mute_factor > *external_mute_factor) {
*external_mute_factor = std::min(new_mute_factor,
static_cast<int16_t>(16384));
}
const int16_t new_mute_factor = std::min<int16_t>(
16384, SignalScaling(input_channel.get(), input_length_per_channel,
expanded_channel.get()));
if (channel == 0) {
// Downsample, correlate, and find strongest correlation period for the
@ -110,18 +98,24 @@ size_t Merge::Process(int16_t* input, size_t input_length,
expanded_length - best_correlation_index);
interpolation_length = std::min(interpolation_length,
input_length_per_channel);
if (*external_mute_factor < 16384) {
RTC_DCHECK_LE(new_mute_factor, 16384);
int16_t mute_factor =
std::max(expand_->MuteFactor(channel), new_mute_factor);
RTC_DCHECK_GE(mute_factor, 0);
if (mute_factor < 16384) {
// Set a suitable muting slope (Q20). 0.004 for NB, 0.002 for WB,
// and so on.
int increment = 4194 / fs_mult_;
*external_mute_factor =
static_cast<int16_t>(DspHelper::RampSignal(input_channel.get(),
interpolation_length,
*external_mute_factor,
increment));
// and so on, or as fast as it takes to come back to full gain within the
// frame length.
const int back_to_fullscale_inc = static_cast<int>(
((16384 - mute_factor) << 6) / input_length_per_channel);
const int increment = std::max(4194 / fs_mult_, back_to_fullscale_inc);
mute_factor = static_cast<int16_t>(DspHelper::RampSignal(
input_channel.get(), interpolation_length, mute_factor, increment));
DspHelper::UnmuteSignal(&input_channel[interpolation_length],
input_length_per_channel - interpolation_length,
external_mute_factor, increment,
&mute_factor, increment,
&decoded_output[interpolation_length]);
} else {
// No muting needed.
@ -134,12 +128,12 @@ size_t Merge::Process(int16_t* input, size_t input_length,
// Do overlap and mix linearly.
int16_t increment =
static_cast<int16_t>(16384 / (interpolation_length + 1)); // In Q14.
int16_t mute_factor = 16384 - increment;
int16_t local_mute_factor = 16384 - increment;
memmove(temp_data_.data(), expanded_channel.get(),
sizeof(int16_t) * best_correlation_index);
DspHelper::CrossFade(&expanded_channel[best_correlation_index],
input_channel.get(), interpolation_length,
&mute_factor, increment, decoded_output);
&local_mute_factor, increment, decoded_output);
output_length = best_correlation_index + input_length_per_channel;
if (channel == 0) {

View File

@ -43,11 +43,8 @@ class Merge {
// |input|, having |input_length| samples in total for all channels
// (interleaved). The result is written to |output|. The number of channels
// allocated in |output| defines the number of channels that will be used when
// de-interleaving |input|. The values in |external_mute_factor_array| (Q14)
// will be used to scale the audio, and is updated in the process. The array
// must have |num_channels_| elements.
// de-interleaving |input|.
virtual size_t Process(int16_t* input, size_t input_length,
int16_t* external_mute_factor_array,
AudioMultiVector* output);
virtual size_t RequiredFutureSamples();

View File

@ -1536,9 +1536,8 @@ int NetEqImpl::DecodeLoop(PacketList* packet_list, const Operations& operation,
void NetEqImpl::DoNormal(const int16_t* decoded_buffer, size_t decoded_length,
AudioDecoder::SpeechType speech_type, bool play_dtmf) {
assert(normal_.get());
assert(mute_factor_array_.get());
normal_->Process(decoded_buffer, decoded_length, last_mode_,
mute_factor_array_.get(), algorithm_buffer_.get());
algorithm_buffer_.get());
if (decoded_length != 0) {
last_mode_ = kModeNormal;
}
@ -1558,10 +1557,8 @@ void NetEqImpl::DoNormal(const int16_t* decoded_buffer, size_t decoded_length,
void NetEqImpl::DoMerge(int16_t* decoded_buffer, size_t decoded_length,
AudioDecoder::SpeechType speech_type, bool play_dtmf) {
assert(mute_factor_array_.get());
assert(merge_.get());
size_t new_length = merge_->Process(decoded_buffer, decoded_length,
mute_factor_array_.get(),
algorithm_buffer_.get());
// Correction can be negative.
int expand_length_correction =
@ -1803,9 +1800,8 @@ int NetEqImpl::DoRfc3389Cng(PacketList* packet_list, bool play_dtmf) {
void NetEqImpl::DoCodecInternalCng(const int16_t* decoded_buffer,
size_t decoded_length) {
RTC_DCHECK(normal_.get());
RTC_DCHECK(mute_factor_array_.get());
normal_->Process(decoded_buffer, decoded_length, last_mode_,
mute_factor_array_.get(), algorithm_buffer_.get());
algorithm_buffer_.get());
last_mode_ = kModeCodecInternalCng;
expand_->Reset();
}
@ -2065,12 +2061,6 @@ void NetEqImpl::SetSampleRateAndChannels(int fs_hz, size_t channels) {
last_mode_ = kModeNormal;
// Create a new array of mute factors and set all to 1.
mute_factor_array_.reset(new int16_t[channels]);
for (size_t i = 0; i < channels; ++i) {
mute_factor_array_[i] = 16384; // 1.0 in Q14.
}
ComfortNoiseDecoder* cng_decoder = decoder_database_->GetActiveCngDecoder();
if (cng_decoder)
cng_decoder->Reset();

View File

@ -418,7 +418,6 @@ class NetEqImpl : public webrtc::NetEq {
size_t decoder_frame_length_ RTC_GUARDED_BY(crit_sect_);
Modes last_mode_ RTC_GUARDED_BY(crit_sect_);
Operations last_operation_ RTC_GUARDED_BY(crit_sect_);
std::unique_ptr<int16_t[]> mute_factor_array_ RTC_GUARDED_BY(crit_sect_);
size_t decoded_buffer_length_ RTC_GUARDED_BY(crit_sect_);
std::unique_ptr<int16_t[]> decoded_buffer_ RTC_GUARDED_BY(crit_sect_);
uint32_t playout_timestamp_ RTC_GUARDED_BY(crit_sect_);

View File

@ -463,18 +463,18 @@ TEST_F(NetEqDecodingTest, MAYBE_TestBitExactness) {
webrtc::test::ResourcePath("audio_coding/neteq_universal_new", "rtp");
const std::string output_checksum = PlatformChecksum(
"09fa7646e2ad032a0b156177b95f09012430f81f",
"1c64eb8b55ce8878676c6a1e6ddd78f48de0668b",
"0c6dc227f781c81a229970f8fceda1a012498cba",
"15c4a2202877a414515e218bdb7992f0ad53e5af",
"not used",
"09fa7646e2ad032a0b156177b95f09012430f81f",
"759fef89a5de52bd17e733dc255c671ce86be909");
"0c6dc227f781c81a229970f8fceda1a012498cba",
"25fc4c863caa499aa447a5b8d059f5452cbcc500");
const std::string network_stats_checksum =
PlatformChecksum("5b4262ca328e5f066af5d34f3380521583dd20de",
"80235b6d727281203acb63b98f9a9e85d95f7ec0",
PlatformChecksum("4b2370f5c794741d2a46be5c7935c66ef3fb53e9",
"e339cb2adf5ab3dfc21cb7205d670a34751e8336",
"not used",
"5b4262ca328e5f066af5d34f3380521583dd20de",
"5b4262ca328e5f066af5d34f3380521583dd20de");
"4b2370f5c794741d2a46be5c7935c66ef3fb53e9",
"4b2370f5c794741d2a46be5c7935c66ef3fb53e9");
const std::string rtcp_stats_checksum = PlatformChecksum(
"b8880bf9fed2487efbddcb8d94b9937a29ae521d",
@ -502,18 +502,18 @@ TEST_F(NetEqDecodingTest, MAYBE_TestOpusBitExactness) {
webrtc::test::ResourcePath("audio_coding/neteq_opus", "rtp");
const std::string output_checksum = PlatformChecksum(
"e0e4063d55941792a725fde1f781ebbbe83b8165",
"b77d2f25db1e59e054160a89e7defc7bfdb12f31",
"a8a525c476a922456a35b5b13861b55cd8c9c7b8",
"e0e4063d55941792a725fde1f781ebbbe83b8165",
"e0e4063d55941792a725fde1f781ebbbe83b8165");
"14a63b3c7b925c82296be4bafc71bec85f2915c2",
"b7b7ed802b0e18ee416973bf3b9ae98599b0181d",
"5876e52dda90d5ca433c3726555b907b97c86374",
"14a63b3c7b925c82296be4bafc71bec85f2915c2",
"14a63b3c7b925c82296be4bafc71bec85f2915c2");
const std::string network_stats_checksum =
PlatformChecksum("9e72233c78baf685e500dd6c94212b30a4c5f27d",
"c237d7ca04cbb1ea2e3b27a7c8963015deb985e7",
"4f1e9734bc80a290faaf9d611efcb8d7802dbc4f",
"9e72233c78baf685e500dd6c94212b30a4c5f27d",
"9e72233c78baf685e500dd6c94212b30a4c5f27d");
PlatformChecksum("adb3272498e436d1c019cbfd71610e9510c54497",
"fa935a91abc7291db47428a2d7c5361b98713a92",
"42106aa5267300f709f63737707ef07afd9dac61",
"adb3272498e436d1c019cbfd71610e9510c54497",
"adb3272498e436d1c019cbfd71610e9510c54497");
const std::string rtcp_stats_checksum = PlatformChecksum(
"e37c797e3de6a64dda88c9ade7a013d022a2e1e0",

View File

@ -27,7 +27,6 @@ namespace webrtc {
int Normal::Process(const int16_t* input,
size_t length,
Modes last_mode,
int16_t* external_mute_factor_array,
AudioMultiVector* output) {
if (length == 0) {
// Nothing to process.
@ -66,10 +65,8 @@ int Normal::Process(const int16_t* input,
size_t length_per_channel = length / output->Channels();
std::unique_ptr<int16_t[]> signal(new int16_t[length_per_channel]);
for (size_t channel_ix = 0; channel_ix < output->Channels(); ++channel_ix) {
// Adjust muting factor (main muting factor times expand muting factor).
external_mute_factor_array[channel_ix] = static_cast<int16_t>(
(external_mute_factor_array[channel_ix] *
expand_->MuteFactor(channel_ix)) >> 14);
// Set muting factor to the same as expand muting factor.
int16_t mute_factor = expand_->MuteFactor(channel_ix);
(*output)[channel_ix].CopyTo(length_per_channel, 0, signal.get());
@ -92,7 +89,7 @@ int Normal::Process(const int16_t* input,
energy = 0;
}
int mute_factor;
int local_mute_factor = 16384; // 1.0 in Q14.
if ((energy != 0) &&
(energy > background_noise_.Energy(channel_ix))) {
// Normalize new frame energy to 15 bits.
@ -103,29 +100,30 @@ int Normal::Process(const int16_t* input,
int16_t energy_scaled =
static_cast<int16_t>(WEBRTC_SPL_SHIFT_W32(energy, scaling));
int32_t ratio = WebRtcSpl_DivW32W16(bgn_energy, energy_scaled);
mute_factor = WebRtcSpl_SqrtFloor(ratio << 14);
} else {
mute_factor = 16384; // 1.0 in Q14.
}
if (mute_factor > external_mute_factor_array[channel_ix]) {
external_mute_factor_array[channel_ix] =
static_cast<int16_t>(std::min(mute_factor, 16384));
local_mute_factor =
std::min(local_mute_factor, WebRtcSpl_SqrtFloor(ratio << 14));
}
mute_factor = std::max<int16_t>(mute_factor, local_mute_factor);
RTC_DCHECK_LE(mute_factor, 16384);
RTC_DCHECK_GE(mute_factor, 0);
// If muted increase by 0.64 for every 20 ms (NB/WB 0.0040/0.0020 in Q14).
int increment = 64 / fs_mult;
// If muted increase by 0.64 for every 20 ms (NB/WB 0.0040/0.0020 in Q14),
// or as fast as it takes to come back to full gain within the frame
// length.
const int back_to_fullscale_inc =
static_cast<int>((16384 - mute_factor) / length_per_channel);
const int increment = std::max(64 / fs_mult, back_to_fullscale_inc);
for (size_t i = 0; i < length_per_channel; i++) {
// Scale with mute factor.
RTC_DCHECK_LT(channel_ix, output->Channels());
RTC_DCHECK_LT(i, output->Size());
int32_t scaled_signal = (*output)[channel_ix][i] *
external_mute_factor_array[channel_ix];
int32_t scaled_signal = (*output)[channel_ix][i] * mute_factor;
// Shift 14 with proper rounding.
(*output)[channel_ix][i] =
static_cast<int16_t>((scaled_signal + 8192) >> 14);
// Increase mute_factor towards 16384.
external_mute_factor_array[channel_ix] = static_cast<int16_t>(std::min(
external_mute_factor_array[channel_ix] + increment, 16384));
mute_factor =
static_cast<int16_t>(std::min(mute_factor + increment, 16384));
}
// Interpolate the expanded data into the new vector.
@ -153,8 +151,6 @@ int Normal::Process(const int16_t* input,
static const size_t kCngLength = 48;
RTC_DCHECK_LE(8 * fs_mult, kCngLength);
int16_t cng_output[kCngLength];
// Reset mute factor and start up fresh.
external_mute_factor_array[0] = 16384;
ComfortNoiseDecoder* cng_decoder = decoder_database_->GetActiveCngDecoder();
if (cng_decoder) {
@ -186,28 +182,6 @@ int Normal::Process(const int16_t* input,
}
RTC_DCHECK_GT(win_up_Q14,
(1 << 14) - 32); // Worst case rouding is a length of 34
} else if (external_mute_factor_array[0] < 16384) {
// Previous was neither of Expand, FadeToBGN or RFC3389_CNG, but we are
// still ramping up from previous muting.
// If muted increase by 0.64 for every 20 ms (NB/WB 0.0040/0.0020 in Q14).
int increment = 64 / fs_mult;
size_t length_per_channel = length / output->Channels();
for (size_t i = 0; i < length_per_channel; i++) {
for (size_t channel_ix = 0; channel_ix < output->Channels();
++channel_ix) {
// Scale with mute factor.
RTC_DCHECK_LT(channel_ix, output->Channels());
RTC_DCHECK_LT(i, output->Size());
int32_t scaled_signal = (*output)[channel_ix][i] *
external_mute_factor_array[channel_ix];
// Shift 14 with proper rounding.
(*output)[channel_ix][i] =
static_cast<int16_t>((scaled_signal + 8192) >> 14);
// Increase mute_factor towards 16384.
external_mute_factor_array[channel_ix] = static_cast<int16_t>(std::min(
16384, external_mute_factor_array[channel_ix] + increment));
}
}
}
return static_cast<int>(length);

View File

@ -53,11 +53,9 @@ class Normal {
// result is written to |output|. The number of channels allocated in
// |output| defines the number of channels that will be used when
// de-interleaving |input|. |last_mode| contains the mode used in the previous
// GetAudio call (i.e., not the current one), and |external_mute_factor| is
// a pointer to the mute factor in the NetEqImpl class.
// GetAudio call (i.e., not the current one).
int Process(const int16_t* input, size_t length,
Modes last_mode,
int16_t* external_mute_factor_array,
AudioMultiVector* output);
private:

View File

@ -68,16 +68,10 @@ TEST(Normal, AvoidDivideByZero) {
Normal normal(fs, &db, bgn, &expand);
int16_t input[1000] = {0};
std::unique_ptr<int16_t[]> mute_factor_array(new int16_t[channels]);
for (size_t i = 0; i < channels; ++i) {
mute_factor_array[i] = 16384;
}
AudioMultiVector output(channels);
// Zero input length.
EXPECT_EQ(
0,
normal.Process(input, 0, kModeExpand, mute_factor_array.get(), &output));
EXPECT_EQ(0, normal.Process(input, 0, kModeExpand, &output));
EXPECT_EQ(0u, output.Size());
// Try to make energy_length >> scaling = 0;
@ -93,7 +87,6 @@ TEST(Normal, AvoidDivideByZero) {
normal.Process(input,
input_size_samples,
kModeExpand,
mute_factor_array.get(),
&output));
EXPECT_CALL(db, Die()); // Called when |db| goes out of scope.
@ -114,18 +107,11 @@ TEST(Normal, InputLengthAndChannelsDoNotMatch) {
Normal normal(fs, &db, bgn, &expand);
int16_t input[1000] = {0};
std::unique_ptr<int16_t[]> mute_factor_array(new int16_t[channels]);
for (size_t i = 0; i < channels; ++i) {
mute_factor_array[i] = 16384;
}
AudioMultiVector output(channels);
// Let the number of samples be one sample less than 80 samples per channel.
size_t input_len = 80 * channels - 1;
EXPECT_EQ(
0,
normal.Process(
input, input_len, kModeExpand, mute_factor_array.get(), &output));
EXPECT_EQ(0, normal.Process(input, input_len, kModeExpand, &output));
EXPECT_EQ(0u, output.Size());
EXPECT_CALL(db, Die()); // Called when |db| goes out of scope.
@ -147,12 +133,6 @@ TEST(Normal, LastModeExpand120msPacket) {
Normal normal(kFs, &db, bgn, &expand);
int16_t input[kPacketsizeBytes] = {0};
std::unique_ptr<int16_t[]> mute_factor_array(new int16_t[kChannels]);
for (size_t i = 0; i < kChannels; ++i) {
mute_factor_array[i] = 16384;
}
AudioMultiVector output(kChannels);
EXPECT_CALL(expand, SetParametersForNormalAfterExpand());
@ -162,7 +142,6 @@ TEST(Normal, LastModeExpand120msPacket) {
normal.Process(input,
kPacketsizeBytes,
kModeExpand,
mute_factor_array.get(),
&output));
EXPECT_EQ(kPacketsizeBytes, output.Size());