From cf808d2366e58b33540931d182f36800d9a15b0d Mon Sep 17 00:00:00 2001 From: Henrik Lundin Date: Wed, 27 May 2015 14:33:29 +0200 Subject: [PATCH] Add new fast mode for NetEq's Accelerate operation This change introduces a mode where the Accelerate operation will be more aggressive. When enabled, it will allow acceleration at lower correlation levels, and possibly remove multiple pitch periods at once. The feature is enabled through NetEq::Config, and is off by default. This means that bit-exactness tests are currently not affected. A unit test was added for the Accelerate class, with and without fast mode enabled. BUG=4691 R=minyue@webrtc.org Review URL: https://webrtc-codereview.appspot.com/50039004 Cr-Commit-Position: refs/heads/master@{#9295} --- .../modules/audio_coding/neteq/accelerate.cc | 31 ++++--- .../modules/audio_coding/neteq/accelerate.h | 6 +- .../neteq/decision_logic_normal.cc | 16 ++-- webrtc/modules/audio_coding/neteq/defines.h | 1 + .../audio_coding/neteq/interface/neteq.h | 4 +- webrtc/modules/audio_coding/neteq/neteq.cc | 3 +- .../modules/audio_coding/neteq/neteq_impl.cc | 37 +++++---- .../modules/audio_coding/neteq/neteq_impl.h | 4 +- .../audio_coding/neteq/preemptive_expand.cc | 11 ++- .../audio_coding/neteq/preemptive_expand.h | 11 +-- .../audio_coding/neteq/time_stretch.cc | 16 ++-- .../modules/audio_coding/neteq/time_stretch.h | 9 ++- .../neteq/time_stretch_unittest.cc | 81 ++++++++++++++++++- 13 files changed, 172 insertions(+), 58 deletions(-) diff --git a/webrtc/modules/audio_coding/neteq/accelerate.cc b/webrtc/modules/audio_coding/neteq/accelerate.cc index 6acd778a23..ad7423810d 100644 --- a/webrtc/modules/audio_coding/neteq/accelerate.cc +++ b/webrtc/modules/audio_coding/neteq/accelerate.cc @@ -14,11 +14,11 @@ namespace webrtc { -Accelerate::ReturnCodes Accelerate::Process( - const int16_t* input, - size_t input_length, - AudioMultiVector* output, - int16_t* length_change_samples) { +Accelerate::ReturnCodes Accelerate::Process(const 
int16_t* input, + size_t input_length, + bool fast_accelerate, + AudioMultiVector* output, + int16_t* length_change_samples) { // Input length must be (almost) 30 ms. static const int k15ms = 120; // 15 ms = 120 samples at 8 kHz sample rate. if (num_channels_ == 0 || static_cast<int>(input_length) / num_channels_ < @@ -28,7 +28,7 @@ Accelerate::ReturnCodes Accelerate::Process( output->PushBackInterleaved(input, input_length); return kError; } - return TimeStretch::Process(input, input_length, output, + return TimeStretch::Process(input, input_length, fast_accelerate, output, length_change_samples); } @@ -41,17 +41,30 @@ void Accelerate::SetParametersForPassiveSpeech(size_t /*len*/, } Accelerate::ReturnCodes Accelerate::CheckCriteriaAndStretch( - const int16_t* input, size_t input_length, size_t peak_index, - int16_t best_correlation, bool active_speech, + const int16_t* input, + size_t input_length, + size_t peak_index, + int16_t best_correlation, + bool active_speech, + bool fast_mode, AudioMultiVector* output) const { // Check for strong correlation or passive speech. - if ((best_correlation > kCorrelationThreshold) || !active_speech) { + // Use 8192 (0.5 in Q14) in fast mode. + const int correlation_threshold = fast_mode ? 8192 : kCorrelationThreshold; + if ((best_correlation > correlation_threshold) || !active_speech) { // Do accelerate operation by overlap add. // Pre-calculate common multiplication with |fs_mult_|. // 120 corresponds to 15 ms. size_t fs_mult_120 = fs_mult_ * 120; + if (fast_mode) { + // Fit as many multiples of |peak_index| as possible in fs_mult_120. + // TODO(henrik.lundin) Consider finding multiple correlation peaks and + // pick the one with the longest correlation lag in this case. + peak_index = (fs_mult_120 / peak_index) * peak_index; + } + assert(fs_mult_120 >= peak_index); // Should be handled in Process(). // Copy first part; 0 to 15 ms. 
output->PushBackInterleaved(input, fs_mult_120 * num_channels_); diff --git a/webrtc/modules/audio_coding/neteq/accelerate.h b/webrtc/modules/audio_coding/neteq/accelerate.h index 36bc0946c9..684f74bb8c 100644 --- a/webrtc/modules/audio_coding/neteq/accelerate.h +++ b/webrtc/modules/audio_coding/neteq/accelerate.h @@ -38,9 +38,12 @@ class Accelerate : public TimeStretch { // read from |input|, of length |input_length| elements, and are written to // |output|. The number of samples removed through time-stretching is // is provided in the output |length_change_samples|. The method returns - // the outcome of the operation as an enumerator value. + // the outcome of the operation as an enumerator value. If |fast_accelerate| + // is true, the algorithm will relax the requirements on finding strong + // correlations, and may remove multiple pitch periods if possible. ReturnCodes Process(const int16_t* input, size_t input_length, + bool fast_accelerate, AudioMultiVector* output, int16_t* length_change_samples); @@ -58,6 +61,7 @@ class Accelerate : public TimeStretch { size_t peak_index, int16_t best_correlation, bool active_speech, + bool fast_mode, AudioMultiVector* output) const override; private: diff --git a/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc b/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc index f2382845b0..89fdb51b0b 100644 --- a/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc +++ b/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc @@ -132,15 +132,13 @@ Operations DecisionLogicNormal::ExpectedPacketAvailable(Modes prev_mode, // Check criterion for time-stretching. int low_limit, high_limit; delay_manager_->BufferLimits(&low_limit, &high_limit); - if ((buffer_level_filter_->filtered_current_level() >= high_limit && - TimescaleAllowed()) || - buffer_level_filter_->filtered_current_level() >= high_limit << 2) { - // Buffer level higher than limit and time-scaling allowed, - // or buffer level really high. 
- return kAccelerate; - } else if ((buffer_level_filter_->filtered_current_level() < low_limit) - && TimescaleAllowed()) { - return kPreemptiveExpand; + if (buffer_level_filter_->filtered_current_level() >= high_limit << 2) + return kFastAccelerate; + if (TimescaleAllowed()) { + if (buffer_level_filter_->filtered_current_level() >= high_limit) + return kAccelerate; + if (buffer_level_filter_->filtered_current_level() < low_limit) + return kPreemptiveExpand; } } return kNormal; diff --git a/webrtc/modules/audio_coding/neteq/defines.h b/webrtc/modules/audio_coding/neteq/defines.h index 33d1bd9c3f..3ed6b61889 100644 --- a/webrtc/modules/audio_coding/neteq/defines.h +++ b/webrtc/modules/audio_coding/neteq/defines.h @@ -18,6 +18,7 @@ enum Operations { kMerge, kExpand, kAccelerate, + kFastAccelerate, kPreemptiveExpand, kRfc3389Cng, kRfc3389CngNoPacket, diff --git a/webrtc/modules/audio_coding/neteq/interface/neteq.h b/webrtc/modules/audio_coding/neteq/interface/neteq.h index 439f049269..88bf2087ff 100644 --- a/webrtc/modules/audio_coding/neteq/interface/neteq.h +++ b/webrtc/modules/audio_coding/neteq/interface/neteq.h @@ -80,7 +80,8 @@ class NetEq { // |max_delay_ms| has the same effect as calling SetMaximumDelay(). max_delay_ms(2000), background_noise_mode(kBgnOff), - playout_mode(kPlayoutOn) {} + playout_mode(kPlayoutOn), + enable_fast_accelerate(false) {} std::string ToString() const; @@ -90,6 +91,7 @@ class NetEq { int max_delay_ms; BackgroundNoiseMode background_noise_mode; NetEqPlayoutMode playout_mode; + bool enable_fast_accelerate; }; enum ReturnCodes { diff --git a/webrtc/modules/audio_coding/neteq/neteq.cc b/webrtc/modules/audio_coding/neteq/neteq.cc index ea100690d2..c8c4c46cfb 100644 --- a/webrtc/modules/audio_coding/neteq/neteq.cc +++ b/webrtc/modules/audio_coding/neteq/neteq.cc @@ -34,7 +34,8 @@ std::string NetEq::Config::ToString() const { << (enable_audio_classifier ? 
"true" : "false") << ", max_packets_in_buffer=" << max_packets_in_buffer << ", background_noise_mode=" << background_noise_mode - << ", playout_mode=" << playout_mode; + << ", playout_mode=" << playout_mode + << ", enable_fast_accelerate=" << enable_fast_accelerate; return ss.str(); } diff --git a/webrtc/modules/audio_coding/neteq/neteq_impl.cc b/webrtc/modules/audio_coding/neteq/neteq_impl.cc index fe078fd890..1351e66634 100644 --- a/webrtc/modules/audio_coding/neteq/neteq_impl.cc +++ b/webrtc/modules/audio_coding/neteq/neteq_impl.cc @@ -92,6 +92,7 @@ NetEqImpl::NetEqImpl(const NetEq::Config& config, decoder_error_code_(0), background_noise_mode_(config.background_noise_mode), playout_mode_(config.playout_mode), + enable_fast_accelerate_(config.enable_fast_accelerate), decoded_packet_sequence_number_(-1), decoded_packet_timestamp_(0) { LOG(LS_INFO) << "NetEq config: " << config.ToString(); @@ -745,9 +746,12 @@ int NetEqImpl::GetAudioInternal(size_t max_length, int16_t* output, return_value = DoExpand(play_dtmf); break; } - case kAccelerate: { + case kAccelerate: + case kFastAccelerate: { + const bool fast_accelerate = + enable_fast_accelerate_ && (operation == kFastAccelerate); return_value = DoAccelerate(decoded_buffer_.get(), length, speech_type, - play_dtmf); + play_dtmf, fast_accelerate); break; } case kPreemptiveExpand: { @@ -956,9 +960,8 @@ int NetEqImpl::GetDecision(Operations* operation, // Check if we already have enough samples in the |sync_buffer_|. If so, // change decision to normal, unless the decision was merge, accelerate, or // preemptive expand. 
- if (samples_left >= output_size_samples_ && - *operation != kMerge && - *operation != kAccelerate && + if (samples_left >= output_size_samples_ && *operation != kMerge && + *operation != kAccelerate && *operation != kFastAccelerate && *operation != kPreemptiveExpand) { *operation = kNormal; return 0; @@ -1034,8 +1037,9 @@ int NetEqImpl::GetDecision(Operations* operation, decision_logic_->set_generated_noise_samples(0); return 0; } - case kAccelerate: { - // In order to do a accelerate we need at least 30 ms of audio data. + case kAccelerate: + case kFastAccelerate: { + // In order to do an accelerate we need at least 30 ms of audio data. if (samples_left >= samples_30_ms) { // Already have enough data, so we do not need to extract any more. decision_logic_->set_sample_memory(samples_left); @@ -1124,13 +1128,13 @@ int NetEqImpl::GetDecision(Operations* operation, } } - if (*operation == kAccelerate || + if (*operation == kAccelerate || *operation == kFastAccelerate || *operation == kPreemptiveExpand) { decision_logic_->set_sample_memory(samples_left + extracted_samples); decision_logic_->set_prev_time_scale(true); } - if (*operation == kAccelerate) { + if (*operation == kAccelerate || *operation == kFastAccelerate) { // Check that we have enough data (30ms) to do accelerate. if (extracted_samples + samples_left < samples_30_ms) { // TODO(hlundin): Write test for this. 
@@ -1263,7 +1267,8 @@ int NetEqImpl::DecodeLoop(PacketList* packet_list, Operations* operation, assert(sync_buffer_->Channels() == decoder->Channels()); assert(decoded_buffer_length_ >= kMaxFrameSize * decoder->Channels()); assert(*operation == kNormal || *operation == kAccelerate || - *operation == kMerge || *operation == kPreemptiveExpand); + *operation == kFastAccelerate || *operation == kMerge || + *operation == kPreemptiveExpand); packet_list->pop_front(); size_t payload_length = packet->payload_length; int16_t decode_length; @@ -1427,9 +1432,11 @@ int NetEqImpl::DoExpand(bool play_dtmf) { return 0; } -int NetEqImpl::DoAccelerate(int16_t* decoded_buffer, size_t decoded_length, +int NetEqImpl::DoAccelerate(int16_t* decoded_buffer, + size_t decoded_length, AudioDecoder::SpeechType speech_type, - bool play_dtmf) { + bool play_dtmf, + bool fast_accelerate) { const size_t required_samples = 240 * fs_mult_; // Must have 30 ms. size_t borrowed_samples_per_channel = 0; size_t num_channels = algorithm_buffer_->Channels(); @@ -1447,9 +1454,9 @@ int NetEqImpl::DoAccelerate(int16_t* decoded_buffer, size_t decoded_length, } int16_t samples_removed; - Accelerate::ReturnCodes return_code = accelerate_->Process( - decoded_buffer, decoded_length, algorithm_buffer_.get(), - &samples_removed); + Accelerate::ReturnCodes return_code = + accelerate_->Process(decoded_buffer, decoded_length, fast_accelerate, + algorithm_buffer_.get(), &samples_removed); stats_.AcceleratedSamples(samples_removed); switch (return_code) { case Accelerate::kSuccess: diff --git a/webrtc/modules/audio_coding/neteq/neteq_impl.h b/webrtc/modules/audio_coding/neteq/neteq_impl.h index 248071f825..55ba067221 100644 --- a/webrtc/modules/audio_coding/neteq/neteq_impl.h +++ b/webrtc/modules/audio_coding/neteq/neteq_impl.h @@ -278,7 +278,8 @@ class NetEqImpl : public webrtc::NetEq { int DoAccelerate(int16_t* decoded_buffer, size_t decoded_length, AudioDecoder::SpeechType speech_type, - bool play_dtmf) 
EXCLUSIVE_LOCKS_REQUIRED(crit_sect_); + bool play_dtmf, + bool fast_accelerate) EXCLUSIVE_LOCKS_REQUIRED(crit_sect_); // Sub-method which calls the PreemptiveExpand class to perform the // preemtive expand operation. @@ -392,6 +393,7 @@ class NetEqImpl : public webrtc::NetEq { int decoder_error_code_ GUARDED_BY(crit_sect_); const BackgroundNoiseMode background_noise_mode_ GUARDED_BY(crit_sect_); NetEqPlayoutMode playout_mode_ GUARDED_BY(crit_sect_); + bool enable_fast_accelerate_ GUARDED_BY(crit_sect_); // These values are used by NACK module to estimate time-to-play of // a missing packet. Occasionally, NetEq might decide to decode more diff --git a/webrtc/modules/audio_coding/neteq/preemptive_expand.cc b/webrtc/modules/audio_coding/neteq/preemptive_expand.cc index b2dc3e60cb..6a3f8ecf1a 100644 --- a/webrtc/modules/audio_coding/neteq/preemptive_expand.cc +++ b/webrtc/modules/audio_coding/neteq/preemptive_expand.cc @@ -34,7 +34,8 @@ PreemptiveExpand::ReturnCodes PreemptiveExpand::Process( output->PushBackInterleaved(input, input_length); return kError; } - return TimeStretch::Process(input, input_length, output, + const bool kFastMode = false; // Fast mode is not available for PE Expand. + return TimeStretch::Process(input, input_length, kFastMode, output, length_change_samples); } @@ -54,8 +55,12 @@ void PreemptiveExpand::SetParametersForPassiveSpeech(size_t len, } PreemptiveExpand::ReturnCodes PreemptiveExpand::CheckCriteriaAndStretch( - const int16_t *input, size_t input_length, size_t peak_index, - int16_t best_correlation, bool active_speech, + const int16_t* input, + size_t input_length, + size_t peak_index, + int16_t best_correlation, + bool active_speech, + bool /*fast_mode*/, AudioMultiVector* output) const { // Pre-calculate common multiplication with |fs_mult_|. // 120 corresponds to 15 ms. 
diff --git a/webrtc/modules/audio_coding/neteq/preemptive_expand.h b/webrtc/modules/audio_coding/neteq/preemptive_expand.h index 750c16bde3..c583a48a5b 100644 --- a/webrtc/modules/audio_coding/neteq/preemptive_expand.h +++ b/webrtc/modules/audio_coding/neteq/preemptive_expand.h @@ -58,11 +58,12 @@ class PreemptiveExpand : public TimeStretch { // Checks the criteria for performing the time-stretching operation and, // if possible, performs the time-stretching. - ReturnCodes CheckCriteriaAndStretch(const int16_t* pw16_decoded, - size_t len, - size_t w16_bestIndex, - int16_t w16_bestCorr, - bool w16_VAD, + ReturnCodes CheckCriteriaAndStretch(const int16_t* input, + size_t input_length, + size_t peak_index, + int16_t best_correlation, + bool active_speech, + bool /*fast_mode*/, AudioMultiVector* output) const override; private: diff --git a/webrtc/modules/audio_coding/neteq/time_stretch.cc b/webrtc/modules/audio_coding/neteq/time_stretch.cc index 02305c83e5..5577cd2ecb 100644 --- a/webrtc/modules/audio_coding/neteq/time_stretch.cc +++ b/webrtc/modules/audio_coding/neteq/time_stretch.cc @@ -19,12 +19,11 @@ namespace webrtc { -TimeStretch::ReturnCodes TimeStretch::Process( - const int16_t* input, - size_t input_len, - AudioMultiVector* output, - int16_t* length_change_samples) { - +TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input, + size_t input_len, + bool fast_mode, + AudioMultiVector* output, + int16_t* length_change_samples) { // Pre-calculate common multiplication with |fs_mult_|. int fs_mult_120 = fs_mult_ * 120; // Corresponds to 15 ms. @@ -140,8 +139,9 @@ TimeStretch::ReturnCodes TimeStretch::Process( // Check accelerate criteria and stretch the signal. 
- ReturnCodes return_value = CheckCriteriaAndStretch( - input, input_len, peak_index, best_correlation, active_speech, output); + ReturnCodes return_value = + CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation, + active_speech, fast_mode, output); switch (return_value) { case kSuccess: *length_change_samples = peak_index; diff --git a/webrtc/modules/audio_coding/neteq/time_stretch.h b/webrtc/modules/audio_coding/neteq/time_stretch.h index 9396d8ff51..7c84e1a153 100644 --- a/webrtc/modules/audio_coding/neteq/time_stretch.h +++ b/webrtc/modules/audio_coding/neteq/time_stretch.h @@ -58,6 +58,7 @@ class TimeStretch { // PreemptiveExpand. ReturnCodes Process(const int16_t* input, size_t input_len, + bool fast_mode, AudioMultiVector* output, int16_t* length_change_samples); @@ -73,8 +74,12 @@ class TimeStretch { // if possible, performs the time-stretching. This method must be implemented // by the sub-classes. virtual ReturnCodes CheckCriteriaAndStretch( - const int16_t* input, size_t input_length, size_t peak_index, - int16_t best_correlation, bool active_speech, + const int16_t* input, + size_t input_length, + size_t peak_index, + int16_t best_correlation, + bool active_speech, + bool fast_mode, AudioMultiVector* output) const = 0; static const int kCorrelationLen = 50; diff --git a/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc b/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc index 3d1e06936a..05385a1e3e 100644 --- a/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc +++ b/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc @@ -13,14 +13,24 @@ #include "webrtc/modules/audio_coding/neteq/accelerate.h" #include "webrtc/modules/audio_coding/neteq/preemptive_expand.h" +#include <map> + #include "testing/gtest/include/gtest/gtest.h" +#include "webrtc/base/checks.h" +#include "webrtc/base/scoped_ptr.h" +#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" #include 
"webrtc/modules/audio_coding/neteq/background_noise.h" +#include "webrtc/modules/audio_coding/neteq/tools/input_audio_file.h" +#include "webrtc/test/testsupport/fileutils.h" namespace webrtc { +namespace { +const size_t kNumChannels = 1; +} + TEST(TimeStretch, CreateAndDestroy) { const int kSampleRate = 8000; - const size_t kNumChannels = 1; const int kOverlapSamples = 5 * kSampleRate / 8000; BackgroundNoise bgn(kNumChannels); Accelerate accelerate(kSampleRate, kNumChannels, bgn); @@ -30,7 +40,6 @@ TEST(TimeStretch, CreateAndDestroy) { TEST(TimeStretch, CreateUsingFactory) { const int kSampleRate = 8000; - const size_t kNumChannels = 1; const int kOverlapSamples = 5 * kSampleRate / 8000; BackgroundNoise bgn(kNumChannels); @@ -47,6 +56,72 @@ TEST(TimeStretch, CreateUsingFactory) { delete preemptive_expand; } -// TODO(hlundin): Write more tests. +class TimeStretchTest : public ::testing::Test { + protected: + TimeStretchTest() + : input_file_(new test::InputAudioFile( + test::ResourcePath("audio_coding/testfile32kHz", "pcm"))), + sample_rate_hz_(32000), + block_size_(30 * sample_rate_hz_ / 1000), // 30 ms + audio_(new int16_t[block_size_]), + background_noise_(kNumChannels) { + WebRtcSpl_Init(); + } + + const int16_t* Next30Ms() { + CHECK(input_file_->Read(block_size_, audio_.get())); + return audio_.get(); + } + + // Returns the total length change (in samples) that the accelerate operation + // resulted in during the run. 
+ int TestAccelerate(int loops, bool fast_mode) { + Accelerate accelerate(sample_rate_hz_, kNumChannels, background_noise_); + int total_length_change = 0; + for (int i = 0; i < loops; ++i) { + AudioMultiVector output(kNumChannels); + int16_t length_change; + UpdateReturnStats(accelerate.Process(Next30Ms(), block_size_, fast_mode, + &output, &length_change)); + total_length_change += length_change; + } + return total_length_change; + } + + void UpdateReturnStats(TimeStretch::ReturnCodes ret) { + switch (ret) { + case TimeStretch::kSuccess: + case TimeStretch::kSuccessLowEnergy: + case TimeStretch::kNoStretch: + ++return_stats_[ret]; + break; + case TimeStretch::kError: + FAIL() << "Process returned an error"; + } + } + + rtc::scoped_ptr<test::InputAudioFile> input_file_; + const int sample_rate_hz_; + const size_t block_size_; + rtc::scoped_ptr<int16_t[]> audio_; + std::map<TimeStretch::ReturnCodes, int> return_stats_; + BackgroundNoise background_noise_; +}; + +TEST_F(TimeStretchTest, Accelerate) { + // TestAccelerate returns the total length change in samples. + EXPECT_EQ(15268, TestAccelerate(100, false)); + EXPECT_EQ(9, return_stats_[TimeStretch::kSuccess]); + EXPECT_EQ(58, return_stats_[TimeStretch::kSuccessLowEnergy]); + EXPECT_EQ(33, return_stats_[TimeStretch::kNoStretch]); +} + +TEST_F(TimeStretchTest, AccelerateFastMode) { + // TestAccelerate returns the total length change in samples. + EXPECT_EQ(21400, TestAccelerate(100, true)); + EXPECT_EQ(31, return_stats_[TimeStretch::kSuccess]); + EXPECT_EQ(58, return_stats_[TimeStretch::kSuccessLowEnergy]); + EXPECT_EQ(11, return_stats_[TimeStretch::kNoStretch]); +} } // namespace webrtc