diff --git a/webrtc/modules/audio_coding/neteq/accelerate.cc b/webrtc/modules/audio_coding/neteq/accelerate.cc
index 6acd778a23..ad7423810d 100644
--- a/webrtc/modules/audio_coding/neteq/accelerate.cc
+++ b/webrtc/modules/audio_coding/neteq/accelerate.cc
@@ -14,11 +14,11 @@
 namespace webrtc {
 
-Accelerate::ReturnCodes Accelerate::Process(
-    const int16_t* input,
-    size_t input_length,
-    AudioMultiVector* output,
-    int16_t* length_change_samples) {
+Accelerate::ReturnCodes Accelerate::Process(const int16_t* input,
+                                            size_t input_length,
+                                            bool fast_accelerate,
+                                            AudioMultiVector* output,
+                                            int16_t* length_change_samples) {
   // Input length must be (almost) 30 ms.
   static const int k15ms = 120;  // 15 ms = 120 samples at 8 kHz sample rate.
   if (num_channels_ == 0 ||
       static_cast<int>(input_length) / num_channels_ <
@@ -28,7 +28,7 @@ Accelerate::ReturnCodes Accelerate::Process(
     output->PushBackInterleaved(input, input_length);
     return kError;
   }
-  return TimeStretch::Process(input, input_length, output,
+  return TimeStretch::Process(input, input_length, fast_accelerate, output,
                               length_change_samples);
 }
 
@@ -41,17 +41,30 @@ void Accelerate::SetParametersForPassiveSpeech(size_t /*len*/,
 }
 
 Accelerate::ReturnCodes Accelerate::CheckCriteriaAndStretch(
-    const int16_t* input, size_t input_length, size_t peak_index,
-    int16_t best_correlation, bool active_speech,
+    const int16_t* input,
+    size_t input_length,
+    size_t peak_index,
+    int16_t best_correlation,
+    bool active_speech,
+    bool fast_mode,
     AudioMultiVector* output) const {
   // Check for strong correlation or passive speech.
-  if ((best_correlation > kCorrelationThreshold) || !active_speech) {
+  // Use 8192 (0.5 in Q14) in fast mode.
+  const int correlation_threshold = fast_mode ? 8192 : kCorrelationThreshold;
+  if ((best_correlation > correlation_threshold) || !active_speech) {
     // Do accelerate operation by overlap add.
 
     // Pre-calculate common multiplication with |fs_mult_|.
     // 120 corresponds to 15 ms.
     size_t fs_mult_120 = fs_mult_ * 120;
 
+    if (fast_mode) {
+      // Fit as many multiples of |peak_index| as possible in fs_mult_120.
+      // TODO(henrik.lundin) Consider finding multiple correlation peaks and
+      // pick the one with the longest correlation lag in this case.
+      peak_index = (fs_mult_120 / peak_index) * peak_index;
+    }
+    assert(fs_mult_120 >= peak_index);  // Should be handled in Process().
+
     // Copy first part; 0 to 15 ms.
     output->PushBackInterleaved(input, fs_mult_120 * num_channels_);
diff --git a/webrtc/modules/audio_coding/neteq/accelerate.h b/webrtc/modules/audio_coding/neteq/accelerate.h
index 36bc0946c9..684f74bb8c 100644
--- a/webrtc/modules/audio_coding/neteq/accelerate.h
+++ b/webrtc/modules/audio_coding/neteq/accelerate.h
@@ -38,9 +38,12 @@ class Accelerate : public TimeStretch {
   // read from |input|, of length |input_length| elements, and are written to
   // |output|. The number of samples removed through time-stretching is
   // provided in the output |length_change_samples|. The method returns
-  // the outcome of the operation as an enumerator value.
+  // the outcome of the operation as an enumerator value. If |fast_accelerate|
+  // is true, the algorithm will relax the requirements on finding strong
+  // correlations, and may remove multiple pitch periods if possible.
   ReturnCodes Process(const int16_t* input,
                       size_t input_length,
+                      bool fast_accelerate,
                       AudioMultiVector* output,
                       int16_t* length_change_samples);
 
@@ -58,6 +61,7 @@ class Accelerate : public TimeStretch {
                                       size_t peak_index,
                                       int16_t best_correlation,
                                       bool active_speech,
+                                      bool fast_mode,
                                       AudioMultiVector* output) const override;
 
  private:
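For readers skimming the patch, the fast-mode logic added to Accelerate::CheckCriteriaAndStretch comes down to two tweaks: a lower correlation threshold (8192, i.e. 0.5 in Q14, instead of the normal kCorrelationThreshold) and removal of as many whole pitch periods as fit in the first 15 ms rather than just one. The stand-alone sketch below only illustrates that arithmetic; the function name is invented, the normal-mode threshold is passed in instead of hard-coded, and peak_index is assumed to be non-zero and no larger than 120 * fs_mult, which the added assert expects Process() to guarantee.

#include <cstddef>

// Sketch of the fast-accelerate decision. Returns the number of samples that
// would be removed from one 30 ms block (0 when the criteria are not met).
size_t FastAccelerateRemovalSketch(size_t peak_index,
                                   int best_correlation,
                                   bool active_speech,
                                   bool fast_mode,
                                   int fs_mult,
                                   int normal_correlation_threshold) {
  // Fast mode accepts weaker correlations: 8192 is 0.5 in Q14.
  const int threshold = fast_mode ? 8192 : normal_correlation_threshold;
  if (best_correlation <= threshold && active_speech)
    return 0;  // Neither a strong correlation nor passive speech: no stretch.
  const size_t fs_mult_120 = static_cast<size_t>(fs_mult) * 120;  // 15 ms.
  if (fast_mode) {
    // Remove as many whole pitch periods as fit in the first 15 ms, which is
    // what lets fast mode drain an over-full buffer more quickly.
    peak_index = (fs_mult_120 / peak_index) * peak_index;
  }
  return peak_index;  // One pitch period, or several in fast mode.
}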
diff --git a/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc b/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc
index f2382845b0..89fdb51b0b 100644
--- a/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc
+++ b/webrtc/modules/audio_coding/neteq/decision_logic_normal.cc
@@ -132,15 +132,13 @@ Operations DecisionLogicNormal::ExpectedPacketAvailable(Modes prev_mode,
     // Check criterion for time-stretching.
     int low_limit, high_limit;
     delay_manager_->BufferLimits(&low_limit, &high_limit);
-    if ((buffer_level_filter_->filtered_current_level() >= high_limit &&
-         TimescaleAllowed()) ||
-        buffer_level_filter_->filtered_current_level() >= high_limit << 2) {
-      // Buffer level higher than limit and time-scaling allowed,
-      // or buffer level really high.
-      return kAccelerate;
-    } else if ((buffer_level_filter_->filtered_current_level() < low_limit)
-               && TimescaleAllowed()) {
-      return kPreemptiveExpand;
+    if (buffer_level_filter_->filtered_current_level() >= high_limit << 2)
+      return kFastAccelerate;
+    if (TimescaleAllowed()) {
+      if (buffer_level_filter_->filtered_current_level() >= high_limit)
+        return kAccelerate;
+      if (buffer_level_filter_->filtered_current_level() < low_limit)
+        return kPreemptiveExpand;
     }
   }
   return kNormal;
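Taken on its own, the rewritten branch in ExpectedPacketAvailable is a small threshold ladder over the filtered buffer level: a buffer above four times the high limit always triggers the new fast accelerate, while the pre-existing accelerate and preemptive-expand rules still apply only when time-scaling is allowed. The sketch below restates that control flow as a free function with made-up parameter names (level, low and high mirror the values from BufferLevelFilter and DelayManager::BufferLimits()); it assumes the Operations enum from defines.h is in scope and is not the DecisionLogicNormal member itself.

// Illustrative restatement of the time-stretch decision above.
Operations ChooseTimeStretchOperation(int level,  // Filtered buffer level.
                                      int low,    // Low buffer limit.
                                      int high,   // High buffer limit.
                                      bool timescale_allowed) {
  if (level >= (high << 2))      // More than four times the high limit:
    return kFastAccelerate;      // always shrink the buffer aggressively.
  if (timescale_allowed) {
    if (level >= high)
      return kAccelerate;        // Moderately full: shrink at the normal pace.
    if (level < low)
      return kPreemptiveExpand;  // Running dry: stretch the signal instead.
  }
  return kNormal;
}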
"true" : "false") << ", max_packets_in_buffer=" << max_packets_in_buffer << ", background_noise_mode=" << background_noise_mode - << ", playout_mode=" << playout_mode; + << ", playout_mode=" << playout_mode + << ", enable_fast_accelerate=" << enable_fast_accelerate; return ss.str(); } diff --git a/webrtc/modules/audio_coding/neteq/neteq_impl.cc b/webrtc/modules/audio_coding/neteq/neteq_impl.cc index fe078fd890..1351e66634 100644 --- a/webrtc/modules/audio_coding/neteq/neteq_impl.cc +++ b/webrtc/modules/audio_coding/neteq/neteq_impl.cc @@ -92,6 +92,7 @@ NetEqImpl::NetEqImpl(const NetEq::Config& config, decoder_error_code_(0), background_noise_mode_(config.background_noise_mode), playout_mode_(config.playout_mode), + enable_fast_accelerate_(config.enable_fast_accelerate), decoded_packet_sequence_number_(-1), decoded_packet_timestamp_(0) { LOG(LS_INFO) << "NetEq config: " << config.ToString(); @@ -745,9 +746,12 @@ int NetEqImpl::GetAudioInternal(size_t max_length, int16_t* output, return_value = DoExpand(play_dtmf); break; } - case kAccelerate: { + case kAccelerate: + case kFastAccelerate: { + const bool fast_accelerate = + enable_fast_accelerate_ && (operation == kFastAccelerate); return_value = DoAccelerate(decoded_buffer_.get(), length, speech_type, - play_dtmf); + play_dtmf, fast_accelerate); break; } case kPreemptiveExpand: { @@ -956,9 +960,8 @@ int NetEqImpl::GetDecision(Operations* operation, // Check if we already have enough samples in the |sync_buffer_|. If so, // change decision to normal, unless the decision was merge, accelerate, or // preemptive expand. - if (samples_left >= output_size_samples_ && - *operation != kMerge && - *operation != kAccelerate && + if (samples_left >= output_size_samples_ && *operation != kMerge && + *operation != kAccelerate && *operation != kFastAccelerate && *operation != kPreemptiveExpand) { *operation = kNormal; return 0; @@ -1034,8 +1037,9 @@ int NetEqImpl::GetDecision(Operations* operation, decision_logic_->set_generated_noise_samples(0); return 0; } - case kAccelerate: { - // In order to do a accelerate we need at least 30 ms of audio data. + case kAccelerate: + case kFastAccelerate: { + // In order to do an accelerate we need at least 30 ms of audio data. if (samples_left >= samples_30_ms) { // Already have enough data, so we do not need to extract any more. decision_logic_->set_sample_memory(samples_left); @@ -1124,13 +1128,13 @@ int NetEqImpl::GetDecision(Operations* operation, } } - if (*operation == kAccelerate || + if (*operation == kAccelerate || *operation == kFastAccelerate || *operation == kPreemptiveExpand) { decision_logic_->set_sample_memory(samples_left + extracted_samples); decision_logic_->set_prev_time_scale(true); } - if (*operation == kAccelerate) { + if (*operation == kAccelerate || *operation == kFastAccelerate) { // Check that we have enough data (30ms) to do accelerate. if (extracted_samples + samples_left < samples_30_ms) { // TODO(hlundin): Write test for this. 
diff --git a/webrtc/modules/audio_coding/neteq/neteq_impl.cc b/webrtc/modules/audio_coding/neteq/neteq_impl.cc
index fe078fd890..1351e66634 100644
--- a/webrtc/modules/audio_coding/neteq/neteq_impl.cc
+++ b/webrtc/modules/audio_coding/neteq/neteq_impl.cc
@@ -92,6 +92,7 @@ NetEqImpl::NetEqImpl(const NetEq::Config& config,
       decoder_error_code_(0),
       background_noise_mode_(config.background_noise_mode),
       playout_mode_(config.playout_mode),
+      enable_fast_accelerate_(config.enable_fast_accelerate),
       decoded_packet_sequence_number_(-1),
       decoded_packet_timestamp_(0) {
   LOG(LS_INFO) << "NetEq config: " << config.ToString();
@@ -745,9 +746,12 @@ int NetEqImpl::GetAudioInternal(size_t max_length, int16_t* output,
       return_value = DoExpand(play_dtmf);
       break;
     }
-    case kAccelerate: {
+    case kAccelerate:
+    case kFastAccelerate: {
+      const bool fast_accelerate =
+          enable_fast_accelerate_ && (operation == kFastAccelerate);
       return_value = DoAccelerate(decoded_buffer_.get(), length, speech_type,
-                                  play_dtmf);
+                                  play_dtmf, fast_accelerate);
       break;
     }
     case kPreemptiveExpand: {
@@ -956,9 +960,8 @@ int NetEqImpl::GetDecision(Operations* operation,
   // Check if we already have enough samples in the |sync_buffer_|. If so,
   // change decision to normal, unless the decision was merge, accelerate, or
   // preemptive expand.
-  if (samples_left >= output_size_samples_ &&
-      *operation != kMerge &&
-      *operation != kAccelerate &&
+  if (samples_left >= output_size_samples_ && *operation != kMerge &&
+      *operation != kAccelerate && *operation != kFastAccelerate &&
       *operation != kPreemptiveExpand) {
     *operation = kNormal;
     return 0;
@@ -1034,8 +1037,9 @@ int NetEqImpl::GetDecision(Operations* operation,
       decision_logic_->set_generated_noise_samples(0);
       return 0;
     }
-    case kAccelerate: {
-      // In order to do a accelerate we need at least 30 ms of audio data.
+    case kAccelerate:
+    case kFastAccelerate: {
+      // In order to do an accelerate we need at least 30 ms of audio data.
       if (samples_left >= samples_30_ms) {
         // Already have enough data, so we do not need to extract any more.
         decision_logic_->set_sample_memory(samples_left);
@@ -1124,13 +1128,13 @@ int NetEqImpl::GetDecision(Operations* operation,
     }
   }
 
-  if (*operation == kAccelerate ||
+  if (*operation == kAccelerate || *operation == kFastAccelerate ||
       *operation == kPreemptiveExpand) {
     decision_logic_->set_sample_memory(samples_left + extracted_samples);
     decision_logic_->set_prev_time_scale(true);
   }
 
-  if (*operation == kAccelerate) {
+  if (*operation == kAccelerate || *operation == kFastAccelerate) {
     // Check that we have enough data (30ms) to do accelerate.
     if (extracted_samples + samples_left < samples_30_ms) {
       // TODO(hlundin): Write test for this.
@@ -1263,7 +1267,8 @@ int NetEqImpl::DecodeLoop(PacketList* packet_list, Operations* operation,
     assert(sync_buffer_->Channels() == decoder->Channels());
     assert(decoded_buffer_length_ >= kMaxFrameSize * decoder->Channels());
     assert(*operation == kNormal || *operation == kAccelerate ||
-           *operation == kMerge || *operation == kPreemptiveExpand);
+           *operation == kFastAccelerate || *operation == kMerge ||
+           *operation == kPreemptiveExpand);
     packet_list->pop_front();
     size_t payload_length = packet->payload_length;
     int16_t decode_length;
@@ -1427,9 +1432,11 @@ int NetEqImpl::DoExpand(bool play_dtmf) {
   return 0;
 }
 
-int NetEqImpl::DoAccelerate(int16_t* decoded_buffer, size_t decoded_length,
+int NetEqImpl::DoAccelerate(int16_t* decoded_buffer,
+                            size_t decoded_length,
                             AudioDecoder::SpeechType speech_type,
-                            bool play_dtmf) {
+                            bool play_dtmf,
+                            bool fast_accelerate) {
   const size_t required_samples = 240 * fs_mult_;  // Must have 30 ms.
   size_t borrowed_samples_per_channel = 0;
   size_t num_channels = algorithm_buffer_->Channels();
@@ -1447,9 +1454,9 @@ int NetEqImpl::DoAccelerate(int16_t* decoded_buffer, size_t decoded_length,
   }
 
   int16_t samples_removed;
-  Accelerate::ReturnCodes return_code = accelerate_->Process(
-      decoded_buffer, decoded_length, algorithm_buffer_.get(),
-      &samples_removed);
+  Accelerate::ReturnCodes return_code =
+      accelerate_->Process(decoded_buffer, decoded_length, fast_accelerate,
+                           algorithm_buffer_.get(), &samples_removed);
   stats_.AcceleratedSamples(samples_removed);
   switch (return_code) {
     case Accelerate::kSuccess:
diff --git a/webrtc/modules/audio_coding/neteq/neteq_impl.h b/webrtc/modules/audio_coding/neteq/neteq_impl.h
index 248071f825..55ba067221 100644
--- a/webrtc/modules/audio_coding/neteq/neteq_impl.h
+++ b/webrtc/modules/audio_coding/neteq/neteq_impl.h
@@ -278,7 +278,8 @@ class NetEqImpl : public webrtc::NetEq {
   int DoAccelerate(int16_t* decoded_buffer,
                    size_t decoded_length,
                    AudioDecoder::SpeechType speech_type,
-                   bool play_dtmf) EXCLUSIVE_LOCKS_REQUIRED(crit_sect_);
+                   bool play_dtmf,
+                   bool fast_accelerate) EXCLUSIVE_LOCKS_REQUIRED(crit_sect_);
 
   // Sub-method which calls the PreemptiveExpand class to perform the
   // preemptive expand operation.
@@ -392,6 +393,7 @@ class NetEqImpl : public webrtc::NetEq {
   int decoder_error_code_ GUARDED_BY(crit_sect_);
   const BackgroundNoiseMode background_noise_mode_ GUARDED_BY(crit_sect_);
   NetEqPlayoutMode playout_mode_ GUARDED_BY(crit_sect_);
+  bool enable_fast_accelerate_ GUARDED_BY(crit_sect_);
 
   // These values are used by NACK module to estimate time-to-play of
   // a missing packet. Occasionally, NetEq might decide to decode more
diff --git a/webrtc/modules/audio_coding/neteq/preemptive_expand.cc b/webrtc/modules/audio_coding/neteq/preemptive_expand.cc
index b2dc3e60cb..6a3f8ecf1a 100644
--- a/webrtc/modules/audio_coding/neteq/preemptive_expand.cc
+++ b/webrtc/modules/audio_coding/neteq/preemptive_expand.cc
@@ -34,7 +34,8 @@ PreemptiveExpand::ReturnCodes PreemptiveExpand::Process(
     output->PushBackInterleaved(input, input_length);
     return kError;
   }
-  return TimeStretch::Process(input, input_length, output,
+  const bool kFastMode = false;  // Fast mode is not available for PE Expand.
+  return TimeStretch::Process(input, input_length, kFastMode, output,
                               length_change_samples);
 }
 
@@ -54,8 +55,12 @@ void PreemptiveExpand::SetParametersForPassiveSpeech(size_t len,
 }
 
 PreemptiveExpand::ReturnCodes PreemptiveExpand::CheckCriteriaAndStretch(
-    const int16_t *input, size_t input_length, size_t peak_index,
-    int16_t best_correlation, bool active_speech,
+    const int16_t* input,
+    size_t input_length,
+    size_t peak_index,
+    int16_t best_correlation,
+    bool active_speech,
+    bool /*fast_mode*/,
     AudioMultiVector* output) const {
   // Pre-calculate common multiplication with |fs_mult_|.
   // 120 corresponds to 15 ms.
diff --git a/webrtc/modules/audio_coding/neteq/preemptive_expand.h b/webrtc/modules/audio_coding/neteq/preemptive_expand.h
index 750c16bde3..c583a48a5b 100644
--- a/webrtc/modules/audio_coding/neteq/preemptive_expand.h
+++ b/webrtc/modules/audio_coding/neteq/preemptive_expand.h
@@ -58,11 +58,12 @@ class PreemptiveExpand : public TimeStretch {
 
   // Checks the criteria for performing the time-stretching operation and,
   // if possible, performs the time-stretching.
-  ReturnCodes CheckCriteriaAndStretch(const int16_t* pw16_decoded,
-                                      size_t len,
-                                      size_t w16_bestIndex,
-                                      int16_t w16_bestCorr,
-                                      bool w16_VAD,
+  ReturnCodes CheckCriteriaAndStretch(const int16_t* input,
+                                      size_t input_length,
+                                      size_t peak_index,
+                                      int16_t best_correlation,
+                                      bool active_speech,
+                                      bool /*fast_mode*/,
                                       AudioMultiVector* output) const override;
 
  private:
diff --git a/webrtc/modules/audio_coding/neteq/time_stretch.cc b/webrtc/modules/audio_coding/neteq/time_stretch.cc
index 02305c83e5..5577cd2ecb 100644
--- a/webrtc/modules/audio_coding/neteq/time_stretch.cc
+++ b/webrtc/modules/audio_coding/neteq/time_stretch.cc
@@ -19,12 +19,11 @@
 namespace webrtc {
 
-TimeStretch::ReturnCodes TimeStretch::Process(
-    const int16_t* input,
-    size_t input_len,
-    AudioMultiVector* output,
-    int16_t* length_change_samples) {
-
+TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,
+                                              size_t input_len,
+                                              bool fast_mode,
+                                              AudioMultiVector* output,
+                                              int16_t* length_change_samples) {
   // Pre-calculate common multiplication with |fs_mult_|.
   int fs_mult_120 = fs_mult_ * 120;  // Corresponds to 15 ms.
 
@@ -140,8 +139,9 @@ TimeStretch::ReturnCodes TimeStretch::Process(
 
   // Check accelerate criteria and stretch the signal.
-  ReturnCodes return_value = CheckCriteriaAndStretch(
-      input, input_len, peak_index, best_correlation, active_speech, output);
+  ReturnCodes return_value =
+      CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation,
+                              active_speech, fast_mode, output);
   switch (return_value) {
     case kSuccess:
       *length_change_samples = peak_index;
diff --git a/webrtc/modules/audio_coding/neteq/time_stretch.h b/webrtc/modules/audio_coding/neteq/time_stretch.h
index 9396d8ff51..7c84e1a153 100644
--- a/webrtc/modules/audio_coding/neteq/time_stretch.h
+++ b/webrtc/modules/audio_coding/neteq/time_stretch.h
@@ -58,6 +58,7 @@ class TimeStretch {
   // PreemptiveExpand.
   ReturnCodes Process(const int16_t* input,
                       size_t input_len,
+                      bool fast_mode,
                       AudioMultiVector* output,
                       int16_t* length_change_samples);
 
@@ -73,8 +74,12 @@ class TimeStretch {
   // if possible, performs the time-stretching. This method must be implemented
   // by the sub-classes.
   virtual ReturnCodes CheckCriteriaAndStretch(
-      const int16_t* input, size_t input_length, size_t peak_index,
-      int16_t best_correlation, bool active_speech,
+      const int16_t* input,
+      size_t input_length,
+      size_t peak_index,
+      int16_t best_correlation,
+      bool active_speech,
+      bool fast_mode,
       AudioMultiVector* output) const = 0;
 
   static const int kCorrelationLen = 50;
diff --git a/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc b/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc
index 3d1e06936a..05385a1e3e 100644
--- a/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc
+++ b/webrtc/modules/audio_coding/neteq/time_stretch_unittest.cc
@@ -13,14 +13,24 @@
 #include "webrtc/modules/audio_coding/neteq/accelerate.h"
 #include "webrtc/modules/audio_coding/neteq/preemptive_expand.h"
 
+#include <map>
+
 #include "testing/gtest/include/gtest/gtest.h"
+#include "webrtc/base/checks.h"
+#include "webrtc/base/scoped_ptr.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
 #include "webrtc/modules/audio_coding/neteq/background_noise.h"
+#include "webrtc/modules/audio_coding/neteq/tools/input_audio_file.h"
+#include "webrtc/test/testsupport/fileutils.h"
 
 namespace webrtc {
 
+namespace {
+const size_t kNumChannels = 1;
+}
+
 TEST(TimeStretch, CreateAndDestroy) {
   const int kSampleRate = 8000;
-  const size_t kNumChannels = 1;
   const int kOverlapSamples = 5 * kSampleRate / 8000;
   BackgroundNoise bgn(kNumChannels);
   Accelerate accelerate(kSampleRate, kNumChannels, bgn);
@@ -30,7 +40,6 @@ TEST(TimeStretch, CreateAndDestroy) {
 
 TEST(TimeStretch, CreateUsingFactory) {
   const int kSampleRate = 8000;
-  const size_t kNumChannels = 1;
   const int kOverlapSamples = 5 * kSampleRate / 8000;
   BackgroundNoise bgn(kNumChannels);
 
@@ -47,6 +56,72 @@
   delete accelerate;
   delete preemptive_expand;
 }
 
-// TODO(hlundin): Write more tests.
+class TimeStretchTest : public ::testing::Test {
+ protected:
+  TimeStretchTest()
+      : input_file_(new test::InputAudioFile(
+            test::ResourcePath("audio_coding/testfile32kHz", "pcm"))),
+        sample_rate_hz_(32000),
+        block_size_(30 * sample_rate_hz_ / 1000),  // 30 ms
+        audio_(new int16_t[block_size_]),
+        background_noise_(kNumChannels) {
+    WebRtcSpl_Init();
+  }
+
+  const int16_t* Next30Ms() {
+    CHECK(input_file_->Read(block_size_, audio_.get()));
+    return audio_.get();
+  }
+
+  // Returns the total length change (in samples) that the accelerate operation
+  // resulted in during the run.
+  int TestAccelerate(int loops, bool fast_mode) {
+    Accelerate accelerate(sample_rate_hz_, kNumChannels, background_noise_);
+    int total_length_change = 0;
+    for (int i = 0; i < loops; ++i) {
+      AudioMultiVector output(kNumChannels);
+      int16_t length_change;
+      UpdateReturnStats(accelerate.Process(Next30Ms(), block_size_, fast_mode,
+                                           &output, &length_change));
+      total_length_change += length_change;
+    }
+    return total_length_change;
+  }
+
+  void UpdateReturnStats(TimeStretch::ReturnCodes ret) {
+    switch (ret) {
+      case TimeStretch::kSuccess:
+      case TimeStretch::kSuccessLowEnergy:
+      case TimeStretch::kNoStretch:
+        ++return_stats_[ret];
+        break;
+      case TimeStretch::kError:
+        FAIL() << "Process returned an error";
+    }
+  }
+
+  rtc::scoped_ptr<test::InputAudioFile> input_file_;
+  const int sample_rate_hz_;
+  const size_t block_size_;
+  rtc::scoped_ptr<int16_t[]> audio_;
+  std::map<TimeStretch::ReturnCodes, int> return_stats_;
+  BackgroundNoise background_noise_;
+};
+
+TEST_F(TimeStretchTest, Accelerate) {
+  // TestAccelerate returns the total length change in samples.
+  EXPECT_EQ(15268, TestAccelerate(100, false));
+  EXPECT_EQ(9, return_stats_[TimeStretch::kSuccess]);
+  EXPECT_EQ(58, return_stats_[TimeStretch::kSuccessLowEnergy]);
+  EXPECT_EQ(33, return_stats_[TimeStretch::kNoStretch]);
+}
+
+TEST_F(TimeStretchTest, AccelerateFastMode) {
+  // TestAccelerate returns the total length change in samples.
+  EXPECT_EQ(21400, TestAccelerate(100, true));
+  EXPECT_EQ(31, return_stats_[TimeStretch::kSuccess]);
+  EXPECT_EQ(58, return_stats_[TimeStretch::kSuccessLowEnergy]);
+  EXPECT_EQ(11, return_stats_[TimeStretch::kNoStretch]);
+}
 
 }  // namespace webrtc
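As a rough cross-check of the expectations above: each loop consumes one 30 ms block at 32 kHz, i.e. 960 samples, so 100 loops read 96000 samples. The snippet below is only back-of-the-envelope arithmetic on the expected values, not part of the test:

// 100 blocks * 30 ms * 32 kHz = 96000 input samples per test run.
constexpr int kInputSamples = 100 * 960;
constexpr int kRemovedNormal = 15268;  // Expectation in TimeStretchTest.Accelerate.
constexpr int kRemovedFast = 21400;    // Expectation in AccelerateFastMode.
// Normal accelerate removes roughly 16% of the input and fast mode roughly
// 22%, so on this file fast mode drains a too-full buffer about 1.4x faster.
static_assert(kRemovedNormal < kRemovedFast && kRemovedFast < kInputSamples,
              "fast mode is expected to remove more samples, but not all");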