From 46dda83bcb3583401ee6c378f5bee42b6cb619a3 Mon Sep 17 00:00:00 2001 From: Jakob Ivarsson Date: Wed, 3 Jul 2019 16:00:30 +0200 Subject: [PATCH] Improve buffer level estimation with DTX and add CNG time stretching. The functionality is hidden behind field trial for experimentation. Bug: webrtc:10736 Change-Id: I1daf60966717c3ea43bf6ee16d190290ab740ce7 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/144059 Commit-Queue: Jakob Ivarsson Reviewed-by: Minyue Li Cr-Commit-Position: refs/heads/master@{#28474} --- modules/audio_coding/BUILD.gn | 1 + modules/audio_coding/neteq/decision_logic.cc | 110 +++++++++++++------ modules/audio_coding/neteq/decision_logic.h | 7 +- modules/audio_coding/neteq/packet_buffer.cc | 13 ++- modules/audio_coding/neteq/packet_buffer.h | 4 +- 5 files changed, 100 insertions(+), 35 deletions(-) diff --git a/modules/audio_coding/BUILD.gn b/modules/audio_coding/BUILD.gn index 3976600c54..4d3e36f6e7 100644 --- a/modules/audio_coding/BUILD.gn +++ b/modules/audio_coding/BUILD.gn @@ -1029,6 +1029,7 @@ rtc_static_library("neteq") { "../../rtc_base:rtc_base_approved", "../../rtc_base:safe_minmax", "../../rtc_base:sanitizer", + "../../rtc_base/experiments:field_trial_parser", "../../rtc_base/system:fallthrough", "../../system_wrappers", "../../system_wrappers:field_trial", diff --git a/modules/audio_coding/neteq/decision_logic.cc b/modules/audio_coding/neteq/decision_logic.cc index f9f420af0e..fc255e54a9 100644 --- a/modules/audio_coding/neteq/decision_logic.cc +++ b/modules/audio_coding/neteq/decision_logic.cc @@ -14,6 +14,7 @@ #include #include +#include "absl/types/optional.h" #include "modules/audio_coding/neteq/buffer_level_filter.h" #include "modules/audio_coding/neteq/decoder_database.h" #include "modules/audio_coding/neteq/delay_manager.h" @@ -21,12 +22,15 @@ #include "modules/audio_coding/neteq/packet_buffer.h" #include "modules/audio_coding/neteq/sync_buffer.h" #include "rtc_base/checks.h" +#include 
"rtc_base/experiments/field_trial_parser.h" #include "rtc_base/logging.h" #include "rtc_base/numerics/safe_conversions.h" +#include "system_wrappers/include/field_trial.h" namespace { constexpr int kPostponeDecodingLevel = 50; +constexpr int kDefaultTargetLevelWindowMs = 100; } // namespace @@ -65,8 +69,24 @@ DecisionLogic::DecisionLogic(int fs_hz, disallow_time_stretching_(disallow_time_stretching), timescale_countdown_( tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1)), - num_consecutive_expands_(0) { + num_consecutive_expands_(0), + time_stretched_cn_samples_(0), + estimate_dtx_delay_("estimate_dtx_delay", false), + time_stretch_cn_("time_stretch_cn", false), + target_level_window_ms_("target_level_window", + kDefaultTargetLevelWindowMs, + 0, + absl::nullopt) { SetSampleRate(fs_hz, output_size_samples); + const std::string field_trial_name = + field_trial::FindFullName("WebRTC-Audio-NetEqDecisionLogicSettings"); + ParseFieldTrial( + {&estimate_dtx_delay_, &time_stretch_cn_, &target_level_window_ms_}, + field_trial_name); + RTC_LOG(LS_INFO) << "NetEq decision logic settings:" + << " estimate_dtx_delay=" << estimate_dtx_delay_ + << " time_stretch_cn=" << time_stretch_cn_ + << " target_level_window_ms=" << target_level_window_ms_; } DecisionLogic::~DecisionLogic() = default; @@ -79,6 +99,7 @@ void DecisionLogic::Reset() { prev_time_scale_ = false; timescale_countdown_.reset(); num_consecutive_expands_ = 0; + time_stretched_cn_samples_ = 0; } void DecisionLogic::SoftReset() { @@ -87,12 +108,13 @@ void DecisionLogic::SoftReset() { prev_time_scale_ = false; timescale_countdown_ = tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1); + time_stretched_cn_samples_ = 0; } void DecisionLogic::SetSampleRate(int fs_hz, size_t output_size_samples) { // TODO(hlundin): Change to an enumerator and skip assert. 
assert(fs_hz == 8000 || fs_hz == 16000 || fs_hz == 32000 || fs_hz == 48000); - fs_mult_ = fs_hz / 8000; + sample_rate_ = fs_hz; output_size_samples_ = output_size_samples; } @@ -113,9 +135,11 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer, cng_state_ = kCngInternalOn; } - // TODO(jakobi): Use buffer span instead of num samples. - const size_t cur_size_samples = - packet_buffer_.NumSamplesInBuffer(decoder_frame_length); + size_t cur_size_samples = + estimate_dtx_delay_ + ? packet_buffer_.GetSpanSamples(decoder_frame_length, sample_rate_, + true) + : packet_buffer_.NumSamplesInBuffer(decoder_frame_length); prev_time_scale_ = prev_time_scale_ && (prev_mode == kModeAccelerateSuccess || @@ -125,9 +149,9 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer, // Do not update buffer history if currently playing CNG since it will bias // the filtered buffer level. - if ((prev_mode != kModeRfc3389Cng) && (prev_mode != kModeCodecInternalCng) && + if (prev_mode != kModeRfc3389Cng && prev_mode != kModeCodecInternalCng && !(next_packet && next_packet->frame && - next_packet->frame->IsDtxPacket())) { + next_packet->frame->IsDtxPacket() && !estimate_dtx_delay_)) { FilterBufferLevel(cur_size_samples); } @@ -173,7 +197,8 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer, // if the mute factor is low enough (otherwise the expansion was short enough // to not be noticable). // Note that the MuteFactor is in Q14, so a value of 16384 corresponds to 1. 
- size_t current_span = packet_buffer_.GetSpanSamples(decoder_frame_length); + size_t current_span = packet_buffer_.GetSpanSamples( + decoder_frame_length, sample_rate_, estimate_dtx_delay_); if ((prev_mode == kModeExpand || prev_mode == kModeCodecPlc) && expand.MuteFactor(0) < 16384 / 2 && current_span < static_cast(delay_manager_->TargetLevel() * @@ -183,8 +208,7 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer, return kExpand; } - const uint32_t five_seconds_samples = - static_cast(5 * 8000 * fs_mult_); + const uint32_t five_seconds_samples = static_cast(5 * sample_rate_); // Check if the required packet is available. if (target_timestamp == available_timestamp) { return ExpectedPacketAvailable(prev_mode, play_dtmf); @@ -212,14 +236,15 @@ void DecisionLogic::FilterBufferLevel(size_t buffer_size_samples) { buffer_level_filter_->SetTargetBufferLevel( delay_manager_->base_target_level()); - int sample_memory_local = 0; + int time_stretched_samples = time_stretched_cn_samples_; if (prev_time_scale_) { - sample_memory_local = sample_memory_; + time_stretched_samples += sample_memory_; timescale_countdown_ = tick_timer_->GetNewCountdown(kMinTimescaleInterval); } - buffer_level_filter_->Update(buffer_size_samples, sample_memory_local); + buffer_level_filter_->Update(buffer_size_samples, time_stretched_samples); prev_time_scale_ = false; + time_stretched_cn_samples_ = 0; } Operations DecisionLogic::CngOperation(Modes prev_mode, @@ -323,30 +348,53 @@ Operations DecisionLogic::FuturePacketAvailable( return kNormal; } - const size_t cur_size_samples = - packet_buffer_.NumPacketsInBuffer() * decoder_frame_length; - // If previous was comfort noise, then no merge is needed. if (prev_mode == kModeRfc3389Cng || prev_mode == kModeCodecInternalCng) { - // Keep the same delay as before the CNG, but make sure that the number of - // samples in buffer is no higher than 4 times the optimal level. (Note that - // TargetLevel() is in Q8.) 
- if (static_cast(generated_noise_samples + target_timestamp) >= - available_timestamp || - cur_size_samples > - ((delay_manager_->TargetLevel() * packet_length_samples_) >> 8) * - 4) { - // Time to play this new packet. - return kNormal; + size_t cur_size_samples = + estimate_dtx_delay_ + ? packet_buffer_.GetSpanSamples( + decoder_frame_length, sample_rate_, true) + : packet_buffer_.NumPacketsInBuffer() * decoder_frame_length; + // Target level is in number of packets in Q8. + const size_t target_level_samples = + (delay_manager_->TargetLevel() * packet_length_samples_) >> 8; + const bool generated_enough_noise = + static_cast(generated_noise_samples + target_timestamp) >= + available_timestamp; + + if (time_stretch_cn_) { + const size_t target_threshold_samples = + target_level_window_ms_ / 2 * (sample_rate_ / 1000); + const bool above_target_window = + cur_size_samples > target_level_samples + target_threshold_samples; + const bool below_target_window = + target_level_samples > target_threshold_samples && + cur_size_samples < target_level_samples - target_threshold_samples; + // Keep the delay same as before CNG, but make sure that it is within the + // target window. + if ((generated_enough_noise && !below_target_window) || + above_target_window) { + time_stretched_cn_samples_ = timestamp_leap - generated_noise_samples; + return kNormal; + } } else { - // Too early to play this new packet; keep on playing comfort noise. - if (prev_mode == kModeRfc3389Cng) { - return kRfc3389CngNoPacket; - } else { // prevPlayMode == kModeCodecInternalCng. - return kCodecInternalCng; + // Keep the same delay as before the CNG, but make sure that the number of + // samples in buffer is no higher than 4 times the optimal level. + if (generated_enough_noise || + cur_size_samples > target_level_samples * 4) { + // Time to play this new packet. + return kNormal; } } + + // Too early to play this new packet; keep on playing comfort noise.
+ if (prev_mode == kModeRfc3389Cng) { + return kRfc3389CngNoPacket; + } + // prevPlayMode == kModeCodecInternalCng. + return kCodecInternalCng; } + // Do not merge unless we have done an expand before. if (prev_mode == kModeExpand) { return kMerge; diff --git a/modules/audio_coding/neteq/decision_logic.h b/modules/audio_coding/neteq/decision_logic.h index 49020b0aab..5a9bffb1bf 100644 --- a/modules/audio_coding/neteq/decision_logic.h +++ b/modules/audio_coding/neteq/decision_logic.h @@ -14,6 +14,7 @@ #include "modules/audio_coding/neteq/defines.h" #include "modules/audio_coding/neteq/tick_timer.h" #include "rtc_base/constructor_magic.h" +#include "rtc_base/experiments/field_trial_parser.h" namespace webrtc { @@ -167,7 +168,7 @@ class DecisionLogic final { DelayManager* delay_manager_; BufferLevelFilter* buffer_level_filter_; const TickTimer* tick_timer_; - int fs_mult_; + int sample_rate_; size_t output_size_samples_; CngState cng_state_; // Remember if comfort noise is interrupted by other // event (e.g., DTMF). 
@@ -178,6 +179,10 @@ class DecisionLogic final { bool disallow_time_stretching_; std::unique_ptr timescale_countdown_; int num_consecutive_expands_; + int time_stretched_cn_samples_; + FieldTrialParameter estimate_dtx_delay_; + FieldTrialParameter time_stretch_cn_; + FieldTrialConstrained target_level_window_ms_; RTC_DISALLOW_COPY_AND_ASSIGN(DecisionLogic); }; diff --git a/modules/audio_coding/neteq/packet_buffer.cc b/modules/audio_coding/neteq/packet_buffer.cc index e90fadce3a..540d2792a1 100644 --- a/modules/audio_coding/neteq/packet_buffer.cc +++ b/modules/audio_coding/neteq/packet_buffer.cc @@ -26,6 +26,7 @@ #include "modules/audio_coding/neteq/tick_timer.h" #include "rtc_base/checks.h" #include "rtc_base/logging.h" +#include "rtc_base/numerics/safe_conversions.h" namespace webrtc { namespace { @@ -287,14 +288,22 @@ size_t PacketBuffer::NumSamplesInBuffer(size_t last_decoded_length) const { return num_samples; } -size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length) const { +size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length, + size_t sample_rate, + bool count_dtx_waiting_time) const { if (buffer_.size() == 0) { return 0; } size_t span = buffer_.back().timestamp - buffer_.front().timestamp; if (buffer_.back().frame && buffer_.back().frame->Duration() > 0) { - span += buffer_.back().frame->Duration(); + size_t duration = buffer_.back().frame->Duration(); + if (count_dtx_waiting_time && buffer_.back().frame->IsDtxPacket()) { + size_t waiting_time_samples = rtc::dchecked_cast( + buffer_.back().waiting_time->ElapsedMs() * (sample_rate / 1000)); + duration = std::max(duration, waiting_time_samples); + } + span += duration; } else { span += last_decoded_length; } diff --git a/modules/audio_coding/neteq/packet_buffer.h b/modules/audio_coding/neteq/packet_buffer.h index 0837027a5e..c00db294c0 100644 --- a/modules/audio_coding/neteq/packet_buffer.h +++ b/modules/audio_coding/neteq/packet_buffer.h @@ -123,7 +123,9 @@ class PacketBuffer { // Returns 
the total duration in samples that the packets in the buffer spans // across. - virtual size_t GetSpanSamples(size_t last_decoded_length) const; + virtual size_t GetSpanSamples(size_t last_decoded_length, + size_t sample_rate, + bool count_dtx_waiting_time) const; // Returns true if the packet buffer contains any DTX or CNG packets. virtual bool ContainsDtxOrCngPacket(