From d6ef33e59b81a2d85e91aa7a4367c36320eefc81 Mon Sep 17 00:00:00 2001 From: Tommi Date: Wed, 3 Jul 2024 14:23:48 +0200 Subject: [PATCH] Remove PushResampler::InitializeIfNeeded This switches from accepting a sample rate and convert to channel size over to accepting the channel size. Instead of InitializeIfNeeded: * Offer a way to explicitly initialize PushResampler via the ctor (needed for VoiceActivityDetectorWrapper) * Implicitly check for the right configuration from within Resample(). (All calls to Resample() were preceded by a call to Initialize) As part of this, refactor VoiceActivityDetectorWrapper (VADW): * VADW is now initialized in the constructor and more const. * Remove VADW::Initialize() and instead reconstruct VADW if needed. Add constants for max sample rate and num channels to audio_util.h In many cases the numbers for these values are embedded in the code which has led to some inconsistency. Bug: chromium:335805780 Change-Id: Iead0d52eb1b261a8d64e93f51401147c8fba32f0 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/353360 Reviewed-by: Gustaf Ullberg Commit-Queue: Tomas Gunnarsson Cr-Commit-Position: refs/heads/main@{#42587} --- audio/audio_transport_impl.cc | 3 - audio/remix_resample.cc | 8 --- common_audio/include/audio_util.h | 11 ++++ .../resampler/include/push_resampler.h | 19 +++--- common_audio/resampler/push_resampler.cc | 61 +++++++++++-------- .../resampler/push_resampler_unittest.cc | 21 +++---- modules/audio_coding/acm2/acm_resampler.cc | 8 --- modules/audio_processing/agc2/vad_wrapper.cc | 24 +++----- modules/audio_processing/agc2/vad_wrapper.h | 7 +-- .../agc2/vad_wrapper_unittest.cc | 2 +- modules/audio_processing/audio_buffer.cc | 16 +++-- modules/audio_processing/audio_buffer.h | 4 +- .../audio_processing/audio_processing_impl.cc | 16 ++--- .../audio_processing_unittest.cc | 4 +- 14 files changed, 94 insertions(+), 110 deletions(-) diff --git a/audio/audio_transport_impl.cc b/audio/audio_transport_impl.cc index 363b924f10..b9d83321b9 100644 --- a/audio/audio_transport_impl.cc +++ b/audio/audio_transport_impl.cc @@ -81,9 +81,6 @@ int Resample(const AudioFrame& frame, RTC_CHECK_EQ(destination.data().size(), frame.num_channels_ * target_number_of_samples_per_channel); - resampler->InitializeIfNeeded(frame.sample_rate_hz_, destination_sample_rate, - static_cast(frame.num_channels())); - // TODO(yujo): Add special case handling of muted frames. return resampler->Resample(frame.data_view(), destination); } diff --git a/audio/remix_resample.cc b/audio/remix_resample.cc index b5f673a07c..93e86517ae 100644 --- a/audio/remix_resample.cc +++ b/audio/remix_resample.cc @@ -61,14 +61,6 @@ void RemixAndResample(InterleavedView src_data, src_data = downmixed; } - if (resampler->InitializeIfNeeded(sample_rate_hz, dst_frame->sample_rate_hz_, - src_data.num_channels()) == -1) { - RTC_FATAL() << "InitializeIfNeeded failed: sample_rate_hz = " - << sample_rate_hz << ", dst_frame->sample_rate_hz_ = " - << dst_frame->sample_rate_hz_ - << ", num_channels = " << src_data.num_channels(); - } - // TODO(yujo): for muted input frames, don't resample. Either 1) allow // resampler to return output length without doing the resample, so we know // how much to zero here; or 2) make resampler accept a hint that the input is diff --git a/common_audio/include/audio_util.h b/common_audio/include/audio_util.h index 12e65259a4..672ebd1aa0 100644 --- a/common_audio/include/audio_util.h +++ b/common_audio/include/audio_util.h @@ -25,6 +25,17 @@ namespace webrtc { typedef std::numeric_limits limits_int16; +// TODO(tommi, peah): Move these constants to their own header, e.g. +// `audio_constants.h`. Also consider if they should be in api/. + +// Absolute highest acceptable sample rate supported for audio processing, +// capture and codecs. Note that for some components some cases a lower limit +// applies which typically is 48000 but in some cases is lower. +constexpr int kMaxSampleRateHz = 384000; + +// Number of samples per channel for 10ms of audio at the highest sample rate. +constexpr size_t kMaxSamplesPerChannel10ms = kMaxSampleRateHz / 100u; + // The conversion functions use the following naming convention: // S16: int16_t [-32768, 32767] // Float: float [-1.0, 1.0] diff --git a/common_audio/resampler/include/push_resampler.h b/common_audio/resampler/include/push_resampler.h index 4a23d4624e..394e96b133 100644 --- a/common_audio/resampler/include/push_resampler.h +++ b/common_audio/resampler/include/push_resampler.h @@ -23,16 +23,13 @@ class PushSincResampler; // Wraps PushSincResampler to provide stereo support. // Note: This implementation assumes 10ms buffer sizes throughout. template -class PushResampler { +class PushResampler final { public: PushResampler(); - virtual ~PushResampler(); - - // Must be called whenever the parameters change. Free to be called at any - // time as it is a no-op if parameters have not changed since the last call. - int InitializeIfNeeded(int src_sample_rate_hz, - int dst_sample_rate_hz, - size_t num_channels); + PushResampler(size_t src_samples_per_channel, + size_t dst_samples_per_channel, + size_t num_channels); + ~PushResampler(); // Returns the total number of samples provided in destination (e.g. 32 kHz, // 2 channel audio gives 640 samples). @@ -42,6 +39,12 @@ class PushResampler { int Resample(MonoView src, MonoView dst); private: + // Ensures that source and destination buffers for deinterleaving are + // correctly configured prior to resampling that requires deinterleaving. + void EnsureInitialized(size_t src_samples_per_channel, + size_t dst_samples_per_channel, + size_t num_channels); + // Buffers used for when a deinterleaving step is necessary. std::unique_ptr source_; std::unique_ptr destination_; diff --git a/common_audio/resampler/push_resampler.cc b/common_audio/resampler/push_resampler.cc index 1cbf421b6d..2e75679c82 100644 --- a/common_audio/resampler/push_resampler.cc +++ b/common_audio/resampler/push_resampler.cc @@ -22,55 +22,66 @@ namespace webrtc { +namespace { +// Maximum concurrent number of channels for `PushResampler<>`. +// Note that this may be different from what the maximum is for audio codecs. +constexpr int kMaxNumberOfChannels = 8; +} // namespace + template PushResampler::PushResampler() = default; +template +PushResampler::PushResampler(size_t src_samples_per_channel, + size_t dst_samples_per_channel, + size_t num_channels) { + EnsureInitialized(src_samples_per_channel, dst_samples_per_channel, + num_channels); +} + template PushResampler::~PushResampler() = default; template -int PushResampler::InitializeIfNeeded(int src_sample_rate_hz, - int dst_sample_rate_hz, +void PushResampler::EnsureInitialized(size_t src_samples_per_channel, + size_t dst_samples_per_channel, size_t num_channels) { - // These checks used to be factored out of this template function due to - // Windows debug build issues with clang. http://crbug.com/615050 - RTC_CHECK_GT(src_sample_rate_hz, 0); - RTC_CHECK_GT(dst_sample_rate_hz, 0); - RTC_CHECK_GT(num_channels, 0); + RTC_DCHECK_GT(src_samples_per_channel, 0); + RTC_DCHECK_GT(dst_samples_per_channel, 0); + RTC_DCHECK_GT(num_channels, 0); + RTC_DCHECK_LE(src_samples_per_channel, kMaxSamplesPerChannel10ms); + RTC_DCHECK_LE(dst_samples_per_channel, kMaxSamplesPerChannel10ms); + RTC_DCHECK_LE(num_channels, kMaxNumberOfChannels); - const size_t src_size_10ms_mono = - static_cast(src_sample_rate_hz / 100); - const size_t dst_size_10ms_mono = - static_cast(dst_sample_rate_hz / 100); - - if (src_size_10ms_mono == SamplesPerChannel(source_view_) && - dst_size_10ms_mono == SamplesPerChannel(destination_view_) && + if (src_samples_per_channel == SamplesPerChannel(source_view_) && + dst_samples_per_channel == SamplesPerChannel(destination_view_) && num_channels == NumChannels(source_view_)) { // No-op if settings haven't changed. - return 0; + return; } // Allocate two buffers for all source and destination channels. // Then organize source and destination views together with an array of // resamplers for each channel in the deinterlaved buffers. - source_.reset(new T[src_size_10ms_mono * num_channels]); - destination_.reset(new T[dst_size_10ms_mono * num_channels]); - source_view_ = - DeinterleavedView(source_.get(), src_size_10ms_mono, num_channels); - destination_view_ = DeinterleavedView(destination_.get(), - dst_size_10ms_mono, num_channels); + source_.reset(new T[src_samples_per_channel * num_channels]); + destination_.reset(new T[dst_samples_per_channel * num_channels]); + source_view_ = DeinterleavedView(source_.get(), src_samples_per_channel, + num_channels); + destination_view_ = DeinterleavedView( + destination_.get(), dst_samples_per_channel, num_channels); resamplers_.resize(num_channels); for (size_t i = 0; i < num_channels; ++i) { - resamplers_[i] = std::make_unique(src_size_10ms_mono, - dst_size_10ms_mono); + resamplers_[i] = std::make_unique( + src_samples_per_channel, dst_samples_per_channel); } - - return 0; } template int PushResampler::Resample(InterleavedView src, InterleavedView dst) { + EnsureInitialized(SamplesPerChannel(src), SamplesPerChannel(dst), + NumChannels(src)); + RTC_DCHECK_EQ(NumChannels(src), NumChannels(source_view_)); RTC_DCHECK_EQ(NumChannels(dst), NumChannels(destination_view_)); RTC_DCHECK_EQ(SamplesPerChannel(src), SamplesPerChannel(source_view_)); diff --git a/common_audio/resampler/push_resampler_unittest.cc b/common_audio/resampler/push_resampler_unittest.cc index 91f2233aad..4fba2f412e 100644 --- a/common_audio/resampler/push_resampler_unittest.cc +++ b/common_audio/resampler/push_resampler_unittest.cc @@ -19,29 +19,24 @@ namespace webrtc { TEST(PushResamplerTest, VerifiesInputParameters) { - PushResampler resampler; - EXPECT_EQ(0, resampler.InitializeIfNeeded(16000, 16000, 1)); - EXPECT_EQ(0, resampler.InitializeIfNeeded(16000, 16000, 2)); - EXPECT_EQ(0, resampler.InitializeIfNeeded(16000, 16000, 8)); + PushResampler resampler1(160, 160, 1); + PushResampler resampler2(160, 160, 2); + PushResampler resampler3(160, 160, 8); } #if RTC_DCHECK_IS_ON && GTEST_HAS_DEATH_TEST && !defined(WEBRTC_ANDROID) TEST(PushResamplerDeathTest, VerifiesBadInputParameters1) { - PushResampler resampler; - RTC_EXPECT_DEATH(resampler.InitializeIfNeeded(-1, 16000, 1), - "src_sample_rate_hz"); + RTC_EXPECT_DEATH(PushResampler(-1, 160, 1), + "src_samples_per_channel"); } TEST(PushResamplerDeathTest, VerifiesBadInputParameters2) { - PushResampler resampler; - RTC_EXPECT_DEATH(resampler.InitializeIfNeeded(16000, -1, 1), - "dst_sample_rate_hz"); + RTC_EXPECT_DEATH(PushResampler(160, -1, 1), + "dst_samples_per_channel"); } TEST(PushResamplerDeathTest, VerifiesBadInputParameters3) { - PushResampler resampler; - RTC_EXPECT_DEATH(resampler.InitializeIfNeeded(16000, 16000, 0), - "num_channels"); + RTC_EXPECT_DEATH(PushResampler(160, 16000, 0), "num_channels"); } #endif diff --git a/modules/audio_coding/acm2/acm_resampler.cc b/modules/audio_coding/acm2/acm_resampler.cc index 6e2a3bcac4..0f1be66eb5 100644 --- a/modules/audio_coding/acm2/acm_resampler.cc +++ b/modules/audio_coding/acm2/acm_resampler.cc @@ -44,14 +44,6 @@ int ACMResampler::Resample10Msec(const int16_t* in_audio, return static_cast(dst.samples_per_channel()); } - if (resampler_.InitializeIfNeeded(in_freq_hz, out_freq_hz, - num_audio_channels) != 0) { - RTC_LOG(LS_ERROR) << "InitializeIfNeeded(" << in_freq_hz << ", " - << out_freq_hz << ", " << num_audio_channels - << ") failed."; - return -1; - } - int out_length = resampler_.Resample(src, dst); if (out_length == -1) { RTC_LOG(LS_ERROR) << "Resample(" << in_audio << ", " << src.data().size() diff --git a/modules/audio_processing/agc2/vad_wrapper.cc b/modules/audio_processing/agc2/vad_wrapper.cc index d4fa32ada9..3bafdc7c56 100644 --- a/modules/audio_processing/agc2/vad_wrapper.cc +++ b/modules/audio_processing/agc2/vad_wrapper.cc @@ -73,28 +73,20 @@ VoiceActivityDetectorWrapper::VoiceActivityDetectorWrapper( int sample_rate_hz) : vad_reset_period_frames_( rtc::CheckedDivExact(vad_reset_period_ms, kFrameDurationMs)), + frame_size_(rtc::CheckedDivExact(sample_rate_hz, kNumFramesPerSecond)), time_to_vad_reset_(vad_reset_period_frames_), - vad_(std::move(vad)) { - RTC_DCHECK(vad_); + vad_(std::move(vad)), + resampled_buffer_( + rtc::CheckedDivExact(vad_->SampleRateHz(), kNumFramesPerSecond)), + resampler_(frame_size_, + resampled_buffer_.size(), + /*num_channels=*/1) { RTC_DCHECK_GT(vad_reset_period_frames_, 1); - resampled_buffer_.resize( - rtc::CheckedDivExact(vad_->SampleRateHz(), kNumFramesPerSecond)); - Initialize(sample_rate_hz); + vad_->Reset(); } VoiceActivityDetectorWrapper::~VoiceActivityDetectorWrapper() = default; -void VoiceActivityDetectorWrapper::Initialize(int sample_rate_hz) { - RTC_DCHECK_GT(sample_rate_hz, 0); - frame_size_ = rtc::CheckedDivExact(sample_rate_hz, kNumFramesPerSecond); - int status = - resampler_.InitializeIfNeeded(sample_rate_hz, vad_->SampleRateHz(), - /*num_channels=*/1); - constexpr int kStatusOk = 0; - RTC_DCHECK_EQ(status, kStatusOk); - vad_->Reset(); -} - float VoiceActivityDetectorWrapper::Analyze(AudioFrameView frame) { // Periodically reset the VAD. time_to_vad_reset_--; diff --git a/modules/audio_processing/agc2/vad_wrapper.h b/modules/audio_processing/agc2/vad_wrapper.h index 459c471630..8751dbaf75 100644 --- a/modules/audio_processing/agc2/vad_wrapper.h +++ b/modules/audio_processing/agc2/vad_wrapper.h @@ -60,9 +60,6 @@ class VoiceActivityDetectorWrapper { delete; ~VoiceActivityDetectorWrapper(); - // Initializes the VAD wrapper. - void Initialize(int sample_rate_hz); - // Analyzes the first channel of `frame` and returns the speech probability. // `frame` must be a 10 ms frame with the sample rate specified in the last // `Initialize()` call. @@ -70,11 +67,11 @@ class VoiceActivityDetectorWrapper { private: const int vad_reset_period_frames_; - int frame_size_; + const int frame_size_; int time_to_vad_reset_; - PushResampler resampler_; std::unique_ptr vad_; std::vector resampled_buffer_; + PushResampler resampler_; }; } // namespace webrtc diff --git a/modules/audio_processing/agc2/vad_wrapper_unittest.cc b/modules/audio_processing/agc2/vad_wrapper_unittest.cc index 91efdb566e..6f66560060 100644 --- a/modules/audio_processing/agc2/vad_wrapper_unittest.cc +++ b/modules/audio_processing/agc2/vad_wrapper_unittest.cc @@ -50,7 +50,7 @@ class MockVad : public VoiceActivityDetectorWrapper::MonoVad { TEST(GainController2VoiceActivityDetectorWrapper, CtorAndInitReadSampleRate) { auto vad = std::make_unique(); EXPECT_CALL(*vad, SampleRateHz) - .Times(2) + .Times(1) .WillRepeatedly(Return(kSampleRate8kHz)); EXPECT_CALL(*vad, Reset).Times(AnyNumber()); auto vad_wrapper = std::make_unique( diff --git a/modules/audio_processing/audio_buffer.cc b/modules/audio_processing/audio_buffer.cc index 3dbe1fe072..c48d444664 100644 --- a/modules/audio_processing/audio_buffer.cc +++ b/modules/audio_processing/audio_buffer.cc @@ -15,7 +15,6 @@ #include #include "common_audio/channel_buffer.h" -#include "common_audio/include/audio_util.h" #include "common_audio/resampler/push_sinc_resampler.h" #include "modules/audio_processing/splitting_filter.h" #include "rtc_base/checks.h" @@ -25,7 +24,6 @@ namespace { constexpr size_t kSamplesPer32kHzChannel = 320; constexpr size_t kSamplesPer48kHzChannel = 480; -constexpr size_t kMaxSamplesPerChannel = AudioBuffer::kMaxSampleRate / 100; size_t NumBandsFromFramesPerChannel(size_t num_frames) { if (num_frames == kSamplesPer32kHzChannel) { @@ -110,9 +108,9 @@ void AudioBuffer::CopyFrom(const float* const* stacked_data, const bool resampling_needed = input_num_frames_ != buffer_num_frames_; if (downmix_needed) { - RTC_DCHECK_GE(kMaxSamplesPerChannel, input_num_frames_); + RTC_DCHECK_GE(kMaxSamplesPerChannel10ms, input_num_frames_); - std::array downmix; + std::array downmix; if (downmix_by_averaging_) { const float kOneByNumChannels = 1.f / input_num_channels_; for (size_t i = 0; i < input_num_frames_; ++i) { @@ -230,7 +228,7 @@ void AudioBuffer::CopyFrom(const int16_t* const interleaved_data, if (num_channels_ == 1) { if (input_num_channels_ == 1) { if (resampling_required) { - std::array float_buffer; + std::array float_buffer; S16ToFloatS16(interleaved, input_num_frames_, float_buffer.data()); input_resamplers_[0]->Resample(float_buffer.data(), input_num_frames_, data_->channels()[0], @@ -239,7 +237,7 @@ void AudioBuffer::CopyFrom(const int16_t* const interleaved_data, S16ToFloatS16(interleaved, input_num_frames_, data_->channels()[0]); } } else { - std::array float_buffer; + std::array float_buffer; float* downmixed_data = resampling_required ? float_buffer.data() : data_->channels()[0]; if (downmix_by_averaging_) { @@ -274,7 +272,7 @@ void AudioBuffer::CopyFrom(const int16_t* const interleaved_data, }; if (resampling_required) { - std::array float_buffer; + std::array float_buffer; for (size_t i = 0; i < num_channels_; ++i) { deinterleave_channel(i, num_channels_, input_num_frames_, interleaved, float_buffer.data()); @@ -302,7 +300,7 @@ void AudioBuffer::CopyTo(const StreamConfig& stream_config, int16_t* interleaved = interleaved_data; if (num_channels_ == 1) { - std::array float_buffer; + std::array float_buffer; if (resampling_required) { output_resamplers_[0]->Resample(data_->channels()[0], buffer_num_frames_, @@ -335,7 +333,7 @@ void AudioBuffer::CopyTo(const StreamConfig& stream_config, if (resampling_required) { for (size_t i = 0; i < num_channels_; ++i) { - std::array float_buffer; + std::array float_buffer; output_resamplers_[i]->Resample(data_->channels()[i], buffer_num_frames_, float_buffer.data(), output_num_frames_); diff --git a/modules/audio_processing/audio_buffer.h b/modules/audio_processing/audio_buffer.h index fd69f74ed1..3fdc06419f 100644 --- a/modules/audio_processing/audio_buffer.h +++ b/modules/audio_processing/audio_buffer.h @@ -19,6 +19,7 @@ #include "api/audio/audio_processing.h" #include "common_audio/channel_buffer.h" +#include "common_audio/include/audio_util.h" namespace webrtc { @@ -32,7 +33,8 @@ enum Band { kBand0To8kHz = 0, kBand8To16kHz = 1, kBand16To24kHz = 2 }; class AudioBuffer { public: static const int kSplitBandSize = 160; - static const int kMaxSampleRate = 384000; + // TODO(tommi): Remove this (`AudioBuffer::kMaxSampleRate`) constant. + static const int kMaxSampleRate = webrtc::kMaxSampleRateHz; AudioBuffer(size_t input_rate, size_t input_num_channels, size_t buffer_rate, diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc index e98316f44f..2d0b327843 100644 --- a/modules/audio_processing/audio_processing_impl.cc +++ b/modules/audio_processing/audio_processing_impl.cc @@ -2157,17 +2157,11 @@ void AudioProcessingImpl::InitializeVoiceActivityDetector() { return; } - if (!submodules_.voice_activity_detector) { - RTC_DCHECK(!!submodules_.gain_controller2); - // TODO(bugs.webrtc.org/13663): Cache CPU features in APM and use here. - submodules_.voice_activity_detector = - std::make_unique( - submodules_.gain_controller2->GetCpuFeatures(), - proc_fullband_sample_rate_hz()); - } else { - submodules_.voice_activity_detector->Initialize( - proc_fullband_sample_rate_hz()); - } + // TODO(bugs.webrtc.org/13663): Cache CPU features in APM and use here. + submodules_.voice_activity_detector = + std::make_unique( + submodules_.gain_controller2->GetCpuFeatures(), + proc_fullband_sample_rate_hz()); } void AudioProcessingImpl::InitializeNoiseSuppressor() { diff --git a/modules/audio_processing/audio_processing_unittest.cc b/modules/audio_processing/audio_processing_unittest.cc index adb8891dfa..9dd4d376cd 100644 --- a/modules/audio_processing/audio_processing_unittest.cc +++ b/modules/audio_processing/audio_processing_unittest.cc @@ -2173,8 +2173,8 @@ TEST_P(AudioProcessingTest, Formats) { // don't match. std::unique_ptr cmp_data(new float[ref_length]); - PushResampler resampler; - resampler.InitializeIfNeeded(out_rate, ref_rate, out_num); + PushResampler resampler(out_samples_per_channel, + ref_samples_per_channel, out_num); // Compute the resampling delay of the output relative to the reference, // to find the region over which we should search for the best SNR.