Convert IntelligibilityEnhancer to multi-threaded mode
BUG=581029
R=henrik.lundin@webrtc.org, peah@webrtc.org, turaj@webrtc.org

Review URL: https://codereview.webrtc.org/1766383002 .

Cr-Commit-Position: refs/heads/master@{#11929}
This commit is contained in:
parent
c1e55c7136
commit
57ae82929a
@ -920,10 +920,6 @@ int AudioProcessingImpl::ProcessReverseStreamLocked() {
|
||||
}
|
||||
|
||||
if (constants_.intelligibility_enabled) {
|
||||
// Currently run in single-threaded mode when the intelligibility
|
||||
// enhancer is activated.
|
||||
// TODO(peah): Fix to be properly multi-threaded.
|
||||
rtc::CritScope cs(&crit_capture_);
|
||||
public_submodules_->intelligibility_enhancer->ProcessRenderAudio(
|
||||
ra->split_channels_f(kBand0To8kHz), capture_nonlocked_.split_rate,
|
||||
ra->num_channels());
|
||||
@ -1235,7 +1231,8 @@ void AudioProcessingImpl::InitializeIntelligibility() {
|
||||
if (constants_.intelligibility_enabled) {
|
||||
public_submodules_->intelligibility_enhancer.reset(
|
||||
new IntelligibilityEnhancer(capture_nonlocked_.split_rate,
|
||||
render_.render_audio->num_channels()));
|
||||
render_.render_audio->num_channels(),
|
||||
NoiseSuppressionImpl::num_noise_bins()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -61,25 +61,31 @@ void MapToErbBands(const float* pow,
|
||||
} // namespace
|
||||
|
||||
IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
|
||||
size_t num_render_channels)
|
||||
size_t num_render_channels,
|
||||
size_t num_noise_bins)
|
||||
: freqs_(RealFourier::ComplexLength(
|
||||
RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),
|
||||
num_noise_bins_(num_noise_bins),
|
||||
chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),
|
||||
bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),
|
||||
sample_rate_hz_(sample_rate_hz),
|
||||
num_render_channels_(num_render_channels),
|
||||
clear_power_estimator_(freqs_, kDecayRate),
|
||||
noise_power_estimator_(
|
||||
new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),
|
||||
noise_power_estimator_(num_noise_bins, kDecayRate),
|
||||
filtered_clear_pow_(bank_size_, 0.f),
|
||||
filtered_noise_pow_(bank_size_, 0.f),
|
||||
filtered_noise_pow_(num_noise_bins, 0.f),
|
||||
center_freqs_(bank_size_),
|
||||
capture_filter_bank_(CreateErbBank(num_noise_bins)),
|
||||
render_filter_bank_(CreateErbBank(freqs_)),
|
||||
gains_eq_(bank_size_),
|
||||
gain_applier_(freqs_, kMaxRelativeGainChange),
|
||||
audio_s16_(chunk_length_),
|
||||
chunks_since_voice_(kSpeechOffsetDelay),
|
||||
is_speech_(false) {
|
||||
is_speech_(false),
|
||||
noise_estimation_buffer_(num_noise_bins),
|
||||
noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer,
|
||||
std::vector<float>(num_noise_bins),
|
||||
RenderQueueItemVerifier<float>(num_noise_bins)) {
|
||||
RTC_DCHECK_LE(kRho, 1.f);
|
||||
|
||||
const size_t erb_index = static_cast<size_t>(
|
||||
@ -98,13 +104,11 @@ IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
|
||||
|
||||
void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
|
||||
std::vector<float> noise) {
|
||||
if (capture_filter_bank_.size() != bank_size_ ||
|
||||
capture_filter_bank_[0].size() != noise.size()) {
|
||||
capture_filter_bank_ = CreateErbBank(noise.size());
|
||||
noise_power_estimator_.reset(
|
||||
new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));
|
||||
}
|
||||
noise_power_estimator_->Step(noise.data());
|
||||
RTC_DCHECK_EQ(noise.size(), num_noise_bins_);
|
||||
// Disregarding return value since buffer overflow is acceptable, because it
|
||||
// is not critical to get each noise estimate.
|
||||
if (noise_estimation_queue_.Insert(&noise)) {
|
||||
};
|
||||
}
|
||||
|
||||
void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
|
||||
@ -112,6 +116,9 @@ void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
|
||||
size_t num_channels) {
|
||||
RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);
|
||||
RTC_CHECK_EQ(num_render_channels_, num_channels);
|
||||
while (noise_estimation_queue_.Remove(&noise_estimation_buffer_)) {
|
||||
noise_power_estimator_.Step(noise_estimation_buffer_.data());
|
||||
}
|
||||
is_speech_ = IsSpeech(audio[0]);
|
||||
render_mangler_->ProcessChunk(audio, audio);
|
||||
}
|
||||
@ -127,7 +134,7 @@ void IntelligibilityEnhancer::ProcessAudioBlock(
|
||||
clear_power_estimator_.Step(in_block[0]);
|
||||
}
|
||||
const std::vector<float>& clear_power = clear_power_estimator_.power();
|
||||
const std::vector<float>& noise_power = noise_power_estimator_->power();
|
||||
const std::vector<float>& noise_power = noise_power_estimator_.power();
|
||||
MapToErbBands(clear_power.data(), render_filter_bank_,
|
||||
filtered_clear_pow_.data());
|
||||
MapToErbBands(noise_power.data(), capture_filter_bank_,
|
||||
|
||||
@ -17,7 +17,9 @@
|
||||
|
||||
#include "webrtc/common_audio/lapped_transform.h"
|
||||
#include "webrtc/common_audio/channel_buffer.h"
|
||||
#include "webrtc/common_audio/swap_queue.h"
|
||||
#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"
|
||||
#include "webrtc/modules/audio_processing/processing_component.h"
|
||||
#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
|
||||
|
||||
namespace webrtc {
|
||||
@ -29,7 +31,9 @@ namespace webrtc {
|
||||
// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788
|
||||
class IntelligibilityEnhancer : public LappedTransform::Callback {
|
||||
public:
|
||||
IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels);
|
||||
IntelligibilityEnhancer(int sample_rate_hz,
|
||||
size_t num_render_channels,
|
||||
size_t num_noise_bins);
|
||||
|
||||
// Sets the capture noise magnitude spectrum estimate.
|
||||
void SetCaptureNoiseEstimate(std::vector<float> noise);
|
||||
@ -72,15 +76,17 @@ class IntelligibilityEnhancer : public LappedTransform::Callback {
|
||||
// Returns true if the audio is speech.
|
||||
bool IsSpeech(const float* audio);
|
||||
|
||||
static const size_t kMaxNumNoiseEstimatesToBuffer = 5;
|
||||
|
||||
const size_t freqs_; // Num frequencies in frequency domain.
|
||||
const size_t num_noise_bins_;
|
||||
const size_t chunk_length_; // Chunk size in samples.
|
||||
const size_t bank_size_; // Num ERB filters.
|
||||
const int sample_rate_hz_;
|
||||
const size_t num_render_channels_;
|
||||
|
||||
intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;
|
||||
std::unique_ptr<intelligibility::PowerEstimator<float>>
|
||||
noise_power_estimator_;
|
||||
intelligibility::PowerEstimator<float> noise_power_estimator_;
|
||||
std::vector<float> filtered_clear_pow_;
|
||||
std::vector<float> filtered_noise_pow_;
|
||||
std::vector<float> center_freqs_;
|
||||
@ -97,6 +103,10 @@ class IntelligibilityEnhancer : public LappedTransform::Callback {
|
||||
std::vector<int16_t> audio_s16_;
|
||||
size_t chunks_since_voice_;
|
||||
bool is_speech_;
|
||||
|
||||
std::vector<float> noise_estimation_buffer_;
|
||||
SwapQueue<std::vector<float>, RenderQueueItemVerifier<float>>
|
||||
noise_estimation_queue_;
|
||||
};
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
@ -201,6 +201,7 @@ const int kSamples = 1000;
|
||||
const int kSampleRate = 4000;
|
||||
const int kNumChannels = 1;
|
||||
const int kFragmentSize = kSampleRate / 100;
|
||||
const size_t kNumNoiseBins = 129;
|
||||
|
||||
} // namespace
|
||||
|
||||
@ -208,11 +209,13 @@ class IntelligibilityEnhancerTest : public ::testing::Test {
|
||||
protected:
|
||||
IntelligibilityEnhancerTest()
|
||||
: clear_data_(kSamples), noise_data_(kSamples), orig_data_(kSamples) {
|
||||
enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
|
||||
enh_.reset(
|
||||
new IntelligibilityEnhancer(kSampleRate, kNumChannels, kNumNoiseBins));
|
||||
}
|
||||
|
||||
bool CheckUpdate() {
|
||||
enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
|
||||
enh_.reset(
|
||||
new IntelligibilityEnhancer(kSampleRate, kNumChannels, kNumNoiseBins));
|
||||
float* clear_cursor = clear_data_.data();
|
||||
float* noise_cursor = noise_data_.data();
|
||||
for (int i = 0; i < kSamples; i += kFragmentSize) {
|
||||
|
||||
@ -37,9 +37,10 @@ void void_main(int argc, char* argv[]) {
|
||||
WavReader noise_file(FLAGS_noise_file);
|
||||
WavWriter out_file(FLAGS_out_file, in_file.sample_rate(),
|
||||
in_file.num_channels());
|
||||
IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels());
|
||||
rtc::CriticalSection crit;
|
||||
NoiseSuppressionImpl ns(&crit);
|
||||
IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels(),
|
||||
NoiseSuppressionImpl::num_noise_bins());
|
||||
ns.Initialize(noise_file.num_channels(), noise_file.sample_rate());
|
||||
ns.Enable(true);
|
||||
const size_t in_samples = noise_file.sample_rate() / 100;
|
||||
|
||||
@ -200,4 +200,12 @@ std::vector<float> NoiseSuppressionImpl::NoiseEstimate() {
|
||||
return noise_estimate;
|
||||
}
|
||||
|
||||
// Returns the value reported by the noise-suppression backend selected at
// build time: WebRtcNs_num_freq() when WEBRTC_NS_FLOAT is defined, otherwise
// WebRtcNsx_num_freq() when WEBRTC_NS_FIXED is defined. Callers in this change
// pass the result as the num_noise_bins argument of IntelligibilityEnhancer.
// NOTE(review): there is no #else branch, so if neither macro is defined the
// function ends without returning a value -- confirm the build always defines
// exactly one of them.
size_t NoiseSuppressionImpl::num_noise_bins() {
|
||||
#if defined(WEBRTC_NS_FLOAT)
|
||||
return WebRtcNs_num_freq();
|
||||
#elif defined(WEBRTC_NS_FIXED)
|
||||
return WebRtcNsx_num_freq();
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
@ -39,6 +39,7 @@ class NoiseSuppressionImpl : public NoiseSuppression {
|
||||
Level level() const override;
|
||||
float speech_probability() const override;
|
||||
std::vector<float> NoiseEstimate() override;
|
||||
static size_t num_noise_bins();
|
||||
|
||||
private:
|
||||
class Suppressor;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user