InputVolumeController: Replace speech level target and max digital gain

Replace the use of speech level target and digital gain maximum with speech level target range parameters. Bug: webrtc:7494 Change-Id: I703756c5a3fbd330ed585e3f5b4ac3141d9ea6e2 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/280943 Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> Reviewed-by: Alessio Bazzica <alessiob@webrtc.org> Cr-Commit-Position: refs/heads/main@{#38563}
2022-11-07 10:51:21 +01:00 · 2022-11-07 10:51:21 +01:00 · 8a8de9be3b
commit 8a8de9be3b
parent adc5dfe515
3 changed files with 77 additions and 54 deletions
--- a/modules/audio_processing/agc2/input_volume_controller.cc
+++ b/modules/audio_processing/agc2/input_volume_controller.cc
@ -41,7 +41,6 @@ constexpr int kMaxResidualGainChange = 15;
 // the RMS error in `GetSpeechLevelErrorDb()`.
 // TODO(webrtc:7494): Move these to a config and pass in the ctor with
 // kUpdateInputVolumeWaitFrames = 100.
-constexpr float kTargetSpeechLevelDbfs = -18.0f;
 constexpr float kSpeechProbabilitySilenceThreshold = 0.5f;
 constexpr int kUpdateInputVolumeWaitFrames = 0;

@ -140,10 +139,15 @@ void LogClippingMetrics(int clipping_rate) {
                              /*bucket_count=*/50);
 }

-// Computes the speech level error in dB. `speech_level_dbfs` is required to be
-// in the range [-90.0f, 30.0f] and `speech_probability` in the range
-// [0.0f, 1.0f].
-int GetSpeechLevelErrorDb(float speech_level_dbfs, float speech_probability) {
+// Computes the speech level error in dB. The value of `speech_level_dbfs` is
+// required to be in the range [-90.0f, 30.0f] and `speech_probability` in the
+// range [0.0f, 1.0f]. Returns a positive value when the speech level is below
+// the target range and a negative value when the speech level is above the
+// target range.
+int GetSpeechLevelErrorDb(float speech_level_dbfs,
+                          float speech_probability,
+                          int target_range_min_dbfs,
+                          int target_range_max_dbfs) {
  constexpr float kMinSpeechLevelDbfs = -90.0f;
  constexpr float kMaxSpeechLevelDbfs = 30.0f;
  RTC_DCHECK_GE(speech_level_dbfs, kMinSpeechLevelDbfs);
@ -151,24 +155,33 @@ int GetSpeechLevelErrorDb(float speech_level_dbfs, float speech_probability) {
  RTC_DCHECK_GE(speech_probability, 0.0f);
  RTC_DCHECK_LE(speech_probability, 1.0f);

+  // TODO(webrtc:7494): Replace with the use of `SpeechProbabilityBuffer`.
  if (speech_probability < kSpeechProbabilitySilenceThreshold) {
    return 0;
  }

-  const float speech_level = rtc::SafeClamp<float>(
+  // Ensure the speech level is in the range [-90.0f, 30.0f].
+  speech_level_dbfs = rtc::SafeClamp<float>(
      speech_level_dbfs, kMinSpeechLevelDbfs, kMaxSpeechLevelDbfs);

-  return std::round(kTargetSpeechLevelDbfs - speech_level);
+  // Compute the speech level distance to the target range
+  // [`target_range_min_dbfs`, `target_range_max_dbfs`].
+  int rms_error_dbfs = 0;
+  if (speech_level_dbfs > target_range_max_dbfs) {
+    rms_error_dbfs = std::round(target_range_max_dbfs - speech_level_dbfs);
+  } else if (speech_level_dbfs < target_range_min_dbfs) {
+    rms_error_dbfs = std::round(target_range_min_dbfs - speech_level_dbfs);
+  }
+
+  return rms_error_dbfs;
 }

 }  // namespace

 MonoInputVolumeController::MonoInputVolumeController(int startup_min_level,
                                                     int clipped_level_min,
-                                                     int min_mic_level,
-                                                     int max_digital_gain_db)
+                                                     int min_mic_level)
    : min_mic_level_(min_mic_level),
-      max_digital_gain_db_(max_digital_gain_db),
      max_level_(kMaxMicLevel),
      startup_min_level_(ClampLevel(startup_min_level, min_mic_level_)),
      clipped_level_min_(clipped_level_min) {}
@ -183,7 +196,7 @@ void MonoInputVolumeController::Initialize() {
  is_first_frame_ = true;
 }

-void MonoInputVolumeController::Process(absl::optional<int> rms_error) {
+void MonoInputVolumeController::Process(absl::optional<int> rms_error_dbfs) {
  if (check_volume_on_next_process_) {
    check_volume_on_next_process_ = false;
    // We have to wait until the first process call to check the volume,
@ -191,9 +204,9 @@ void MonoInputVolumeController::Process(absl::optional<int> rms_error) {
    CheckVolumeAndReset();
  }

-  if (rms_error.has_value() && !is_first_frame_ &&
+  if (rms_error_dbfs.has_value() && !is_first_frame_ &&
      frames_since_update_gain_ >= kUpdateInputVolumeWaitFrames) {
-    UpdateGain(*rms_error);
+    UpdateInputVolume(*rms_error_dbfs);
  }

  is_first_frame_ = false;
@ -318,27 +331,15 @@ int MonoInputVolumeController::CheckVolumeAndReset() {
  return 0;
 }

-// Distributes the required gain change between the digital compression stage
-// and volume slider. We use the compressor first, providing a slack region
-// around the current slider position to reduce movement.
-//
-// If the slider needs to be moved, we check first if the user has adjusted
-// it, in which case we take no action and cache the updated level.
-void MonoInputVolumeController::UpdateGain(int rms_error_db) {
-  int rms_error = rms_error_db;
-
+void MonoInputVolumeController::UpdateInputVolume(int rms_error_dbfs) {
  // Always reset the counter regardless of whether the gain is changed
  // or not.
  frames_since_update_gain_ = 0;

-  int raw_digital_gain = 0;
-  raw_digital_gain = rtc::SafeClamp(rms_error, 0, max_digital_gain_db_);
+  const int residual_gain = rtc::SafeClamp(
+      rms_error_dbfs, -kMaxResidualGainChange, kMaxResidualGainChange);

-  const int residual_gain =
-      rtc::SafeClamp(rms_error - raw_digital_gain, -kMaxResidualGainChange,
-                     kMaxResidualGainChange);
-
-  RTC_DLOG(LS_INFO) << "[agc] rms_error=" << rms_error
+  RTC_DLOG(LS_INFO) << "[agc] rms_error_dbfs=" << rms_error_dbfs
                    << ", residual_gain=" << residual_gain;

  if (residual_gain == 0) {
@ -370,7 +371,9 @@ InputVolumeController::InputVolumeController(int num_capture_channels,
          CreateClippingPredictorConfig(config.enable_clipping_predictor)
              .use_predicted_step),
      clipping_rate_log_(0.0f),
-      clipping_rate_log_counter_(0) {
+      clipping_rate_log_counter_(0),
+      target_range_max_dbfs_(config.target_range_max_dbfs),
+      target_range_min_dbfs_(config.target_range_min_dbfs) {
  RTC_LOG(LS_INFO) << "[agc] analog controller enabled: "
                   << (analog_controller_enabled_ ? "yes" : "no");
  const int min_mic_level = min_mic_level_override_.value_or(kMinMicLevel);
@ -382,8 +385,7 @@ InputVolumeController::InputVolumeController(int num_capture_channels,

  for (auto& controller : channel_controllers_) {
    controller = std::make_unique<MonoInputVolumeController>(
-        config.startup_min_volume, config.clipped_level_min, min_mic_level,
-        config.max_digital_gain_db);
+        config.startup_min_volume, config.clipped_level_min, min_mic_level);
  }

  RTC_DCHECK(!channel_controllers_.empty());
@ -495,13 +497,15 @@ void InputVolumeController::Process(absl::optional<float> speech_probability,
    return;
  }

-  absl::optional<int> rms_error;
+  absl::optional<int> rms_error_dbfs;
  if (speech_probability.has_value() && speech_level_dbfs.has_value()) {
-    rms_error = GetSpeechLevelErrorDb(*speech_level_dbfs, *speech_probability);
+    rms_error_dbfs =
+        GetSpeechLevelErrorDb(*speech_level_dbfs, *speech_probability,
+                              target_range_min_dbfs_, target_range_max_dbfs_);
  }

  for (auto& controller : channel_controllers_) {
-    controller->Process(rms_error);
+    controller->Process(rms_error_dbfs);
  }

  AggregateChannelLevels();
--- a/modules/audio_processing/agc2/input_volume_controller.h
+++ b/modules/audio_processing/agc2/input_volume_controller.h
@ -54,8 +54,17 @@ class InputVolumeController final {
    int clipped_wait_frames = 300;
    // Enables clipping prediction functionality.
    bool enable_clipping_predictor = false;
-    // Maximum digital gain used before input volume is adjusted.
-    int max_digital_gain_db = 30;
+    // Speech level target range (dBFS). If the speech level is in the range
+    // [`target_range_min_dbfs`, `target_range_max_dbfs`], no input volume
+    // adjustments are done based on the speech level. For speech levels below
+    // and above the range, the targets `target_range_min_dbfs` and
+    // `target_range_max_dbfs` are used, respectively. The example values
+    // `target_range_max_dbfs` -18 and `target_range_min_dbfs` -48 refer to a
+    // configuration where the zero-digital-gain target is -18 dBFS and the
+    // digital gain control is expected to compensate for speech levels that
+    // are up to 30 dB below that target (i.e., down to -48 dBFS).
+    int target_range_max_dbfs = -18;
+    int target_range_min_dbfs = -48;
  };

  // Ctor. `num_capture_channels` specifies the number of channels for the audio
@ -77,15 +86,15 @@ class InputVolumeController final {
  // TODO(bugs.webrtc.org/7494): Add argument for the applied input volume and
  // remove `set_stream_analog_level()`.
  // Analyzes `audio` before `Process()` is called so that the analysis can be
-  // performed before external digital processing operations take place (e.g.,
-  // echo cancellation). The analysis consists of input clipping detection and
+  // performed before digital processing operations take place (e.g., echo
+  // cancellation). The analysis consists of input clipping detection and
  // prediction (if enabled). Must be called after `set_stream_analog_level()`.
  void AnalyzePreProcess(const AudioBuffer& audio_buffer);

-  // Chooses a digital compression gain and the new input volume to recommend.
-  // Must be called after `AnalyzePreProcess()`. `speech_probability`
-  // (range [0.0f, 1.0f]) and `speech_level_dbfs` (range [-90.f, 30.0f]) are
-  // used to compute the RMS error.
+  // Adjusts the recommended input volume upwards/downwards based on
+  // `speech_level_dbfs`. Must be called after `AnalyzePreProcess()`. The value
+  // of `speech_probability` is expected to be in the range [0.0f, 1.0f] and
+  // `speech_level_dbfs` in the range [-90.f, 30.0f].
  void Process(absl::optional<float> speech_probability,
               absl::optional<float> speech_level_dbfs);

@ -179,6 +188,13 @@ class InputVolumeController final {
  const bool use_clipping_predictor_step_;
  float clipping_rate_log_;
  int clipping_rate_log_counter_;
+
+  // Target range minimum and maximum. If the speech level is in the range
+  // [`target_range_min_dbfs`, `target_range_max_dbfs`], no volume adjustments
+  // take place. Instead, the digital gain controller is assumed to adapt to
+  // compensate for the speech level RMS error.
+  const int target_range_max_dbfs_;
+  const int target_range_min_dbfs_;
 };

 // TODO(bugs.webrtc.org/7494): Use applied/recommended input volume naming
@ -187,8 +203,7 @@ class MonoInputVolumeController {
 public:
  MonoInputVolumeController(int startup_min_level,
                            int clipped_level_min,
-                            int min_mic_level,
-                            int max_digital_gain_db);
+                            int min_mic_level);
  ~MonoInputVolumeController();
  MonoInputVolumeController(const MonoInputVolumeController&) = delete;
  MonoInputVolumeController& operator=(const MonoInputVolumeController&) =
@ -205,9 +220,10 @@ class MonoInputVolumeController {
  // `set_stream_analog_level()`.
  void HandleClipping(int clipped_level_step);

-  // Updates the recommended input volume based on the estimated speech level
-  // RMS error. Must be called after `HandleClipping()`.
-  void Process(absl::optional<int> rms_error);
+  // Adjusts the recommended input volume upwards/downwards depending on whether
+  // `rms_error_dbfs` is positive or negative. Must be called after
+  // `HandleClipping()`.
+  void Process(absl::optional<int> rms_error_dbfs);

  // Returns the recommended input volume. Must be called after `Process()`.
  int recommended_analog_level() const { return recommended_input_volume_; }
@ -228,12 +244,14 @@ class MonoInputVolumeController {
  void SetMaxLevel(int level);

  int CheckVolumeAndReset();
-  void UpdateGain(int rms_error_db);
+
+  // Updates the recommended input volume. If the volume slider needs to be
+  // moved, we check first if the user has adjusted it, in which case we take no
+  // action and cache the updated level.
+  void UpdateInputVolume(int rms_error_dbfs);

  const int min_mic_level_;

-  const int max_digital_gain_db_;
-
  int level_ = 0;
  int max_level_;

--- a/modules/audio_processing/agc2/input_volume_controller_unittest.cc
+++ b/modules/audio_processing/agc2/input_volume_controller_unittest.cc
@ -44,7 +44,6 @@ constexpr float kClippedRatioThreshold = 0.1f;
 constexpr int kClippedWaitFrames = 300;
 constexpr float kHighSpeechProbability = 0.7f;
 constexpr float kSpeechLevel = -25.0f;
-constexpr int kMaxDigitalGainDb = 12;

 constexpr float kMinSample = std::numeric_limits<int16_t>::min();
 constexpr float kMaxSample = std::numeric_limits<int16_t>::max();
@ -71,7 +70,8 @@ std::unique_ptr<InputVolumeController> CreateInputVolumeController(
      .clipped_ratio_threshold = clipped_ratio_threshold,
      .clipped_wait_frames = clipped_wait_frames,
      .enable_clipping_predictor = enable_clipping_predictor,
-      .max_digital_gain_db = kMaxDigitalGainDb,
+      .target_range_max_dbfs = -18,
+      .target_range_min_dbfs = -30,
  };

  return std::make_unique<InputVolumeController>(/*num_capture_channels=*/1,
@ -262,7 +262,8 @@ constexpr InputVolumeControllerConfig GetInputVolumeControllerTestConfig() {
      .clipped_ratio_threshold = kClippedRatioThreshold,
      .clipped_wait_frames = kClippedWaitFrames,
      .enable_clipping_predictor = kDefaultClippingPredictorConfig.enabled,
-      .max_digital_gain_db = kMaxDigitalGainDb,
+      .target_range_max_dbfs = -18,
+      .target_range_min_dbfs = -30,
  };
  return config;
 }