APM-QA annotations: incorrect type bugfix and level estimation with 1 ms frames.

TBR= Bug: webrtc:7494 Change-Id: I2d4432d5b135e70b9abb5f2794a28228ec6808ba Reviewed-on: https://webrtc-review.googlesource.com/13621 Reviewed-by: Alessio Bazzica <alessiob@webrtc.org> Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> Cr-Commit-Position: refs/heads/master@{#20346}
2017-10-19 11:10:52 +02:00 · 2017-10-19 11:10:52 +02:00 · a8c08b1063
commit a8c08b1063
parent a5fbc23379
2 changed files with 43 additions and 23 deletions
--- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py
+++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py
@ -31,11 +31,13 @@ class AudioAnnotationsExtractor(object):
  _VAD_FILENAME = 'vad.npy'
  _SPEECH_LEVEL_FILENAME = 'speech_level.npy'

-  # Level estimation params. The time constants in ms indicate the time it takes
-  # for the level estimate to go down/up by 1 db if the signal is zero.
+  # Level estimation params.
+  _ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0)
+  _LEVEL_FRAME_SIZE_MS = 1.0
+  # The time constants in ms indicate the time it takes for the level estimate
+  # to go down/up by 1 dB if the signal is zero.
  _LEVEL_ATTACK_MS = 5.0
  _LEVEL_DECAY_MS = 20.0
-  _ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0)

  # VAD params.
  _VAD_THRESHOLD = 1
@ -45,6 +47,7 @@ class AudioAnnotationsExtractor(object):
    self._level = None
    self._vad = None
    self._speech_level = None
+    self._level_frame_size = None
    self._c_attack = None
    self._c_decay = None

@ -75,12 +78,15 @@ class AudioAnnotationsExtractor(object):
    if self._signal.channels != 1:
      raise NotImplementedError('multiple-channel annotations not implemented')

-    # Smoothing params.
-    sample_duration_ms = 1000.0 / self._signal.frame_rate
-    self._c_attack = 0 if self._LEVEL_ATTACK_MS == 0 else (
-        self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_ATTACK_MS))
-    self._c_decay = 0 if self._LEVEL_DECAY_MS == 0 else (
-        self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_DECAY_MS))
+    # Level estimation params.
+    self._level_frame_size = int(self._signal.frame_rate / 1000 * (
+        self._LEVEL_FRAME_SIZE_MS))
+    self._c_attack = 0.0 if self._LEVEL_ATTACK_MS == 0 else (
+        self._ONE_DB_REDUCTION ** (
+            self._LEVEL_FRAME_SIZE_MS / self._LEVEL_ATTACK_MS))
+    self._c_decay = 0.0 if self._LEVEL_DECAY_MS == 0 else (
+        self._ONE_DB_REDUCTION ** (
+            self._LEVEL_FRAME_SIZE_MS / self._LEVEL_DECAY_MS))

    # Compute level.
    self._LevelEstimation()
@ -95,6 +101,11 @@ class AudioAnnotationsExtractor(object):
    # Speech level based on VAD output.
    self._speech_level = self._level * self._vad

+    # Expand to one value per sample.
+    self._level = np.repeat(self._level, self._level_frame_size)
+    self._vad = np.repeat(self._vad, self._level_frame_size)
+    self._speech_level = np.repeat(self._speech_level, self._level_frame_size)
+
  def Save(self, output_path):
    np.save(os.path.join(output_path, self._LEVEL_FILENAME), self._level)
    np.save(os.path.join(output_path, self._VAD_FILENAME), self._vad)
@ -104,16 +115,21 @@ class AudioAnnotationsExtractor(object):
  def _LevelEstimation(self):
    # Read samples.
    samples = signal_processing.SignalProcessingUtils.AudioSegmentToRawData(
-        self._signal)
-    num_samples = len(samples)
+        self._signal).astype(np.float32) / 32768.0
+    num_frames = len(samples) // self._level_frame_size
+    num_samples = num_frames * self._level_frame_size

    # Envelope.
-    self._level = np.abs(samples)
+    self._level = np.max(np.reshape(np.abs(samples[:num_samples]), (
+        num_frames, self._level_frame_size)), axis=1)
+    assert len(self._level) == num_frames

    # Envelope smoothing.
    smooth = lambda curr, prev, k: (1 - k) * curr  + k * prev
    self._level[0] = smooth(self._level[0], 0.0, self._c_attack)
-    for i in range(1, num_samples):
+    for i in range(1, num_frames):
      self._level[i] = smooth(
          self._level[i], self._level[i - 1], self._c_attack if (
              self._level[i] > self._level[i - 1]) else self._c_decay)
+
+    return self._level
--- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py
+++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py
@ -26,7 +26,7 @@ class TestAnnotationsExtraction(unittest.TestCase):
  """Unit tests for the annotations module.
  """

-  _CLEAN_TMP_OUTPUT = False
+  _CLEAN_TMP_OUTPUT = True

  def setUp(self):
    """Create temporary folder."""
@ -56,12 +56,16 @@ class TestAnnotationsExtraction(unittest.TestCase):
    e = annotations.AudioAnnotationsExtractor()
    e.Extract(self._wav_file_path)
    e.Save(self._tmp_path)
-    np.testing.assert_array_equal(
-        e.GetLevel(),
-        np.load(os.path.join(self._tmp_path, e.GetLevelFileName())))
-    np.testing.assert_array_equal(
-        e.GetVad(),
-        np.load(os.path.join(self._tmp_path, e.GetVadFileName())))
-    np.testing.assert_array_equal(
-        e.GetSpeechLevel(),
-        np.load(os.path.join(self._tmp_path, e.GetSpeechLevelFileName())))
+
+    level = np.load(os.path.join(self._tmp_path, e.GetLevelFileName()))
+    np.testing.assert_array_equal(e.GetLevel(), level)
+    self.assertEqual(np.float32, level.dtype)
+
+    vad = np.load(os.path.join(self._tmp_path, e.GetVadFileName()))
+    np.testing.assert_array_equal(e.GetVad(), vad)
+    self.assertEqual(np.uint8, vad.dtype)
+
+    speech_level = np.load(os.path.join(
+        self._tmp_path, e.GetSpeechLevelFileName()))
+    np.testing.assert_array_equal(e.GetSpeechLevel(), speech_level)
+    self.assertEqual(np.float32, speech_level.dtype)