From a8c08b10635cbfae48b876bd513c8578b646a0dc Mon Sep 17 00:00:00 2001 From: Alessio Bazzica Date: Thu, 19 Oct 2017 11:10:52 +0200 Subject: [PATCH] APM-QA annotations: incorrect type bugfix and level estimation with 1 ms frames. TBR= Bug: webrtc:7494 Change-Id: I2d4432d5b135e70b9abb5f2794a28228ec6808ba Reviewed-on: https://webrtc-review.googlesource.com/13621 Reviewed-by: Alessio Bazzica Commit-Queue: Alessio Bazzica Cr-Commit-Position: refs/heads/master@{#20346} --- .../quality_assessment/annotations.py | 42 +++++++++++++------ .../annotations_unittest.py | 24 ++++++----- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py index 81f6af435e..55b3388d40 100644 --- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py +++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py @@ -31,11 +31,13 @@ class AudioAnnotationsExtractor(object): _VAD_FILENAME = 'vad.npy' _SPEECH_LEVEL_FILENAME = 'speech_level.npy' - # Level estimation params. The time constants in ms indicate the time it takes - # for the level estimate to go down/up by 1 db if the signal is zero. + # Level estimation params. + _ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0) + _LEVEL_FRAME_SIZE_MS = 1.0 + # The time constants in ms indicate the time it takes for the level estimate + # to go down/up by 1 db if the signal is zero. _LEVEL_ATTACK_MS = 5.0 _LEVEL_DECAY_MS = 20.0 - _ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0) # VAD params. _VAD_THRESHOLD = 1 @@ -45,6 +47,7 @@ class AudioAnnotationsExtractor(object): self._level = None self._vad = None self._speech_level = None + self._level_frame_size = None self._c_attack = None self._c_decay = None @@ -75,12 +78,15 @@ class AudioAnnotationsExtractor(object): if self._signal.channels != 1: raise NotImplementedError('multiple-channel annotations not implemented') - # Smoothing params. - sample_duration_ms = 1000.0 / self._signal.frame_rate - self._c_attack = 0 if self._LEVEL_ATTACK_MS == 0 else ( - self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_ATTACK_MS)) - self._c_decay = 0 if self._LEVEL_DECAY_MS == 0 else ( - self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_DECAY_MS)) + # level estimation params. + self._level_frame_size = int(self._signal.frame_rate / 1000 * ( + self._LEVEL_FRAME_SIZE_MS)) + self._c_attack = 0.0 if self._LEVEL_ATTACK_MS == 0 else ( + self._ONE_DB_REDUCTION ** ( + self._LEVEL_FRAME_SIZE_MS / self._LEVEL_ATTACK_MS)) + self._c_decay = 0.0 if self._LEVEL_DECAY_MS == 0 else ( + self._ONE_DB_REDUCTION ** ( + self._LEVEL_FRAME_SIZE_MS / self._LEVEL_DECAY_MS)) # Compute level. self._LevelEstimation() @@ -95,6 +101,11 @@ class AudioAnnotationsExtractor(object): # Speech level based on VAD output. self._speech_level = self._level * self._vad + # Expand to one value per sample. + self._level = np.repeat(self._level, self._level_frame_size) + self._vad = np.repeat(self._vad, self._level_frame_size) + self._speech_level = np.repeat(self._speech_level, self._level_frame_size) + def Save(self, output_path): np.save(os.path.join(output_path, self._LEVEL_FILENAME), self._level) np.save(os.path.join(output_path, self._VAD_FILENAME), self._vad) @@ -104,16 +115,21 @@ class AudioAnnotationsExtractor(object): def _LevelEstimation(self): # Read samples. samples = signal_processing.SignalProcessingUtils.AudioSegmentToRawData( - self._signal) - num_samples = len(samples) + self._signal).astype(np.float32) / 32768.0 + num_frames = len(samples) // self._level_frame_size + num_samples = num_frames * self._level_frame_size # Envelope. - self._level = np.abs(samples) + self._level = np.max(np.reshape(np.abs(samples[:num_samples]), ( + num_frames, self._level_frame_size)), axis=1) + assert len(self._level) == num_frames # Envelope smoothing. smooth = lambda curr, prev, k: (1 - k) * curr + k * prev self._level[0] = smooth(self._level[0], 0.0, self._c_attack) - for i in range(1, num_samples): + for i in range(1, num_frames): self._level[i] = smooth( self._level[i], self._level[i - 1], self._c_attack if ( self._level[i] > self._level[i - 1]) else self._c_decay) + + return self._level diff --git a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py index b59397cc09..bac3d2174e 100644 --- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py +++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py @@ -26,7 +26,7 @@ class TestAnnotationsExtraction(unittest.TestCase): """Unit tests for the annotations module. """ - _CLEAN_TMP_OUTPUT = False + _CLEAN_TMP_OUTPUT = True def setUp(self): """Create temporary folder.""" @@ -56,12 +56,16 @@ class TestAnnotationsExtraction(unittest.TestCase): e = annotations.AudioAnnotationsExtractor() e.Extract(self._wav_file_path) e.Save(self._tmp_path) - np.testing.assert_array_equal( - e.GetLevel(), - np.load(os.path.join(self._tmp_path, e.GetLevelFileName()))) - np.testing.assert_array_equal( - e.GetVad(), - np.load(os.path.join(self._tmp_path, e.GetVadFileName()))) - np.testing.assert_array_equal( - e.GetSpeechLevel(), - np.load(os.path.join(self._tmp_path, e.GetSpeechLevelFileName()))) + + level = np.load(os.path.join(self._tmp_path, e.GetLevelFileName())) + np.testing.assert_array_equal(e.GetLevel(), level) + self.assertEqual(np.float32, level.dtype) + + vad = np.load(os.path.join(self._tmp_path, e.GetVadFileName())) + np.testing.assert_array_equal(e.GetVad(), vad) + self.assertEqual(np.uint8, vad.dtype) + + speech_level = np.load(os.path.join( + self._tmp_path, e.GetSpeechLevelFileName())) + np.testing.assert_array_equal(e.GetSpeechLevel(), speech_level) + self.assertEqual(np.float32, speech_level.dtype)