APM-QA annotations: incorrect type bugfix and level estimation with 1 ms frames.

TBR=

Bug: webrtc:7494
Change-Id: I2d4432d5b135e70b9abb5f2794a28228ec6808ba
Reviewed-on: https://webrtc-review.googlesource.com/13621
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#20346}
This commit is contained in:
Alessio Bazzica 2017-10-19 11:10:52 +02:00 committed by Commit Bot
parent a5fbc23379
commit a8c08b1063
2 changed files with 43 additions and 23 deletions

View File

@ -31,11 +31,13 @@ class AudioAnnotationsExtractor(object):
_VAD_FILENAME = 'vad.npy'
_SPEECH_LEVEL_FILENAME = 'speech_level.npy'
# Level estimation params. The time constants in ms indicate the time it takes
# for the level estimate to go down/up by 1 dB if the signal is zero.
# Level estimation params.
_ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0)
_LEVEL_FRAME_SIZE_MS = 1.0
# The time constants in ms indicate the time it takes for the level estimate
# to go down/up by 1 dB if the signal is zero.
_LEVEL_ATTACK_MS = 5.0
_LEVEL_DECAY_MS = 20.0
_ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0)
# VAD params.
_VAD_THRESHOLD = 1
@ -45,6 +47,7 @@ class AudioAnnotationsExtractor(object):
self._level = None
self._vad = None
self._speech_level = None
self._level_frame_size = None
self._c_attack = None
self._c_decay = None
@ -75,12 +78,15 @@ class AudioAnnotationsExtractor(object):
if self._signal.channels != 1:
raise NotImplementedError('multiple-channel annotations not implemented')
# Smoothing params.
sample_duration_ms = 1000.0 / self._signal.frame_rate
self._c_attack = 0 if self._LEVEL_ATTACK_MS == 0 else (
self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_ATTACK_MS))
self._c_decay = 0 if self._LEVEL_DECAY_MS == 0 else (
self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_DECAY_MS))
# Level estimation params.
self._level_frame_size = int(self._signal.frame_rate / 1000 * (
self._LEVEL_FRAME_SIZE_MS))
self._c_attack = 0.0 if self._LEVEL_ATTACK_MS == 0 else (
self._ONE_DB_REDUCTION ** (
self._LEVEL_FRAME_SIZE_MS / self._LEVEL_ATTACK_MS))
self._c_decay = 0.0 if self._LEVEL_DECAY_MS == 0 else (
self._ONE_DB_REDUCTION ** (
self._LEVEL_FRAME_SIZE_MS / self._LEVEL_DECAY_MS))
# Compute level.
self._LevelEstimation()
@ -95,6 +101,11 @@ class AudioAnnotationsExtractor(object):
# Speech level based on VAD output.
self._speech_level = self._level * self._vad
# Expand to one value per sample.
self._level = np.repeat(self._level, self._level_frame_size)
self._vad = np.repeat(self._vad, self._level_frame_size)
self._speech_level = np.repeat(self._speech_level, self._level_frame_size)
def Save(self, output_path):
np.save(os.path.join(output_path, self._LEVEL_FILENAME), self._level)
np.save(os.path.join(output_path, self._VAD_FILENAME), self._vad)
@ -104,16 +115,21 @@ class AudioAnnotationsExtractor(object):
def _LevelEstimation(self):
# Read samples.
samples = signal_processing.SignalProcessingUtils.AudioSegmentToRawData(
self._signal)
num_samples = len(samples)
self._signal).astype(np.float32) / 32768.0
num_frames = len(samples) // self._level_frame_size
num_samples = num_frames * self._level_frame_size
# Envelope.
self._level = np.abs(samples)
self._level = np.max(np.reshape(np.abs(samples[:num_samples]), (
num_frames, self._level_frame_size)), axis=1)
assert len(self._level) == num_frames
# Envelope smoothing.
smooth = lambda curr, prev, k: (1 - k) * curr + k * prev
self._level[0] = smooth(self._level[0], 0.0, self._c_attack)
for i in range(1, num_samples):
for i in range(1, num_frames):
self._level[i] = smooth(
self._level[i], self._level[i - 1], self._c_attack if (
self._level[i] > self._level[i - 1]) else self._c_decay)
return self._level

View File

@ -26,7 +26,7 @@ class TestAnnotationsExtraction(unittest.TestCase):
"""Unit tests for the annotations module.
"""
_CLEAN_TMP_OUTPUT = False
_CLEAN_TMP_OUTPUT = True
def setUp(self):
"""Create temporary folder."""
@ -56,12 +56,16 @@ class TestAnnotationsExtraction(unittest.TestCase):
e = annotations.AudioAnnotationsExtractor()
e.Extract(self._wav_file_path)
e.Save(self._tmp_path)
np.testing.assert_array_equal(
e.GetLevel(),
np.load(os.path.join(self._tmp_path, e.GetLevelFileName())))
np.testing.assert_array_equal(
e.GetVad(),
np.load(os.path.join(self._tmp_path, e.GetVadFileName())))
np.testing.assert_array_equal(
e.GetSpeechLevel(),
np.load(os.path.join(self._tmp_path, e.GetSpeechLevelFileName())))
level = np.load(os.path.join(self._tmp_path, e.GetLevelFileName()))
np.testing.assert_array_equal(e.GetLevel(), level)
self.assertEqual(np.float32, level.dtype)
vad = np.load(os.path.join(self._tmp_path, e.GetVadFileName()))
np.testing.assert_array_equal(e.GetVad(), vad)
self.assertEqual(np.uint8, vad.dtype)
speech_level = np.load(os.path.join(
self._tmp_path, e.GetSpeechLevelFileName()))
np.testing.assert_array_equal(e.GetSpeechLevel(), speech_level)
self.assertEqual(np.float32, speech_level.dtype)