APM-QA annotations: incorrect type bugfix and level estimation with 1 ms frames.
TBR=
Bug: webrtc:7494
Change-Id: I2d4432d5b135e70b9abb5f2794a28228ec6808ba
Reviewed-on: https://webrtc-review.googlesource.com/13621
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#20346}
This commit is contained in:
parent
a5fbc23379
commit
a8c08b1063
@ -31,11 +31,13 @@ class AudioAnnotationsExtractor(object):
|
||||
_VAD_FILENAME = 'vad.npy'
|
||||
_SPEECH_LEVEL_FILENAME = 'speech_level.npy'
|
||||
|
||||
# Level estimation params. The time constants in ms indicate the time it takes
|
||||
# for the level estimate to go down/up by 1 db if the signal is zero.
|
||||
# Level estimation params.
|
||||
_ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0)
|
||||
_LEVEL_FRAME_SIZE_MS = 1.0
|
||||
# The time constants in ms indicate the time it takes for the level estimate
|
||||
# to go down/up by 1 db if the signal is zero.
|
||||
_LEVEL_ATTACK_MS = 5.0
|
||||
_LEVEL_DECAY_MS = 20.0
|
||||
_ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0)
|
||||
|
||||
# VAD params.
|
||||
_VAD_THRESHOLD = 1
|
||||
@ -45,6 +47,7 @@ class AudioAnnotationsExtractor(object):
|
||||
self._level = None
|
||||
self._vad = None
|
||||
self._speech_level = None
|
||||
self._level_frame_size = None
|
||||
self._c_attack = None
|
||||
self._c_decay = None
|
||||
|
||||
@ -75,12 +78,15 @@ class AudioAnnotationsExtractor(object):
|
||||
if self._signal.channels != 1:
|
||||
raise NotImplementedError('multiple-channel annotations not implemented')
|
||||
|
||||
# Smoothing params.
|
||||
sample_duration_ms = 1000.0 / self._signal.frame_rate
|
||||
self._c_attack = 0 if self._LEVEL_ATTACK_MS == 0 else (
|
||||
self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_ATTACK_MS))
|
||||
self._c_decay = 0 if self._LEVEL_DECAY_MS == 0 else (
|
||||
self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_DECAY_MS))
|
||||
# level estimation params.
|
||||
self._level_frame_size = int(self._signal.frame_rate / 1000 * (
|
||||
self._LEVEL_FRAME_SIZE_MS))
|
||||
self._c_attack = 0.0 if self._LEVEL_ATTACK_MS == 0 else (
|
||||
self._ONE_DB_REDUCTION ** (
|
||||
self._LEVEL_FRAME_SIZE_MS / self._LEVEL_ATTACK_MS))
|
||||
self._c_decay = 0.0 if self._LEVEL_DECAY_MS == 0 else (
|
||||
self._ONE_DB_REDUCTION ** (
|
||||
self._LEVEL_FRAME_SIZE_MS / self._LEVEL_DECAY_MS))
|
||||
|
||||
# Compute level.
|
||||
self._LevelEstimation()
|
||||
@ -95,6 +101,11 @@ class AudioAnnotationsExtractor(object):
|
||||
# Speech level based on VAD output.
|
||||
self._speech_level = self._level * self._vad
|
||||
|
||||
# Expand to one value per sample.
|
||||
self._level = np.repeat(self._level, self._level_frame_size)
|
||||
self._vad = np.repeat(self._vad, self._level_frame_size)
|
||||
self._speech_level = np.repeat(self._speech_level, self._level_frame_size)
|
||||
|
||||
def Save(self, output_path):
|
||||
np.save(os.path.join(output_path, self._LEVEL_FILENAME), self._level)
|
||||
np.save(os.path.join(output_path, self._VAD_FILENAME), self._vad)
|
||||
@ -104,16 +115,21 @@ class AudioAnnotationsExtractor(object):
|
||||
def _LevelEstimation(self):
|
||||
# Read samples.
|
||||
samples = signal_processing.SignalProcessingUtils.AudioSegmentToRawData(
|
||||
self._signal)
|
||||
num_samples = len(samples)
|
||||
self._signal).astype(np.float32) / 32768.0
|
||||
num_frames = len(samples) // self._level_frame_size
|
||||
num_samples = num_frames * self._level_frame_size
|
||||
|
||||
# Envelope.
|
||||
self._level = np.abs(samples)
|
||||
self._level = np.max(np.reshape(np.abs(samples[:num_samples]), (
|
||||
num_frames, self._level_frame_size)), axis=1)
|
||||
assert len(self._level) == num_frames
|
||||
|
||||
# Envelope smoothing.
|
||||
smooth = lambda curr, prev, k: (1 - k) * curr + k * prev
|
||||
self._level[0] = smooth(self._level[0], 0.0, self._c_attack)
|
||||
for i in range(1, num_samples):
|
||||
for i in range(1, num_frames):
|
||||
self._level[i] = smooth(
|
||||
self._level[i], self._level[i - 1], self._c_attack if (
|
||||
self._level[i] > self._level[i - 1]) else self._c_decay)
|
||||
|
||||
return self._level
|
||||
|
||||
@ -26,7 +26,7 @@ class TestAnnotationsExtraction(unittest.TestCase):
|
||||
"""Unit tests for the annotations module.
|
||||
"""
|
||||
|
||||
_CLEAN_TMP_OUTPUT = False
|
||||
_CLEAN_TMP_OUTPUT = True
|
||||
|
||||
def setUp(self):
|
||||
"""Create temporary folder."""
|
||||
@ -56,12 +56,16 @@ class TestAnnotationsExtraction(unittest.TestCase):
|
||||
e = annotations.AudioAnnotationsExtractor()
|
||||
e.Extract(self._wav_file_path)
|
||||
e.Save(self._tmp_path)
|
||||
np.testing.assert_array_equal(
|
||||
e.GetLevel(),
|
||||
np.load(os.path.join(self._tmp_path, e.GetLevelFileName())))
|
||||
np.testing.assert_array_equal(
|
||||
e.GetVad(),
|
||||
np.load(os.path.join(self._tmp_path, e.GetVadFileName())))
|
||||
np.testing.assert_array_equal(
|
||||
e.GetSpeechLevel(),
|
||||
np.load(os.path.join(self._tmp_path, e.GetSpeechLevelFileName())))
|
||||
|
||||
level = np.load(os.path.join(self._tmp_path, e.GetLevelFileName()))
|
||||
np.testing.assert_array_equal(e.GetLevel(), level)
|
||||
self.assertEqual(np.float32, level.dtype)
|
||||
|
||||
vad = np.load(os.path.join(self._tmp_path, e.GetVadFileName()))
|
||||
np.testing.assert_array_equal(e.GetVad(), vad)
|
||||
self.assertEqual(np.uint8, vad.dtype)
|
||||
|
||||
speech_level = np.load(os.path.join(
|
||||
self._tmp_path, e.GetSpeechLevelFileName()))
|
||||
np.testing.assert_array_equal(e.GetSpeechLevel(), speech_level)
|
||||
self.assertEqual(np.float32, speech_level.dtype)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user