From 3e83b7fe8d94a0c682591c997e32e46c795f8d07 Mon Sep 17 00:00:00 2001
From: Alex Loiko
Date: Tue, 7 Nov 2017 10:51:20 +0100
Subject: [PATCH] audio_processing VAD annotations in APM-qa.

Added the possibility to extract audio_processing VAD annotations in
the Quality Assessment tool. Annotations are extracted into compressed
NumPy 'annotations.npz' files. Annotations contain information about
VAD decisions, speech level, speech probabilities etc.

TBR=alessiob@webrtc.org

Bug: webrtc:7494
Change-Id: I0e54bb67132ae4e180f89959b8bca3ea7f259458
Reviewed-on: https://webrtc-review.googlesource.com/17840
Commit-Queue: Alex Loiko
Reviewed-by: Alessio Bazzica
Reviewed-by: Alex Loiko
Cr-Commit-Position: refs/heads/master@{#20581}
---
 .../test/py_quality_assessment/BUILD.gn      |  13 ++
 .../quality_assessment/annotations.py        | 123 ++++++++++++++----
 .../annotations_unittest.py                  |  74 ++++++++---
 .../quality_assessment/apm_vad.cc            |  93 +++++++++++++
 .../quality_assessment/simulation.py         |   4 +-
 .../quality_assessment/vad.cc                |  14 +-
 .../vad/voice_activity_detector.cc           |   2 -
 .../vad/voice_activity_detector.h            |   4 +-
 8 files changed, 270 insertions(+), 57 deletions(-)
 create mode 100644 modules/audio_processing/test/py_quality_assessment/quality_assessment/apm_vad.cc

diff --git a/modules/audio_processing/test/py_quality_assessment/BUILD.gn b/modules/audio_processing/test/py_quality_assessment/BUILD.gn
index 59623e3183..eee58da2bb 100644
--- a/modules/audio_processing/test/py_quality_assessment/BUILD.gn
+++ b/modules/audio_processing/test/py_quality_assessment/BUILD.gn
@@ -99,6 +99,7 @@ group("unit_tests") {
   testonly = true
   visibility = [ ":*" ]  # Only targets in this file can depend on this.
   deps = [
+    ":apm_vad",
     ":fake_polqa",
     ":lib_unit_tests",
     ":scripts_unit_tests",
@@ -130,6 +131,18 @@ rtc_executable("vad") {
   ]
 }
 
+rtc_executable("apm_vad") {
+  sources = [
+    "quality_assessment/apm_vad.cc",
+  ]
+  deps = [
+    "../..",
+    "../../../..:webrtc_common",
+    "../../../../common_audio",
+    "../../../../rtc_base:rtc_base_approved",
+  ]
+}
+
 copy("lib_unit_tests") {
   testonly = true
   sources = [
diff --git a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py
index 399beb7c0e..2f5daf1f23 100644
--- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py
+++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py
@@ -10,7 +10,6 @@
 """
 from __future__ import division
-import enum
 import logging
 import os
 import shutil
@@ -33,10 +32,30 @@ class AudioAnnotationsExtractor(object):
   """Extracts annotations from audio files.
   """
 
-  @enum.unique
-  class VadType(enum.Enum):
-    ENERGY_THRESHOLD = 0  # TODO(alessiob): Consider switching to P56 standard.
-    WEBRTC = 1
+  # TODO(aleloi): change to enum.IntEnum when py 3.6 is available.
+  class VadType(object):
+    ENERGY_THRESHOLD = 1  # TODO(alessiob): Consider switching to P56 standard.
+    WEBRTC_COMMON_AUDIO = 2  # common_audio/vad/include/vad.h
+    WEBRTC_APM = 4  # modules/audio_processing/vad/vad.h
+
+    def __init__(self, value):
+      if (not isinstance(value, int)) or not 0 <= value <= 7:
+        raise exceptions.InitializationException(
+            'Invalid vad type: ' + str(value))
+      self._value = value
+
+    def Contains(self, vad_type):
+      return self._value | vad_type == self._value
+
+    def __str__(self):
+      vads = []
+      if self.Contains(self.ENERGY_THRESHOLD):
+        vads.append("energy")
+      if self.Contains(self.WEBRTC_COMMON_AUDIO):
+        vads.append("common_audio")
+      if self.Contains(self.WEBRTC_APM):
+        vads.append("apm")
+      return "VadType({})".format(", ".join(vads))
 
   _OUTPUT_FILENAME = 'annotations.npz'
 
@@ -52,25 +71,31 @@ class AudioAnnotationsExtractor(object):
   _VAD_THRESHOLD = 1
   _VAD_WEBRTC_PATH = os.path.join(os.path.dirname(
       os.path.abspath(__file__)), os.pardir, os.pardir)
-  _VAD_WEBRTC_BIN_PATH = os.path.join(_VAD_WEBRTC_PATH, 'vad')
+  _VAD_WEBRTC_COMMON_AUDIO_PATH = os.path.join(_VAD_WEBRTC_PATH, 'vad')
+
+  _VAD_WEBRTC_APM_PATH = os.path.join(
+      _VAD_WEBRTC_PATH, 'apm_vad')
 
   def __init__(self, vad_type):
     self._signal = None
     self._level = None
     self._level_frame_size = None
-    self._vad_output = None
+    self._common_audio_vad = None
+    self._energy_vad = None
+    self._apm_vad_probs = None
+    self._apm_vad_rms = None
     self._vad_frame_size = None
     self._vad_frame_size_ms = None
     self._c_attack = None
     self._c_decay = None
 
-    self._vad_type = vad_type
-    if self._vad_type not in self.VadType:
-      raise exceptions.InitializationException(
-          'Invalid vad type: ' + self._vad_type)
-    logging.info('VAD used for annotations: ' + str(self._vad_type))
+    self._vad_type = self.VadType(vad_type)
+    logging.info('VADs used for annotations: ' + str(self._vad_type))
 
-    assert os.path.exists(self._VAD_WEBRTC_BIN_PATH), self._VAD_WEBRTC_BIN_PATH
+    assert os.path.exists(self._VAD_WEBRTC_COMMON_AUDIO_PATH), \
+        self._VAD_WEBRTC_COMMON_AUDIO_PATH
+    assert os.path.exists(self._VAD_WEBRTC_APM_PATH), \
+        self._VAD_WEBRTC_APM_PATH
 
   @classmethod
   def GetOutputFileName(cls):
@@ -86,8 +111,16 @@ class AudioAnnotationsExtractor(object):
   def GetLevelFrameSizeMs(cls):
     return cls._LEVEL_FRAME_SIZE_MS
 
-  def GetVadOutput(self):
-    return self._vad_output
+  def GetVadOutput(self, vad_type):
+    if vad_type == self.VadType.ENERGY_THRESHOLD:
+      return (self._energy_vad, )
+    elif vad_type == self.VadType.WEBRTC_COMMON_AUDIO:
+      return (self._common_audio_vad, )
+    elif vad_type == self.VadType.WEBRTC_APM:
+      return (self._apm_vad_probs, self._apm_vad_rms)
+    else:
+      raise exceptions.InitializationException(
+          'Invalid vad type: ' + str(vad_type))
 
   def GetVadFrameSize(self):
     return self._vad_frame_size
@@ -115,15 +148,18 @@ class AudioAnnotationsExtractor(object):
     self._LevelEstimation()
 
     # Ideal VAD output, it requires clean speech with high SNR as input.
-    if self._vad_type == self.VadType.ENERGY_THRESHOLD:
+    if self._vad_type.Contains(self.VadType.ENERGY_THRESHOLD):
       # Naive VAD based on level thresholding.
       vad_threshold = np.percentile(self._level, self._VAD_THRESHOLD)
-      self._vad_output = np.uint8(self._level > vad_threshold)
+      self._energy_vad = np.uint8(self._level > vad_threshold)
       self._vad_frame_size = self._level_frame_size
       self._vad_frame_size_ms = self._LEVEL_FRAME_SIZE_MS
-    elif self._vad_type == self.VadType.WEBRTC:
-      # WebRTC VAD.
-      self._RunWebRtcVad(filepath, self._signal.frame_rate)
+    if self._vad_type.Contains(self.VadType.WEBRTC_COMMON_AUDIO):
+      # WebRTC common_audio/ VAD.
+      self._RunWebRtcCommonAudioVad(filepath, self._signal.frame_rate)
+    if self._vad_type.Contains(self.VadType.WEBRTC_APM):
+      # WebRTC modules/audio_processing/ VAD.
+      self._RunWebRtcApmVad(filepath)
 
   def Save(self, output_path):
     np.savez_compressed(
@@ -131,9 +167,13 @@ class AudioAnnotationsExtractor(object):
         level=self._level,
         level_frame_size=self._level_frame_size,
         level_frame_size_ms=self._LEVEL_FRAME_SIZE_MS,
-        vad_output=self._vad_output,
+        vad_output=self._common_audio_vad,
+        vad_energy_output=self._energy_vad,
         vad_frame_size=self._vad_frame_size,
-        vad_frame_size_ms=self._vad_frame_size_ms)
+        vad_frame_size_ms=self._vad_frame_size_ms,
+        vad_probs=self._apm_vad_probs,
+        vad_rms=self._apm_vad_rms
+    )
 
   def _LevelEstimation(self):
     # Read samples.
@@ -155,8 +195,8 @@ class AudioAnnotationsExtractor(object):
           self._level[i], self._level[i - 1], self._c_attack if (
               self._level[i] > self._level[i - 1]) else self._c_decay)
 
-  def _RunWebRtcVad(self, wav_file_path, sample_rate):
-    self._vad_output = None
+  def _RunWebRtcCommonAudioVad(self, wav_file_path, sample_rate):
+    self._common_audio_vad = None
     self._vad_frame_size = None
 
     # Create temporary output path.
@@ -167,7 +207,7 @@ class AudioAnnotationsExtractor(object):
     # Call WebRTC VAD.
     try:
       subprocess.call([
-          self._VAD_WEBRTC_BIN_PATH,
+          self._VAD_WEBRTC_COMMON_AUDIO_PATH,
           '-i', wav_file_path,
           '-o', output_file_path
       ], cwd=self._VAD_WEBRTC_PATH)
@@ -186,16 +226,45 @@ class AudioAnnotationsExtractor(object):
       # Init VAD vector.
       num_bytes = len(raw_data)
       num_frames = 8 * (num_bytes - 2) - extra_bits  # 8 frames for each byte.
-      self._vad_output = np.zeros(num_frames, np.uint8)
+      self._common_audio_vad = np.zeros(num_frames, np.uint8)
 
       # Read VAD decisions.
       for i, byte in enumerate(raw_data[1:-1]):
         byte = struct.unpack('B', byte)[0]
         for j in range(8 if i < num_bytes - 3 else (8 - extra_bits)):
-          self._vad_output[i * 8 + j] = int(byte & 1)
+          self._common_audio_vad[i * 8 + j] = int(byte & 1)
           byte = byte >> 1
     except Exception as e:
       logging.error('Error while running the WebRTC VAD (' + e.message + ')')
     finally:
       if os.path.exists(tmp_path):
         shutil.rmtree(tmp_path)
+
+  def _RunWebRtcApmVad(self, wav_file_path):
+    # Create temporary output path.
+    tmp_path = tempfile.mkdtemp()
+    output_file_path_probs = os.path.join(
+        tmp_path, os.path.split(wav_file_path)[1] + '_vad_probs.tmp')
+    output_file_path_rms = os.path.join(
+        tmp_path, os.path.split(wav_file_path)[1] + '_vad_rms.tmp')
+
+    # Call WebRTC VAD.
+    try:
+      subprocess.call([
+          self._VAD_WEBRTC_APM_PATH,
+          '-i', wav_file_path,
+          '-o_probs', output_file_path_probs,
+          '-o_rms', output_file_path_rms
+      ], cwd=self._VAD_WEBRTC_PATH)
+
+      # Parse annotations.
+      self._apm_vad_probs = np.fromfile(output_file_path_probs, np.double)
+      self._apm_vad_rms = np.fromfile(output_file_path_rms, np.double)
+      assert len(self._apm_vad_rms) == len(self._apm_vad_probs)
+
+    except Exception as e:
+      logging.error('Error while running the WebRTC APM VAD (' +
+                    e.message + ')')
+    finally:
+      if os.path.exists(tmp_path):
+        shutil.rmtree(tmp_path)
diff --git a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py
index 8cb0d048b3..3f44edfb84 100644
--- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py
+++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py
@@ -49,22 +49,49 @@ class TestAnnotationsExtraction(unittest.TestCase):
         self._tmp_path))
 
   def testFrameSizes(self):
-    for vad_type in annotations.AudioAnnotationsExtractor.VadType:
-      e = annotations.AudioAnnotationsExtractor(vad_type=vad_type)
-      e.Extract(self._wav_file_path)
-      samples_to_ms = lambda n, sr: 1000 * n // sr
-      self.assertEqual(samples_to_ms(e.GetLevelFrameSize(), self._sample_rate),
-                       e.GetLevelFrameSizeMs())
-      self.assertEqual(samples_to_ms(e.GetVadFrameSize(), self._sample_rate),
-                       e.GetVadFrameSizeMs())
+    vad_type_class = annotations.AudioAnnotationsExtractor.VadType
+    vad_type = (vad_type_class.ENERGY_THRESHOLD |
+                vad_type_class.WEBRTC_COMMON_AUDIO |
+                vad_type_class.WEBRTC_APM)
+    e = annotations.AudioAnnotationsExtractor(vad_type=vad_type)
+    e.Extract(self._wav_file_path)
+    samples_to_ms = lambda n, sr: 1000 * n // sr
+    self.assertEqual(samples_to_ms(e.GetLevelFrameSize(), self._sample_rate),
+                     e.GetLevelFrameSizeMs())
+    self.assertEqual(samples_to_ms(e.GetVadFrameSize(), self._sample_rate),
+                     e.GetVadFrameSizeMs())
 
   def testVoiceActivityDetectors(self):
-    for vad_type in annotations.AudioAnnotationsExtractor.VadType:
-      e = annotations.AudioAnnotationsExtractor(vad_type=vad_type)
+    vad_type_class = annotations.AudioAnnotationsExtractor.VadType
+    max_vad_type = (vad_type_class.ENERGY_THRESHOLD |
+                    vad_type_class.WEBRTC_COMMON_AUDIO |
+                    vad_type_class.WEBRTC_APM)
+    for vad_type_value in range(0, max_vad_type + 1):
+      vad_type = vad_type_class(vad_type_value)
+      e = annotations.AudioAnnotationsExtractor(vad_type=vad_type_value)
       e.Extract(self._wav_file_path)
-      vad_output = e.GetVadOutput()
-      self.assertGreater(len(vad_output), 0)
-      self.assertGreaterEqual(float(np.sum(vad_output)) / len(vad_output), 0.95)
+      if vad_type.Contains(vad_type_class.ENERGY_THRESHOLD):
+        # pylint: disable=unbalanced-tuple-unpacking
+        (vad_output, ) = e.GetVadOutput(vad_type_class.ENERGY_THRESHOLD)
+        self.assertGreater(len(vad_output), 0)
+        self.assertGreaterEqual(float(np.sum(vad_output)) / len(vad_output),
+                                0.95)
+
+      if vad_type.Contains(vad_type_class.WEBRTC_COMMON_AUDIO):
+        # pylint: disable=unbalanced-tuple-unpacking
+        (vad_output,) = e.GetVadOutput(vad_type_class.WEBRTC_COMMON_AUDIO)
+        self.assertGreater(len(vad_output), 0)
+        self.assertGreaterEqual(float(np.sum(vad_output)) / len(vad_output),
+                                0.95)
+
+      if vad_type.Contains(vad_type_class.WEBRTC_APM):
+        # pylint: disable=unbalanced-tuple-unpacking
+        (vad_probs, vad_rms) = e.GetVadOutput(vad_type_class.WEBRTC_APM)
+        self.assertGreater(len(vad_probs), 0)
+        self.assertGreater(len(vad_rms), 0)
+        self.assertGreaterEqual(float(np.sum(vad_probs)) / len(vad_probs),
+                                0.95)
+        self.assertGreaterEqual(float(np.sum(vad_rms)) / len(vad_rms), 20000)
+
       if self._DEBUG_PLOT_VAD:
         frame_times_s = lambda num_frames, frame_size_ms: np.arange(
@@ -84,13 +111,26 @@ class TestAnnotationsExtraction(unittest.TestCase):
       plt.show()
 
   def testSaveLoad(self):
-    e = annotations.AudioAnnotationsExtractor(
-        vad_type=annotations.AudioAnnotationsExtractor.VadType.ENERGY_THRESHOLD)
+    vad_type_class = annotations.AudioAnnotationsExtractor.VadType
+    vad_type = (vad_type_class.ENERGY_THRESHOLD |
+                vad_type_class.WEBRTC_COMMON_AUDIO |
+                vad_type_class.WEBRTC_APM)
+    e = annotations.AudioAnnotationsExtractor(vad_type)
     e.Extract(self._wav_file_path)
     e.Save(self._tmp_path)
     data = np.load(os.path.join(self._tmp_path, e.GetOutputFileName()))
     np.testing.assert_array_equal(e.GetLevel(), data['level'])
     self.assertEqual(np.float32, data['level'].dtype)
-    np.testing.assert_array_equal(e.GetVadOutput(), data['vad_output'])
-    self.assertEqual(np.uint8, data['vad_output'].dtype)
+    np.testing.assert_array_equal(
+        e.GetVadOutput(vad_type_class.ENERGY_THRESHOLD),
+        data['vad_energy_output'])
+    np.testing.assert_array_equal(
+        e.GetVadOutput(vad_type_class.WEBRTC_COMMON_AUDIO), data['vad_output'])
+    np.testing.assert_array_equal(
+        e.GetVadOutput(vad_type_class.WEBRTC_APM)[0], data['vad_probs'])
+    np.testing.assert_array_equal(
+        e.GetVadOutput(vad_type_class.WEBRTC_APM)[1], data['vad_rms'])
+    self.assertEqual(np.uint8, data['vad_energy_output'].dtype)
+    self.assertEqual(np.float64, data['vad_probs'].dtype)
+    self.assertEqual(np.float64, data['vad_rms'].dtype)
diff --git a/modules/audio_processing/test/py_quality_assessment/quality_assessment/apm_vad.cc b/modules/audio_processing/test/py_quality_assessment/quality_assessment/apm_vad.cc
new file mode 100644
index 0000000000..ccbd02a504
--- /dev/null
+++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/apm_vad.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include <array>
+#include <fstream>
+#include <memory>
+
+#include "common_audio/wav_file.h"
+#include "modules/audio_processing/vad/voice_activity_detector.h"
+#include "rtc_base/flags.h"
+#include "rtc_base/logging.h"
+
+namespace webrtc {
+namespace test {
+namespace {
+
+constexpr uint8_t kAudioFrameLengthMilliseconds = 10;
+constexpr int kMaxSampleRate = 48000;
+constexpr size_t kMaxFrameLen =
+    kAudioFrameLengthMilliseconds * kMaxSampleRate / 1000;
+
+DEFINE_string(i, "", "Input wav file");
+DEFINE_string(o_probs, "", "VAD probabilities output file");
+DEFINE_string(o_rms, "", "VAD output file");
+
+int main(int argc, char* argv[]) {
+  if (rtc::FlagList::SetFlagsFromCommandLine(&argc, argv, true))
+    return 1;
+
+  // Open wav input file and check properties.
+  WavReader wav_reader(FLAG_i);
+  if (wav_reader.num_channels() != 1) {
+    LOG(LS_ERROR) << "Only mono wav files supported";
+    return 1;
+  }
+  if (wav_reader.sample_rate() > kMaxSampleRate) {
+    LOG(LS_ERROR) << "Beyond maximum sample rate (" << kMaxSampleRate << ")";
+    return 1;
+  }
+  const size_t audio_frame_len = rtc::CheckedDivExact(
+      kAudioFrameLengthMilliseconds * wav_reader.sample_rate(), 1000);
+  if (audio_frame_len > kMaxFrameLen) {
+    LOG(LS_ERROR) << "The frame size and/or the sample rate are too large.";
+    return 1;
+  }
+
+  // Create output file and write header.
+  std::ofstream out_probs_file(FLAG_o_probs, std::ofstream::binary);
+  std::ofstream out_rms_file(FLAG_o_rms, std::ofstream::binary);
+
+  // Run VAD and write decisions.
+  VoiceActivityDetector vad;
+  std::array<int16_t, kMaxFrameLen> samples;
+
+  while (true) {
+    // Process frame.
+    const auto read_samples =
+        wav_reader.ReadSamples(audio_frame_len, samples.data());
+    if (read_samples < audio_frame_len) {
+      break;
+    }
+    vad.ProcessChunk(samples.data(), audio_frame_len,
+                     wav_reader.sample_rate());
+    // Write output.
+    auto probs = vad.chunkwise_voice_probabilities();
+    auto rms = vad.chunkwise_rms();
+    RTC_CHECK_EQ(probs.size(), rms.size());
+    RTC_CHECK_EQ(sizeof(double), 8);
+
+    for (const auto& p : probs) {
+      out_probs_file.write(reinterpret_cast<const char*>(&p), 8);
+    }
+    for (const auto& r : rms) {
+      out_rms_file.write(reinterpret_cast<const char*>(&r), 8);
+    }
+  }
+
+  out_probs_file.close();
+  out_rms_file.close();
+  return 0;
+}
+
+}  // namespace
+}  // namespace test
+}  // namespace webrtc
+
+int main(int argc, char* argv[]) {
+  return webrtc::test::main(argc, argv);
+}
diff --git a/modules/audio_processing/test/py_quality_assessment/quality_assessment/simulation.py b/modules/audio_processing/test/py_quality_assessment/quality_assessment/simulation.py
index 305487a030..8e672916c5 100644
--- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/simulation.py
+++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/simulation.py
@@ -47,7 +47,9 @@ class ApmModuleSimulator(object):
     self._audioproc_wrapper = ap_wrapper
     self._evaluator = evaluator
     self._annotator = annotations.AudioAnnotationsExtractor(
-        vad_type=annotations.AudioAnnotationsExtractor.VadType.WEBRTC)
+        annotations.AudioAnnotationsExtractor.VadType.ENERGY_THRESHOLD |
+        annotations.AudioAnnotationsExtractor.VadType.WEBRTC_COMMON_AUDIO |
+        annotations.AudioAnnotationsExtractor.VadType.WEBRTC_APM)
 
     # Init.
     self._test_data_generator_factory.SetOutputDirectoryPrefix(
diff --git a/modules/audio_processing/test/py_quality_assessment/quality_assessment/vad.cc b/modules/audio_processing/test/py_quality_assessment/quality_assessment/vad.cc
index 3a2c2849cf..90aa338b8c 100644
--- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/vad.cc
+++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/vad.cc
@@ -44,11 +44,11 @@ int main(int argc, char* argv[]) {
     LOG(LS_ERROR) << "Beyond maximum sample rate (" << kMaxSampleRate << ")";
     return 1;
   }
-  const size_t kAudioFrameLen = rtc::CheckedDivExact(
+  const size_t audio_frame_length = rtc::CheckedDivExact(
       kAudioFrameLengthMilliseconds * wav_reader.sample_rate(), 1000);
-  if (kAudioFrameLen > kMaxFrameLen) {
+  if (audio_frame_length > kMaxFrameLen) {
     LOG(LS_ERROR) << "The frame size and/or the sample rate are too large.";
-    return 2;
+    return 1;
   }
 
   // Create output file and write header.
@@ -64,11 +64,11 @@ int main(int argc, char* argv[]) {
 
   while (true) {
     // Process frame.
const auto read_samples = - wav_reader.ReadSamples(kAudioFrameLen, samples.data()); - if (read_samples < kAudioFrameLen) + wav_reader.ReadSamples(audio_frame_length, samples.data()); + if (read_samples < audio_frame_length) break; - const auto is_speech = vad->VoiceActivity(samples.data(), kAudioFrameLen, - wav_reader.sample_rate()); + const auto is_speech = vad->VoiceActivity( + samples.data(), audio_frame_length, wav_reader.sample_rate()); // Write output. buff = is_speech ? buff | (1 << next) : buff & ~(1 << next); diff --git a/modules/audio_processing/vad/voice_activity_detector.cc b/modules/audio_processing/vad/voice_activity_detector.cc index dfba73bce8..66a704f6be 100644 --- a/modules/audio_processing/vad/voice_activity_detector.cc +++ b/modules/audio_processing/vad/voice_activity_detector.cc @@ -17,7 +17,6 @@ namespace webrtc { namespace { -const size_t kMaxLength = 320; const size_t kNumChannels = 1; const double kDefaultVoiceValue = 1.0; @@ -40,7 +39,6 @@ void VoiceActivityDetector::ProcessChunk(const int16_t* audio, size_t length, int sample_rate_hz) { RTC_DCHECK_EQ(length, sample_rate_hz / 100); - RTC_DCHECK_LE(length, kMaxLength); // Resample to the required rate. const int16_t* resampled_ptr = audio; if (sample_rate_hz != kSampleRateHz) { diff --git a/modules/audio_processing/vad/voice_activity_detector.h b/modules/audio_processing/vad/voice_activity_detector.h index c937bbb5ff..0079cb2184 100644 --- a/modules/audio_processing/vad/voice_activity_detector.h +++ b/modules/audio_processing/vad/voice_activity_detector.h @@ -29,9 +29,7 @@ class VoiceActivityDetector { VoiceActivityDetector(); ~VoiceActivityDetector(); - // Processes each audio chunk and estimates the voice probability. The maximum - // supported sample rate is 32kHz. - // TODO(aluebs): Change |length| to size_t. + // Processes each audio chunk and estimates the voice probability. void ProcessChunk(const int16_t* audio, size_t length, int sample_rate_hz); // Returns a vector of voice probabilities for each chunk. It can be empty for
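
Usage note (editor's addition, not part of the commit): the sketch below
shows how the compressed 'annotations.npz' files written by
AudioAnnotationsExtractor.Save() can be loaded back for inspection. The
array keys and dtypes match the np.savez_compressed() call in
annotations.py above; the output directory path is hypothetical.

  import os

  import numpy as np

  # Hypothetical directory previously passed to Save().
  ANNOTATIONS_DIR = '/tmp/apm_qa_output'

  data = np.load(os.path.join(ANNOTATIONS_DIR, 'annotations.npz'))

  # Speech level (np.float32) and its framing.
  level = data['level']
  level_frame_size_ms = data['level_frame_size_ms']

  # Binary decisions (np.uint8) of the energy and common_audio/ VADs.
  energy_vad = data['vad_energy_output']
  common_audio_vad = data['vad_output']

  # audio_processing/ VAD annotations (np.float64): per-chunk voice
  # probabilities and speech RMS, produced via the new apm_vad binary.
  apm_vad_probs = data['vad_probs']
  apm_vad_rms = data['vad_rms']

  print('mean voice probability: {:.2f}'.format(np.mean(apm_vad_probs)))
  print('mean speech RMS: {:.1f}'.format(np.mean(apm_vad_rms)))

The probability and RMS files parsed by _RunWebRtcApmVad() come from an
invocation of the form "apm_vad -i input.wav -o_probs probs.tmp -o_rms
rms.tmp", with the flag names as defined in apm_vad.cc.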