From 0fd1ef135c8492dd5886e9163126a193c6151706 Mon Sep 17 00:00:00 2001 From: Philipp Hancke Date: Wed, 10 Jun 2020 14:21:44 +0200 Subject: [PATCH] opus: add helper function to extract voice activity information BUG=webrtc:11643 Change-Id: I3cebc40916de0e4b0f5e41f5fda97dd53f76e4e3 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/176740 Commit-Queue: Philipp Hancke Reviewed-by: Minyue Li Reviewed-by: Jesus de Vicente Pena Cr-Commit-Position: refs/heads/master@{#31490} --- .../codecs/opus/opus_interface.cc | 103 +++++++++++++----- .../audio_coding/codecs/opus/opus_interface.h | 16 +++ .../audio_coding/codecs/opus/opus_unittest.cc | 26 +++++ 3 files changed, 118 insertions(+), 27 deletions(-) diff --git a/modules/audio_coding/codecs/opus/opus_interface.cc b/modules/audio_coding/codecs/opus/opus_interface.cc index 4bac365a89..ca39ed8235 100644 --- a/modules/audio_coding/codecs/opus/opus_interface.cc +++ b/modules/audio_coding/codecs/opus/opus_interface.cc @@ -678,33 +678,7 @@ int WebRtcOpus_FecDurationEst(const uint8_t* payload, return samples; } -// This method is based on Definition of the Opus Audio Codec -// (https://tools.ietf.org/html/rfc6716). Basically, this method is based on -// parsing the LP layer of an Opus packet, particularly the LBRR flag. -int WebRtcOpus_PacketHasFec(const uint8_t* payload, - size_t payload_length_bytes) { - if (payload == NULL || payload_length_bytes == 0) - return 0; - - // In CELT_ONLY mode, packets should not have FEC. - if (payload[0] & 0x80) - return 0; - - // Max number of frames in an Opus packet is 48. - opus_int16 frame_sizes[48]; - const unsigned char* frame_data[48]; - - // Parse packet to get the frames. But we only care about the first frame, - // since we can only decode the FEC from the first one. - if (opus_packet_parse(payload, static_cast(payload_length_bytes), - NULL, frame_data, frame_sizes, NULL) < 0) { - return 0; - } - - if (frame_sizes[0] <= 1) { - return 0; - } - +int WebRtcOpus_NumSilkFrames(const uint8_t* payload) { // For computing the payload length in ms, the sample rate is not important // since it cancels out. We use 48 kHz, but any valid sample rate would work. int payload_length_ms = @@ -727,10 +701,43 @@ int WebRtcOpus_PacketHasFec(const uint8_t* payload, default: return 0; // It is actually even an invalid packet. } + return silk_frames; +} + +// This method is based on Definition of the Opus Audio Codec +// (https://tools.ietf.org/html/rfc6716). Basically, this method is based on +// parsing the LP layer of an Opus packet, particularly the LBRR flag. +int WebRtcOpus_PacketHasFec(const uint8_t* payload, + size_t payload_length_bytes) { + if (payload == NULL || payload_length_bytes == 0) + return 0; + + // In CELT_ONLY mode, packets should not have FEC. + if (payload[0] & 0x80) + return 0; + + int silk_frames = WebRtcOpus_NumSilkFrames(payload); + if (silk_frames == 0) + return 0; // Not valid. const int channels = opus_packet_get_nb_channels(payload); RTC_DCHECK(channels == 1 || channels == 2); + // Max number of frames in an Opus packet is 48. + opus_int16 frame_sizes[48]; + const unsigned char* frame_data[48]; + + // Parse packet to get the frames. But we only care about the first frame, + // since we can only decode the FEC from the first one. + if (opus_packet_parse(payload, static_cast(payload_length_bytes), + NULL, frame_data, frame_sizes, NULL) < 0) { + return 0; + } + + if (frame_sizes[0] < 1) { + return 0; + } + // A frame starts with the LP layer. The LP layer begins with two to eight // header bits.These consist of one VAD bit per SILK frame (up to 3), // followed by a single flag indicating the presence of LBRR frames. @@ -748,3 +755,45 @@ int WebRtcOpus_PacketHasFec(const uint8_t* payload, return 0; } + +int WebRtcOpus_PacketHasVoiceActivity(const uint8_t* payload, + size_t payload_length_bytes) { + if (payload == NULL || payload_length_bytes == 0) + return 0; + + // In CELT_ONLY mode we can not determine whether there is VAD. + if (payload[0] & 0x80) + return -1; + + int silk_frames = WebRtcOpus_NumSilkFrames(payload); + if (silk_frames == 0) + return -1; + + const int channels = opus_packet_get_nb_channels(payload); + RTC_DCHECK(channels == 1 || channels == 2); + + // Max number of frames in an Opus packet is 48. + opus_int16 frame_sizes[48]; + const unsigned char* frame_data[48]; + + // Parse packet to get the frames. + int frames = + opus_packet_parse(payload, static_cast(payload_length_bytes), + NULL, frame_data, frame_sizes, NULL); + if (frames < 0) + return -1; + + // Iterate over all Opus frames which may contain multiple SILK frames. + for (int frame = 0; frame < frames; frame++) { + if (frame_sizes[frame] < 1) { + continue; + } + if (frame_data[frame][0] >> (8 - silk_frames)) + return 1; + if (channels == 2 && + (frame_data[frame][0] << (silk_frames + 1)) >> (8 - silk_frames)) + return 1; + } + + return 0; +} diff --git a/modules/audio_coding/codecs/opus/opus_interface.h b/modules/audio_coding/codecs/opus/opus_interface.h index e8de973010..2a3ceaa7d3 100644 --- a/modules/audio_coding/codecs/opus/opus_interface.h +++ b/modules/audio_coding/codecs/opus/opus_interface.h @@ -510,6 +510,22 @@ int WebRtcOpus_FecDurationEst(const uint8_t* payload, int WebRtcOpus_PacketHasFec(const uint8_t* payload, size_t payload_length_bytes); +/**************************************************************************** + * WebRtcOpus_PacketHasVoiceActivity(...) + * + * This function returns the SILK VAD information encoded in the opus packet. + * For CELT-only packets that do not have VAD information, it returns -1. + * Input: + * - payload : Encoded data pointer + * - payload_length_bytes : Bytes of encoded data + * + * Return value : 0 - no frame had the VAD flag set. + * 1 - at least one frame had the VAD flag set. + * -1 - VAD status could not be determined. + */ +int WebRtcOpus_PacketHasVoiceActivity(const uint8_t* payload, + size_t payload_length_bytes); + #ifdef __cplusplus } // extern "C" #endif diff --git a/modules/audio_coding/codecs/opus/opus_unittest.cc b/modules/audio_coding/codecs/opus/opus_unittest.cc index 3407d7d3cf..80cab50137 100644 --- a/modules/audio_coding/codecs/opus/opus_unittest.cc +++ b/modules/audio_coding/codecs/opus/opus_unittest.cc @@ -949,4 +949,30 @@ TEST_P(OpusTest, OpusDecodeRepacketized) { EXPECT_EQ(0, WebRtcOpus_DecoderFree(opus_decoder_)); } +TEST(OpusVadTest, CeltUnknownStatus) { + const uint8_t celt[] = {0x80}; + EXPECT_EQ(WebRtcOpus_PacketHasVoiceActivity(celt, 1), -1); +} + +TEST(OpusVadTest, Mono20msVadSet) { + uint8_t silk20msMonoVad[] = {0x78, 0x80}; + EXPECT_TRUE(WebRtcOpus_PacketHasVoiceActivity(silk20msMonoVad, 2)); +} + +TEST(OpusVadTest, Mono20MsVadUnset) { + uint8_t silk20msMonoSilence[] = {0x78, 0x00}; + EXPECT_FALSE(WebRtcOpus_PacketHasVoiceActivity(silk20msMonoSilence, 2)); +} + +TEST(OpusVadTest, Stereo20MsVadOnSideChannel) { + uint8_t silk20msStereoVadSideChannel[] = {0x78 | 0x04, 0x20}; + EXPECT_TRUE( + WebRtcOpus_PacketHasVoiceActivity(silk20msStereoVadSideChannel, 2)); +} + +TEST(OpusVadTest, TwoOpusMonoFramesVadOnSecond) { + uint8_t twoMonoFrames[] = {0x78 | 0x1, 0x00, 0x80}; + EXPECT_TRUE(WebRtcOpus_PacketHasVoiceActivity(twoMonoFrames, 3)); +} + } // namespace webrtc