From 0fd1ef135c8492dd5886e9163126a193c6151706 Mon Sep 17 00:00:00 2001
From: Philipp Hancke <philipp.hancke@googlemail.com>
Date: Wed, 10 Jun 2020 14:21:44 +0200
Subject: [PATCH] opus: add helper function to extract voice activity
 information

BUG=webrtc:11643

Change-Id: I3cebc40916de0e4b0f5e41f5fda97dd53f76e4e3
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/176740
Commit-Queue: Philipp Hancke <philipp.hancke@googlemail.com>
Reviewed-by: Minyue Li <minyue@webrtc.org>
Reviewed-by: Jesus de Vicente Pena <devicentepena@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#31490}
---
 .../codecs/opus/opus_interface.cc             | 103 +++++++++++++-----
 .../audio_coding/codecs/opus/opus_interface.h |  16 +++
 .../audio_coding/codecs/opus/opus_unittest.cc |  26 +++++
 3 files changed, 118 insertions(+), 27 deletions(-)

diff --git a/modules/audio_coding/codecs/opus/opus_interface.cc b/modules/audio_coding/codecs/opus/opus_interface.cc
index 4bac365a89..ca39ed8235 100644
--- a/modules/audio_coding/codecs/opus/opus_interface.cc
+++ b/modules/audio_coding/codecs/opus/opus_interface.cc
@@ -678,33 +678,7 @@ int WebRtcOpus_FecDurationEst(const uint8_t* payload,
   return samples;
 }
 
-// This method is based on Definition of the Opus Audio Codec
-// (https://tools.ietf.org/html/rfc6716). Basically, this method is based on
-// parsing the LP layer of an Opus packet, particularly the LBRR flag.
-int WebRtcOpus_PacketHasFec(const uint8_t* payload,
-                            size_t payload_length_bytes) {
-  if (payload == NULL || payload_length_bytes == 0)
-    return 0;
-
-  // In CELT_ONLY mode, packets should not have FEC.
-  if (payload[0] & 0x80)
-    return 0;
-
-  // Max number of frames in an Opus packet is 48.
-  opus_int16 frame_sizes[48];
-  const unsigned char* frame_data[48];
-
-  // Parse packet to get the frames. But we only care about the first frame,
-  // since we can only decode the FEC from the first one.
-  if (opus_packet_parse(payload, static_cast<opus_int32>(payload_length_bytes),
-                        NULL, frame_data, frame_sizes, NULL) < 0) {
-    return 0;
-  }
-
-  if (frame_sizes[0] <= 1) {
-    return 0;
-  }
-
+int WebRtcOpus_NumSilkFrames(const uint8_t* payload) {
   // For computing the payload length in ms, the sample rate is not important
   // since it cancels out. We use 48 kHz, but any valid sample rate would work.
   int payload_length_ms =
@@ -727,10 +701,43 @@ int WebRtcOpus_PacketHasFec(const uint8_t* payload,
     default:
       return 0;  // It is actually even an invalid packet.
   }
+  return silk_frames;
+}
+
+// This method is based on Definition of the Opus Audio Codec
+// (https://tools.ietf.org/html/rfc6716). Basically, this method is based on
+// parsing the LP layer of an Opus packet, particularly the LBRR flag.
+int WebRtcOpus_PacketHasFec(const uint8_t* payload,
+                            size_t payload_length_bytes) {
+  if (payload == NULL || payload_length_bytes == 0)
+    return 0;
+
+  // In CELT_ONLY mode, packets should not have FEC.
+  if (payload[0] & 0x80)
+    return 0;
+
+  int silk_frames = WebRtcOpus_NumSilkFrames(payload);
+  if (silk_frames == 0)
+    return 0;  // Not valid.
 
   const int channels = opus_packet_get_nb_channels(payload);
   RTC_DCHECK(channels == 1 || channels == 2);
 
+  // Max number of frames in an Opus packet is 48.
+  opus_int16 frame_sizes[48];
+  const unsigned char* frame_data[48];
+
+  // Parse packet to get the frames. But we only care about the first frame,
+  // since we can only decode the FEC from the first one.
+  if (opus_packet_parse(payload, static_cast<opus_int32>(payload_length_bytes),
+                        NULL, frame_data, frame_sizes, NULL) < 0) {
+    return 0;
+  }
+
+  if (frame_sizes[0] < 1) {
+    return 0;
+  }
+
   // A frame starts with the LP layer. The LP layer begins with two to eight
   // header bits.These consist of one VAD bit per SILK frame (up to 3),
   // followed by a single flag indicating the presence of LBRR frames.
@@ -748,3 +755,45 @@ int WebRtcOpus_PacketHasFec(const uint8_t* payload,
 
   return 0;
 }
+
+// Scans the SILK header bits of every Opus frame in the packet. Returns 1 if
+// any VAD flag is set, 0 if none is (or the payload is NULL/empty), else -1.
+int WebRtcOpus_PacketHasVoiceActivity(const uint8_t* payload,
+                                      size_t payload_length_bytes) {
+  if (payload == NULL || payload_length_bytes == 0)
+    return 0;
+
+  // In CELT_ONLY mode we can not determine whether there is VAD.
+  if (payload[0] & 0x80)
+    return -1;
+
+  const int silk_frames = WebRtcOpus_NumSilkFrames(payload);
+  if (silk_frames == 0)
+    return -1;  // Not valid.
+
+  const int channels = opus_packet_get_nb_channels(payload);
+  RTC_DCHECK(channels == 1 || channels == 2);
+
+  // Max number of frames in an Opus packet is 48.
+  opus_int16 frame_sizes[48];
+  const unsigned char* frame_data[48];
+  const int frames =
+      opus_packet_parse(payload, static_cast<opus_int32>(payload_length_bytes),
+                        NULL, frame_data, frame_sizes, NULL);
+  if (frames < 0)
+    return -1;
+
+  // A frame's first byte holds the mid VAD flags, the mid LBRR flag and, for
+  // stereo, the side VAD flags. Mask out exactly the VAD bits so that a set
+  // mid LBRR flag is never reported as side-channel voice activity.
+  const int mid_vad = (0xFF << (8 - silk_frames)) & 0xFF;
+  const int side_vad = channels == 2 ? mid_vad >> (silk_frames + 1) : 0;
+  for (int frame = 0; frame < frames; frame++) {
+    if (frame_sizes[frame] < 1)
+      continue;  // Empty frame: no header bits to inspect.
+    if (frame_data[frame][0] & (mid_vad | side_vad))
+      return 1;
+  }
+
+  return 0;
+}
diff --git a/modules/audio_coding/codecs/opus/opus_interface.h b/modules/audio_coding/codecs/opus/opus_interface.h
index e8de973010..2a3ceaa7d3 100644
--- a/modules/audio_coding/codecs/opus/opus_interface.h
+++ b/modules/audio_coding/codecs/opus/opus_interface.h
@@ -510,6 +510,22 @@ int WebRtcOpus_FecDurationEst(const uint8_t* payload,
 int WebRtcOpus_PacketHasFec(const uint8_t* payload,
                             size_t payload_length_bytes);
 
+/****************************************************************************
+ * WebRtcOpus_PacketHasVoiceActivity(...)
+ *
+ * This function returns the SILK VAD information encoded in the opus packet.
+ * For CELT-only packets that do not have VAD information, it returns -1.
+ * Input:
+ *        - payload              : Encoded data pointer
+ *        - payload_length_bytes : Bytes of encoded data
+ *
+ * Return value                  : 0 - no VAD flag set, or NULL/empty payload.
+ *                                 1 - at least one frame had the VAD flag set.
+ *                                -1 - VAD status could not be determined.
+ */
+int WebRtcOpus_PacketHasVoiceActivity(const uint8_t* payload,
+                                      size_t payload_length_bytes);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/modules/audio_coding/codecs/opus/opus_unittest.cc b/modules/audio_coding/codecs/opus/opus_unittest.cc
index 3407d7d3cf..80cab50137 100644
--- a/modules/audio_coding/codecs/opus/opus_unittest.cc
+++ b/modules/audio_coding/codecs/opus/opus_unittest.cc
@@ -949,4 +949,30 @@ TEST_P(OpusTest, OpusDecodeRepacketized) {
   EXPECT_EQ(0, WebRtcOpus_DecoderFree(opus_decoder_));
 }
 
+TEST(OpusVadTest, CeltUnknownStatus) {
+  const uint8_t celt_only_toc[] = {0x80};  // Top TOC bit marks CELT-only.
+  EXPECT_EQ(-1, WebRtcOpus_PacketHasVoiceActivity(celt_only_toc, 1));
+}
+
+TEST(OpusVadTest, Mono20msVadSet) {
+  const uint8_t silk20msMonoVad[] = {0x78, 0x80};  // Frame byte: VAD bit set.
+  EXPECT_EQ(1, WebRtcOpus_PacketHasVoiceActivity(silk20msMonoVad, 2));
+}
+
+TEST(OpusVadTest, Mono20MsVadUnset) {
+  const uint8_t silent_mono_packet[] = {0x78, 0x00};  // No VAD bit set.
+  EXPECT_EQ(0, WebRtcOpus_PacketHasVoiceActivity(silent_mono_packet, 2));
+}
+
+TEST(OpusVadTest, Stereo20MsVadOnSideChannel) {
+  // TOC stereo flag (0x04) set; frame byte 0x20 is the side-channel VAD bit.
+  const uint8_t silk20msStereoSideVad[] = {0x78 | 0x04, 0x20};
+  // Must be exactly 1: EXPECT_TRUE would also accept -1 (undetermined).
+  EXPECT_EQ(1, WebRtcOpus_PacketHasVoiceActivity(silk20msStereoSideVad, 2));
+}
+
+TEST(OpusVadTest, TwoOpusMonoFramesVadOnSecond) {
+  const uint8_t twoMonoFrames[] = {0x78 | 0x1, 0x00, 0x80};  // VAD in frame 2.
+  EXPECT_EQ(1, WebRtcOpus_PacketHasVoiceActivity(twoMonoFrames, 3));
+}
+
 }  // namespace webrtc