opus: add helper function to extract voice activity information

BUG=webrtc:11643 Change-Id: I3cebc40916de0e4b0f5e41f5fda97dd53f76e4e3 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/176740 Commit-Queue: Philipp Hancke <philipp.hancke@googlemail.com> Reviewed-by: Minyue Li <minyue@webrtc.org> Reviewed-by: Jesus de Vicente Pena <devicentepena@webrtc.org> Cr-Commit-Position: refs/heads/master@{#31490}
2020-06-10 14:21:44 +02:00 · 2020-06-10 14:21:44 +02:00 · 0fd1ef135c
commit 0fd1ef135c
parent 0ca13d97d2
3 changed files with 118 additions and 27 deletions
--- a/modules/audio_coding/codecs/opus/opus_interface.cc
+++ b/modules/audio_coding/codecs/opus/opus_interface.cc
@ -678,33 +678,7 @@ int WebRtcOpus_FecDurationEst(const uint8_t* payload,
  return samples;
 }

-// This method is based on Definition of the Opus Audio Codec
-// (https://tools.ietf.org/html/rfc6716). Basically, this method is based on
-// parsing the LP layer of an Opus packet, particularly the LBRR flag.
-int WebRtcOpus_PacketHasFec(const uint8_t* payload,
-                            size_t payload_length_bytes) {
-  if (payload == NULL || payload_length_bytes == 0)
-    return 0;
-
-  // In CELT_ONLY mode, packets should not have FEC.
-  if (payload[0] & 0x80)
-    return 0;
-
-  // Max number of frames in an Opus packet is 48.
-  opus_int16 frame_sizes[48];
-  const unsigned char* frame_data[48];
-
-  // Parse packet to get the frames. But we only care about the first frame,
-  // since we can only decode the FEC from the first one.
-  if (opus_packet_parse(payload, static_cast<opus_int32>(payload_length_bytes),
-                        NULL, frame_data, frame_sizes, NULL) < 0) {
-    return 0;
-  }
-
-  if (frame_sizes[0] <= 1) {
-    return 0;
-  }
-
+int WebRtcOpus_NumSilkFrames(const uint8_t* payload) {
  // For computing the payload length in ms, the sample rate is not important
  // since it cancels out. We use 48 kHz, but any valid sample rate would work.
  int payload_length_ms =
@ -727,10 +701,43 @@ int WebRtcOpus_PacketHasFec(const uint8_t* payload,
    default:
      return 0;  // It is actually even an invalid packet.
  }
+  return silk_frames;
+}
+
+// This method is based on Definition of the Opus Audio Codec
+// (https://tools.ietf.org/html/rfc6716). Basically, this method is based on
+// parsing the LP layer of an Opus packet, particularly the LBRR flag.
+int WebRtcOpus_PacketHasFec(const uint8_t* payload,
+                            size_t payload_length_bytes) {
+  if (payload == NULL || payload_length_bytes == 0)
+    return 0;
+
+  // In CELT_ONLY mode, packets should not have FEC.
+  if (payload[0] & 0x80)
+    return 0;
+
+  int silk_frames = WebRtcOpus_NumSilkFrames(payload);
+  if (silk_frames == 0)
+    return 0;  // Not valid.

  const int channels = opus_packet_get_nb_channels(payload);
  RTC_DCHECK(channels == 1 || channels == 2);

+  // Max number of frames in an Opus packet is 48.
+  opus_int16 frame_sizes[48];
+  const unsigned char* frame_data[48];
+
+  // Parse packet to get the frames. But we only care about the first frame,
+  // since we can only decode the FEC from the first one.
+  if (opus_packet_parse(payload, static_cast<opus_int32>(payload_length_bytes),
+                        NULL, frame_data, frame_sizes, NULL) < 0) {
+    return 0;
+  }
+
+  if (frame_sizes[0] < 1) {
+    return 0;
+  }
+
  // A frame starts with the LP layer. The LP layer begins with two to eight
  // header bits. These consist of one VAD bit per SILK frame (up to 3),
  // followed by a single flag indicating the presence of LBRR frames.
@ -748,3 +755,45 @@ int WebRtcOpus_PacketHasFec(const uint8_t* payload,

  return 0;
 }
+
+// Reports whether any SILK frame inside an Opus packet has its VAD flag set.
+//
+// Based on the Opus packet layout (https://tools.ietf.org/html/rfc6716): each
+// Opus frame with an LP (SILK) layer starts with header bits laid out as
+//   [mid VAD x silk_frames][mid LBRR][side VAD x silk_frames][side LBRR]
+// where the side-channel bits are only present for stereo packets.
+//
+// Returns 1 if at least one VAD flag is set, 0 if none is set (also for a
+// NULL or empty payload), and -1 if the status cannot be determined
+// (CELT-only packet, no SILK frames reported, or the packet fails to parse).
+int WebRtcOpus_PacketHasVoiceActivity(const uint8_t* payload,
+                                      size_t payload_length_bytes) {
+  if (payload == NULL || payload_length_bytes == 0)
+    return 0;
+
+  // In CELT_ONLY mode we can not determine whether there is VAD.
+  if (payload[0] & 0x80)
+    return -1;
+
+  const int silk_frames = WebRtcOpus_NumSilkFrames(payload);
+  if (silk_frames == 0)
+    return -1;
+
+  const int channels = opus_packet_get_nb_channels(payload);
+  RTC_DCHECK(channels == 1 || channels == 2);
+
+  // Max number of frames in an Opus packet is 48.
+  opus_int16 frame_sizes[48];
+  const unsigned char* frame_data[48];
+
+  // Parse packet to get the frames.
+  const int frames =
+      opus_packet_parse(payload, static_cast<opus_int32>(payload_length_bytes),
+                        NULL, frame_data, frame_sizes, NULL);
+  if (frames < 0)
+    return -1;
+
+  // Iterate over all Opus frames which may contain multiple SILK frames.
+  for (int frame = 0; frame < frames; frame++) {
+    if (frame_sizes[frame] < 1) {
+      continue;
+    }
+    const unsigned int header = frame_data[frame][0];
+    // Mid (or mono) channel: the VAD flags are the top `silk_frames` bits.
+    if (header >> (8 - silk_frames))
+      return 1;
+    // Side channel: shift out the mid VAD flags and the mid LBRR flag. The
+    // byte is promoted to a wider integer, so the shifted value must be
+    // masked back to 8 bits; otherwise the mid-channel bits survive the shift
+    // and a set mid LBRR flag would be misread as side-channel voice activity.
+    if (channels == 2 &&
+        (((header << (silk_frames + 1)) & 0xFFu) >> (8 - silk_frames)))
+      return 1;
+  }
+
+  return 0;
+}
--- a/modules/audio_coding/codecs/opus/opus_interface.h
+++ b/modules/audio_coding/codecs/opus/opus_interface.h
@ -510,6 +510,22 @@ int WebRtcOpus_FecDurationEst(const uint8_t* payload,
 int WebRtcOpus_PacketHasFec(const uint8_t* payload,
                            size_t payload_length_bytes);

+/****************************************************************************
+ * WebRtcOpus_PacketHasVoiceActivity(...)
+ *
+ * This function returns the SILK VAD information encoded in the opus packet.
+ * For CELT-only packets that do not have VAD information, it returns -1.
+ * A NULL payload or a payload length of zero yields 0.
+ * Input:
+ *        - payload              : Encoded data pointer
+ *        - payload_length_bytes : Bytes of encoded data
+ *
+ * Return value                  : 0 - no frame had the VAD flag set.
+ *                                 1 - at least one frame had the VAD flag set.
+ *                                -1 - VAD status could not be determined
+ *                                     (CELT-only packet, or the packet could
+ *                                     not be parsed).
+ */
+int WebRtcOpus_PacketHasVoiceActivity(const uint8_t* payload,
+                                      size_t payload_length_bytes);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/modules/audio_coding/codecs/opus/opus_unittest.cc
+++ b/modules/audio_coding/codecs/opus/opus_unittest.cc
@ -949,4 +949,30 @@ TEST_P(OpusTest, OpusDecodeRepacketized) {
  EXPECT_EQ(0, WebRtcOpus_DecoderFree(opus_decoder_));
 }

+// A packet whose first (TOC) byte has the top bit set is CELT-only; no VAD
+// information can be extracted, so the status must be reported as -1.
+TEST(OpusVadTest, CeltUnknownStatus) {
+  const uint8_t celt_only[] = {0x80};
+  const int vad_status =
+      WebRtcOpus_PacketHasVoiceActivity(celt_only, sizeof(celt_only));
+  EXPECT_EQ(vad_status, -1);
+}
+
+// Mono, single-frame packet (TOC 0x78: top bit clear, so not CELT-only) whose
+// frame starts with 0x80, i.e. the VAD flag of its only SILK frame is set.
+// The result is compared against 1 explicitly: the function is tri-state and
+// EXPECT_TRUE would also accept -1 ("could not be determined").
+TEST(OpusVadTest, Mono20msVadSet) {
+  uint8_t silk20msMonoVad[] = {0x78, 0x80};
+  EXPECT_EQ(WebRtcOpus_PacketHasVoiceActivity(silk20msMonoVad, 2), 1);
+}
+
+// Mono, single-frame packet whose frame byte has no header bits set: the
+// packet is parsable but carries no voice activity, so 0 is expected.
+TEST(OpusVadTest, Mono20MsVadUnset) {
+  uint8_t silent_packet[] = {0x78, 0x00};
+  const int vad_status =
+      WebRtcOpus_PacketHasVoiceActivity(silent_packet, sizeof(silent_packet));
+  EXPECT_EQ(vad_status, 0);
+}
+
+// Stereo packet (TOC 0x78 with the stereo bit 0x04 set). The frame byte 0x20
+// leaves the mid VAD and mid LBRR flags clear and sets the third header bit,
+// which is the side-channel VAD flag when there is one SILK frame.
+// The result is compared against 1 explicitly: EXPECT_TRUE would also accept
+// -1 ("could not be determined").
+TEST(OpusVadTest, Stereo20MsVadOnSideChannel) {
+  uint8_t silk20msStereoVadSideChannel[] = {0x78 | 0x04, 0x20};
+  EXPECT_EQ(
+      WebRtcOpus_PacketHasVoiceActivity(silk20msStereoVadSideChannel, 2), 1);
+}
+
+// Mono packet with frame-count code 1 (TOC 0x78 | 0x1): two equally sized
+// Opus frames of one byte each. Only the second frame (0x80) has its VAD flag
+// set, so this checks that every frame is inspected, not just the first.
+// The result is compared against 1 explicitly: EXPECT_TRUE would also accept
+// -1 ("could not be determined").
+TEST(OpusVadTest, TwoOpusMonoFramesVadOnSecond) {
+  uint8_t twoMonoFrames[] = {0x78 | 0x1, 0x00, 0x80};
+  EXPECT_EQ(WebRtcOpus_PacketHasVoiceActivity(twoMonoFrames, 3), 1);
+}
+
 }  // namespace webrtc