From cb711f77d2ff9ebd42678869a73353809b3af66e Mon Sep 17 00:00:00 2001
From: "wu@webrtc.org" <wu@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d>
Date: Mon, 19 May 2014 17:39:11 +0000
Subject: [PATCH] Add interface to propagate audio capture timestamp to the
 renderer.

BUG=3111
R=andrew@webrtc.org, turaj@webrtc.org, xians@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/12239004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@6189 4adac7df-926f-26a2-2b94-8c16560cd09d
---
 .../app/webrtc/test/fakeaudiocapturemodule.cc | 11 ++++++
 .../test/fakeaudiocapturemodule_unittest.cc   | 10 ++++++
 .../audio_coding/main/acm2/acm_receiver.cc    |  6 ++++
 .../audio_device/audio_device_buffer.cc       |  7 ++--
 .../include/audio_device_defines.h            | 10 ++++--
 .../test/audio_device_test_api.cc             |  8 +++--
 .../audio_device/test/func_test_manager.cc    |  8 +++--
 .../audio_device/test/func_test_manager.h     |  8 +++--
 .../modules/interface/module_common_types.h   |  4 +++
 webrtc/test/fake_audio_device.cc              |  6 +++-
 webrtc/voice_engine/channel.cc                | 34 +++++++++++++++++--
 webrtc/voice_engine/channel.h                 |  9 +++++
 webrtc/voice_engine/include/voe_rtp_rtcp.h    |  3 ++
 webrtc/voice_engine/voe_base_impl.cc          | 20 ++++++++---
 webrtc/voice_engine/voe_base_impl.h           | 12 +++++--
 15 files changed, 133 insertions(+), 23 deletions(-)

diff --git a/talk/app/webrtc/test/fakeaudiocapturemodule.cc b/talk/app/webrtc/test/fakeaudiocapturemodule.cc
index 3b36163240..72d39c9714 100644
--- a/talk/app/webrtc/test/fakeaudiocapturemodule.cc
+++ b/talk/app/webrtc/test/fakeaudiocapturemodule.cc
@@ -728,11 +728,22 @@ void FakeAudioCaptureModule::ReceiveFrameP() {
     }
     ResetRecBuffer();
     uint32_t nSamplesOut = 0;
+#ifdef USE_WEBRTC_DEV_BRANCH
+    uint32_t rtp_timestamp = 0;
+    int64_t ntp_time_ms = 0;
+    if (audio_callback_->NeedMorePlayData(kNumberSamples, kNumberBytesPerSample,
+                                         kNumberOfChannels, kSamplesPerSecond,
+                                         rec_buffer_, nSamplesOut,
+                                         &rtp_timestamp, &ntp_time_ms) != 0) {
+      ASSERT(false);
+    }
+#else
     if (audio_callback_->NeedMorePlayData(kNumberSamples, kNumberBytesPerSample,
                                          kNumberOfChannels, kSamplesPerSecond,
                                          rec_buffer_, nSamplesOut) != 0) {
       ASSERT(false);
     }
+#endif
     ASSERT(nSamplesOut == kNumberSamples);
   }
   // The SetBuffer() function ensures that after decoding, the audio buffer
diff --git a/talk/app/webrtc/test/fakeaudiocapturemodule_unittest.cc b/talk/app/webrtc/test/fakeaudiocapturemodule_unittest.cc
index 5738955ec6..ea92f7b0d6 100644
--- a/talk/app/webrtc/test/fakeaudiocapturemodule_unittest.cc
+++ b/talk/app/webrtc/test/fakeaudiocapturemodule_unittest.cc
@@ -84,13 +84,23 @@ class FakeAdmTest : public testing::Test,
                                    const uint8_t nChannels,
                                    const uint32_t samplesPerSec,
                                    void* audioSamples,
+#ifdef USE_WEBRTC_DEV_BRANCH
+                                   uint32_t& nSamplesOut,
+                                   uint32_t* rtp_timestamp,
+                                   int64_t* ntp_time_ms) {
+#else
                                    uint32_t& nSamplesOut) {
+#endif
     ++pull_iterations_;
     const uint32_t audio_buffer_size = nSamples * nBytesPerSample;
     const uint32_t bytes_out = RecordedDataReceived() ?
         CopyFromRecBuffer(audioSamples, audio_buffer_size):
         GenerateZeroBuffer(audioSamples, audio_buffer_size);
     nSamplesOut = bytes_out / nBytesPerSample;
+#ifdef USE_WEBRTC_DEV_BRANCH
+    *rtp_timestamp = 0;
+    *ntp_time_ms = 0;
+#endif
     return 0;
   }
 
diff --git a/webrtc/modules/audio_coding/main/acm2/acm_receiver.cc b/webrtc/modules/audio_coding/main/acm2/acm_receiver.cc
index 7a6a5d0888..613491a052 100644
--- a/webrtc/modules/audio_coding/main/acm2/acm_receiver.cc
+++ b/webrtc/modules/audio_coding/main/acm2/acm_receiver.cc
@@ -473,6 +473,12 @@ int AcmReceiver::GetAudio(int desired_freq_hz, AudioFrame* audio_frame) {
   SetAudioFrameActivityAndType(vad_enabled_, type, audio_frame);
   previous_audio_activity_ = audio_frame->vad_activity_;
   call_stats_.DecodedByNetEq(audio_frame->speech_type_);
+
+  // Computes the RTP timestamp of the first sample in |audio_frame| from
+  // |PlayoutTimestamp|, which is the timestamp of the last sample of
+  // |audio_frame|.
+  audio_frame->timestamp_ =
+      PlayoutTimestamp() - audio_frame->samples_per_channel_;
   return 0;
 }
 
diff --git a/webrtc/modules/audio_device/audio_device_buffer.cc b/webrtc/modules/audio_device/audio_device_buffer.cc
index db5cc322f9..ed1bf2020b 100644
--- a/webrtc/modules/audio_device/audio_device_buffer.cc
+++ b/webrtc/modules/audio_device/audio_device_buffer.cc
@@ -548,13 +548,16 @@ int32_t AudioDeviceBuffer::RequestPlayoutData(uint32_t nSamples)
     if (_ptrCbAudioTransport)
     {
         uint32_t res(0);
-
+        uint32_t rtp_timestamp = 0;
+        int64_t ntp_time_ms = 0;
         res = _ptrCbAudioTransport->NeedMorePlayData(_playSamples,
                                                      playBytesPerSample,
                                                      playChannels,
                                                      playSampleRate,
                                                      &_playBuffer[0],
-                                                     nSamplesOut);
+                                                     nSamplesOut,
+                                                     &rtp_timestamp,
+                                                     &ntp_time_ms);
         if (res != 0)
         {
             WEBRTC_TRACE(kTraceError, kTraceAudioDevice, _id, "NeedMorePlayData() failed");
diff --git a/webrtc/modules/audio_device/include/audio_device_defines.h b/webrtc/modules/audio_device/include/audio_device_defines.h
index 0704ea8321..f65e3a8ec3 100644
--- a/webrtc/modules/audio_device/include/audio_device_defines.h
+++ b/webrtc/modules/audio_device/include/audio_device_defines.h
@@ -63,14 +63,16 @@ public:
                                             const int32_t clockDrift,
                                             const uint32_t currentMicLevel,
                                             const bool keyPressed,
-                                            uint32_t& newMicLevel) = 0;   
+                                            uint32_t& newMicLevel) = 0;
 
     virtual int32_t NeedMorePlayData(const uint32_t nSamples,
                                      const uint8_t nBytesPerSample,
                                      const uint8_t nChannels,
                                      const uint32_t samplesPerSec,
                                      void* audioSamples,
-                                     uint32_t& nSamplesOut) = 0;
+                                     uint32_t& nSamplesOut,
+                                     uint32_t* rtp_timestamp,
+                                     int64_t* ntp_time_ms) = 0;
 
     // Method to pass captured data directly and unmixed to network channels.
     // |channel_ids| contains a list of VoE channels which are the
@@ -125,7 +127,9 @@ public:
     // channel.
     virtual void PullRenderData(int bits_per_sample, int sample_rate,
                                 int number_of_channels, int number_of_frames,
-                                void* audio_data) {}
+                                void* audio_data,
+                                uint32_t* rtp_timestamp,
+                                int64_t* ntp_time_ms) {}
 
 protected:
     virtual ~AudioTransport() {}
diff --git a/webrtc/modules/audio_device/test/audio_device_test_api.cc b/webrtc/modules/audio_device/test/audio_device_test_api.cc
index 2749e8349f..b10accb753 100644
--- a/webrtc/modules/audio_device/test/audio_device_test_api.cc
+++ b/webrtc/modules/audio_device/test/audio_device_test_api.cc
@@ -116,7 +116,9 @@ class AudioTransportAPI: public AudioTransport {
       const uint8_t nChannels,
       const uint32_t sampleRate,
       void* audioSamples,
-      uint32_t& nSamplesOut) {
+      uint32_t& nSamplesOut,
+      uint32_t* rtp_timestamp,
+      int64_t* ntp_time_ms) {
     play_count_++;
     if (play_count_ % 100 == 0) {
       if (nChannels == 1) {
@@ -149,7 +151,9 @@ class AudioTransportAPI: public AudioTransport {
 
   virtual void PullRenderData(int bits_per_sample, int sample_rate,
                               int number_of_channels, int number_of_frames,
-                              void* audio_data) {}
+                              void* audio_data,
+                              uint32_t* rtp_timestamp,
+                              int64_t* ntp_time_ms) {}
  private:
   uint32_t rec_count_;
   uint32_t play_count_;
diff --git a/webrtc/modules/audio_device/test/func_test_manager.cc b/webrtc/modules/audio_device/test/func_test_manager.cc
index 9f80282dd1..a51ebfba2c 100644
--- a/webrtc/modules/audio_device/test/func_test_manager.cc
+++ b/webrtc/modules/audio_device/test/func_test_manager.cc
@@ -292,7 +292,9 @@ int32_t AudioTransportImpl::NeedMorePlayData(
     const uint8_t nChannels,
     const uint32_t samplesPerSec,
     void* audioSamples,
-    uint32_t& nSamplesOut)
+    uint32_t& nSamplesOut,
+    uint32_t* rtp_timestamp,
+    int64_t* ntp_time_ms)
 {
     if (_fullDuplex)
     {
@@ -551,7 +553,9 @@ void AudioTransportImpl::PushCaptureData(int voe_channel,
 void AudioTransportImpl::PullRenderData(int bits_per_sample, int sample_rate,
                                         int number_of_channels,
                                         int number_of_frames,
-                                        void* audio_data) {}
+                                        void* audio_data,
+                                        uint32_t* rtp_timestamp,
+                                        int64_t* ntp_time_ms) {}
 
 FuncTestManager::FuncTestManager() :
     _processThread(NULL),
diff --git a/webrtc/modules/audio_device/test/func_test_manager.h b/webrtc/modules/audio_device/test/func_test_manager.h
index bd32f627ae..1a1c2a5a4f 100644
--- a/webrtc/modules/audio_device/test/func_test_manager.h
+++ b/webrtc/modules/audio_device/test/func_test_manager.h
@@ -118,7 +118,9 @@ public:
                                      const uint8_t nChannels,
                                      const uint32_t samplesPerSec,
                                      void* audioSamples,
-                                     uint32_t& nSamplesOut);
+                                     uint32_t& nSamplesOut,
+                                     uint32_t* rtp_timestamp,
+                                     int64_t* ntp_time_ms);
 
     virtual int OnDataAvailable(const int voe_channels[],
                                 int number_of_voe_channels,
@@ -138,7 +140,9 @@ public:
 
     virtual void PullRenderData(int bits_per_sample, int sample_rate,
                                 int number_of_channels, int number_of_frames,
-                                void* audio_data);
+                                void* audio_data,
+                                uint32_t* rtp_timestamp,
+                                int64_t* ntp_time_ms);
 
     AudioTransportImpl(AudioDeviceModule* audioDevice);
     ~AudioTransportImpl();
diff --git a/webrtc/modules/interface/module_common_types.h b/webrtc/modules/interface/module_common_types.h
index d336ccf7cd..f9ba592ee5 100644
--- a/webrtc/modules/interface/module_common_types.h
+++ b/webrtc/modules/interface/module_common_types.h
@@ -684,7 +684,10 @@ class AudioFrame {
   AudioFrame& operator-=(const AudioFrame& rhs);
 
   int id_;
+  // RTP timestamp of the first sample in the AudioFrame.
   uint32_t timestamp_;
+  // Estimated capture time as NTP time in the local timebase, in milliseconds.
+  int64_t ntp_time_ms_;
   int16_t data_[kMaxDataSizeSamples];
   int samples_per_channel_;
   int sample_rate_hz_;
@@ -705,6 +708,7 @@ class AudioFrame {
 inline AudioFrame::AudioFrame()
     : id_(-1),
       timestamp_(0),
+      ntp_time_ms_(0),
       data_(),
       samples_per_channel_(0),
       sample_rate_hz_(0),
diff --git a/webrtc/test/fake_audio_device.cc b/webrtc/test/fake_audio_device.cc
index a6fe165b22..d3421ebd64 100644
--- a/webrtc/test/fake_audio_device.cc
+++ b/webrtc/test/fake_audio_device.cc
@@ -121,13 +121,17 @@ void FakeAudioDevice::CaptureAudio() {
         samples_needed = std::min(kFrequencyHz / time_since_last_playout_ms,
                                   kBufferSizeBytes / 2);
       uint32_t samples_out = 0;
+      uint32_t rtp_timestamp = 0;
+      int64_t ntp_time_ms = 0;
       EXPECT_EQ(0,
                 audio_callback_->NeedMorePlayData(samples_needed,
                                                   2,
                                                   1,
                                                   kFrequencyHz,
                                                   playout_buffer_,
-                                                  samples_out));
+                                                  samples_out,
+                                                  &rtp_timestamp,
+                                                  &ntp_time_ms));
     }
   }
   tick_->Wait(WEBRTC_EVENT_INFINITE);
diff --git a/webrtc/voice_engine/channel.cc b/webrtc/voice_engine/channel.cc
index f919c3d09f..365d4cadeb 100644
--- a/webrtc/voice_engine/channel.cc
+++ b/webrtc/voice_engine/channel.cc
@@ -664,6 +664,25 @@ int32_t Channel::GetAudioFrame(int32_t id, AudioFrame& audioFrame)
     // Measure audio level (0-9)
     _outputAudioLevel.ComputeLevel(audioFrame);
 
+    // TODO(wu): Calculate capture NTP time based on RTP timestamp and RTCP SR.
+    audioFrame.ntp_time_ms_ = 0;
+
+    if (!first_frame_arrived_) {
+      // Remember the RTP timestamp of the first frame as the reference point.
+      first_frame_arrived_ = true;
+      capture_start_rtp_time_stamp_ = audioFrame.timestamp_;
+    } else if (audioFrame.ntp_time_ms_ > 0) {
+      // |ntp_time_ms_| won't be valid until at least 2 RTCP SRs are received.
+      // Elapsed ms = RTP ticks since the first frame / RTP ticks per ms
+      // (assumes the RTP clock rate equals |sample_rate_hz_|), chosen so that
+      // |capture_start_ntp_time_ms_| + |elapsed_time_ms| == |ntp_time_ms_|.
+      CriticalSectionScoped lock(ts_stats_lock_.get());
+      const uint32_t elapsed_time_ms =
+          (audioFrame.timestamp_ - capture_start_rtp_time_stamp_) /
+          (audioFrame.sample_rate_hz_ / 1000);
+      capture_start_ntp_time_ms_ = audioFrame.ntp_time_ms_ - elapsed_time_ms;
+    }
+
     return 0;
 }
 
@@ -836,6 +855,10 @@ Channel::Channel(int32_t channelId,
     playout_delay_ms_(0),
     _numberOfDiscardedPackets(0),
     send_sequence_number_(0),
+    ts_stats_lock_(CriticalSectionWrapper::CreateCriticalSection()),
+    first_frame_arrived_(false),
+    capture_start_rtp_time_stamp_(0),
+    capture_start_ntp_time_ms_(-1),
     _engineStatisticsPtr(NULL),
     _outputMixerPtr(NULL),
     _transmitMixerPtr(NULL),
@@ -3371,7 +3394,7 @@ int Channel::GetRemoteRTCPReportBlocks(
 int
 Channel::GetRTPStatistics(CallStatistics& stats)
 {
-    // --- Part one of the final structure (four values)
+    // --- RtcpStatistics
 
     // The jitter statistics is updated for each received RTP packet and is
     // based on received packets.
@@ -3398,7 +3421,7 @@ Channel::GetRTPStatistics(CallStatistics& stats)
                  stats.fractionLost, stats.cumulativeLost, stats.extendedMax,
                  stats.jitterSamples);
 
-    // --- Part two of the final structure (one value)
+    // --- RTT
 
     uint16_t RTT(0);
     RTCPMethod method = _rtpRtcpModule->RTCP();
@@ -3441,7 +3464,7 @@ Channel::GetRTPStatistics(CallStatistics& stats)
                  VoEId(_instanceId, _channelId),
                  "GetRTPStatistics() => rttMs=%d", stats.rttMs);
 
-    // --- Part three of the final structure (four values)
+    // --- Data counters
 
     uint32_t bytesSent(0);
     uint32_t packetsSent(0);
@@ -3473,6 +3496,11 @@ Channel::GetRTPStatistics(CallStatistics& stats)
                  stats.bytesSent, stats.packetsSent, stats.bytesReceived,
                  stats.packetsReceived);
 
+    // --- Timestamps
+    {
+      CriticalSectionScoped lock(ts_stats_lock_.get());
+      stats.capture_start_ntp_time_ms_ = capture_start_ntp_time_ms_;
+    }
     return 0;
 }
 
diff --git a/webrtc/voice_engine/channel.h b/webrtc/voice_engine/channel.h
index ed03519fc3..7b40ed282c 100644
--- a/webrtc/voice_engine/channel.h
+++ b/webrtc/voice_engine/channel.h
@@ -540,6 +540,15 @@ private:
     uint16_t send_sequence_number_;
     uint8_t restored_packet_[kVoiceEngineMaxIpPacketSizeBytes];
 
+    scoped_ptr<CriticalSectionWrapper> ts_stats_lock_;
+
+    bool first_frame_arrived_;
+    // The rtp timestamp of the first played out audio frame.
+    uint32_t capture_start_rtp_time_stamp_;
+    // The capture ntp time (in local timebase) of the first played out audio
+    // frame.
+    int64_t capture_start_ntp_time_ms_;
+
     // uses
     Statistics* _engineStatisticsPtr;
     OutputMixer* _outputMixerPtr;
diff --git a/webrtc/voice_engine/include/voe_rtp_rtcp.h b/webrtc/voice_engine/include/voe_rtp_rtcp.h
index f3a6313116..2fb09cc7f0 100644
--- a/webrtc/voice_engine/include/voe_rtp_rtcp.h
+++ b/webrtc/voice_engine/include/voe_rtp_rtcp.h
@@ -86,6 +86,9 @@ struct CallStatistics
     int packetsSent;
     int bytesReceived;
     int packetsReceived;
+    // The capture ntp time (in local timebase) of the first played out audio
+    // frame.
+    int64_t capture_start_ntp_time_ms_;
 };
 
 // See section 6.4.1 in http://www.ietf.org/rfc/rfc3550.txt for details.
diff --git a/webrtc/voice_engine/voe_base_impl.cc b/webrtc/voice_engine/voe_base_impl.cc
index 1b4b867662..cfedd40563 100644
--- a/webrtc/voice_engine/voe_base_impl.cc
+++ b/webrtc/voice_engine/voe_base_impl.cc
@@ -148,7 +148,9 @@ int32_t VoEBaseImpl::NeedMorePlayData(
         uint8_t nChannels,
         uint32_t samplesPerSec,
         void* audioSamples,
-        uint32_t& nSamplesOut)
+        uint32_t& nSamplesOut,
+        uint32_t* rtp_timestamp,
+        int64_t* ntp_time_ms)
 {
   WEBRTC_TRACE(kTraceStream, kTraceVoice, VoEId(_shared->instance_id(), -1),
                "VoEBaseImpl::NeedMorePlayData(nSamples=%u, "
@@ -157,7 +159,8 @@ int32_t VoEBaseImpl::NeedMorePlayData(
 
   GetPlayoutData(static_cast<int>(samplesPerSec),
                  static_cast<int>(nChannels),
-                 static_cast<int>(nSamples), true, audioSamples);
+                 static_cast<int>(nSamples), true, audioSamples,
+                 rtp_timestamp, ntp_time_ms);
 
   nSamplesOut = _audioFrame.samples_per_channel_;
 
@@ -233,12 +236,14 @@ void VoEBaseImpl::PushCaptureData(int voe_channel, const void* audio_data,
 
 void VoEBaseImpl::PullRenderData(int bits_per_sample, int sample_rate,
                                  int number_of_channels, int number_of_frames,
-                                 void* audio_data) {
+                                 void* audio_data,
+                                 uint32_t* rtp_timestamp,
+                                 int64_t* ntp_time_ms) {
   assert(bits_per_sample == 16);
   assert(number_of_frames == static_cast<int>(sample_rate / 100));
 
   GetPlayoutData(sample_rate, number_of_channels, number_of_frames, false,
-                 audio_data);
+                 audio_data, rtp_timestamp, ntp_time_ms);
 }
 
 int VoEBaseImpl::RegisterVoiceEngineObserver(VoiceEngineObserver& observer)
@@ -1081,7 +1086,9 @@ int VoEBaseImpl::ProcessRecordedDataWithAPM(
 
 void VoEBaseImpl::GetPlayoutData(int sample_rate, int number_of_channels,
                                  int number_of_frames, bool feed_data_to_apm,
-                                 void* audio_data) {
+                                 void* audio_data,
+                                 uint32_t* rtp_timestamp,
+                                 int64_t* ntp_time_ms) {
   assert(_shared->output_mixer() != NULL);
 
   // TODO(andrew): if the device is running in mono, we should tell the mixer
@@ -1102,6 +1109,9 @@ void VoEBaseImpl::GetPlayoutData(int sample_rate, int number_of_channels,
   // Deliver audio (PCM) samples to the ADM
   memcpy(audio_data, _audioFrame.data_,
          sizeof(int16_t) * number_of_frames * number_of_channels);
+
+  *rtp_timestamp = _audioFrame.timestamp_;
+  *ntp_time_ms = _audioFrame.ntp_time_ms_;
 }
 
 }  // namespace webrtc
diff --git a/webrtc/voice_engine/voe_base_impl.h b/webrtc/voice_engine/voe_base_impl.h
index 96dc225aa9..fbcb4dd857 100644
--- a/webrtc/voice_engine/voe_base_impl.h
+++ b/webrtc/voice_engine/voe_base_impl.h
@@ -79,7 +79,9 @@ public:
                                      uint8_t nChannels,
                                      uint32_t samplesPerSec,
                                      void* audioSamples,
-                                     uint32_t& nSamplesOut);
+                                     uint32_t& nSamplesOut,
+                                     uint32_t* rtp_timestamp,
+                                     int64_t* ntp_time_ms);
 
     virtual int OnDataAvailable(const int voe_channels[],
                                 int number_of_voe_channels,
@@ -102,7 +104,9 @@ public:
 
     virtual void PullRenderData(int bits_per_sample, int sample_rate,
                                 int number_of_channels, int number_of_frames,
-                                void* audio_data);
+                                void* audio_data,
+                                uint32_t* rtp_timestamp,
+                                int64_t* ntp_time_ms);
 
     // AudioDeviceObserver
     virtual void OnErrorIsReported(ErrorCode error);
@@ -138,7 +142,9 @@ private:
 
     void GetPlayoutData(int sample_rate, int number_of_channels,
                         int number_of_frames, bool feed_data_to_apm,
-                        void* audio_data);
+                        void* audio_data,
+                        uint32_t* rtp_timestamp,
+                        int64_t* ntp_time_ms);
 
     int32_t AddBuildInfo(char* str) const;
     int32_t AddVoEVersion(char* str) const;