From b81e304ac0ec3745d2f3cb522678eaf2bf73f17f Mon Sep 17 00:00:00 2001
From: "andrew@webrtc.org" <andrew@webrtc.org>
Date: Mon, 3 Nov 2014 17:17:51 +0000
Subject: [PATCH] replace inline assembly WebRtcNsx_AnalysisUpdate by
 intrinsics.

The modification only uses the unique part of the analysis_update
 function. Pass byte to byte conformance test on both ARMv7 and AArch64,
 and the single function performance is similar with original assembly
 version on different platforms. If not specified, the code is compiled
 by GCC 4.6. The result is the "X version / C version" ratio, and the
 less is better.

| run 100k times             | cortex-a7 | cortex-a9 | cortex-a15 |
| use C as the base on each  |  (1.2Ghz) |  (1.0Ghz) |   (1.7Ghz) |
| CPU target                 |           |           |            |
|----------------------------+-----------+-----------+------------|
| Neon asm                   |    15.61% |    20.15% |     14.89% |
| Neon inline asm (LLVM 3.4) |    25.98% |    33.96% |     18.18% |
| Neon intrinsics (GCC 4.6)  |    22.06% |    27.01% |     19.24% |
| Neon intrinsics (GCC 4.8)  |    17.28% |    18.23% |     18.55% |
| Neon intrinsics (LLVM 3.4) |    21.02% |    19.98% |     16.76% |

BUG=
R=andrew@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/28849004

Patch from Zhongwei Yao <zhongwei.yao@arm.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7596 4adac7df-926f-26a2-2b94-8c16560cd09d
---
 .../audio_processing/ns/nsx_core_neon.c       | 99 +++++++++----------
 1 file changed, 44 insertions(+), 55 deletions(-)

diff --git a/webrtc/modules/audio_processing/ns/nsx_core_neon.c b/webrtc/modules/audio_processing/ns/nsx_core_neon.c
index 52f35cc708..93099dbf20 100644
--- a/webrtc/modules/audio_processing/ns/nsx_core_neon.c
+++ b/webrtc/modules/audio_processing/ns/nsx_core_neon.c
@@ -582,75 +582,64 @@ void WebRtcNsx_SynthesisUpdateNeon(NsxInst_t* inst,
 void WebRtcNsx_AnalysisUpdateNeon(NsxInst_t* inst,
                                   int16_t* out,
                                   int16_t* new_speech) {
-
-  int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms];
-  int16_t* ptr_out = &inst->analysisBuffer[0];
+  assert(inst->blockLen10ms % 16 == 0);
+  assert(inst->anaLen % 16 == 0);
 
   // For lower band update analysis buffer.
   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
   //                      inst->analysisBuffer + inst->blockLen10ms,
   //                      inst->anaLen - inst->blockLen10ms);
-  for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) {
-    // Loop unrolled once, so both pointers are incremented by 8 twice.
-    __asm__ __volatile__(
-      "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
-      "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
-      "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
-      "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
-      :[ptr_ana]"+r"(ptr_ana),
-       [ptr_out]"+r"(ptr_out)
-      :
-      :"d20", "d21", "d22", "d23"
-    );
+  int16_t* p_start_src = inst->analysisBuffer + inst->blockLen10ms;
+  int16_t* p_end_src = inst->analysisBuffer + inst->anaLen;
+  int16_t* p_start_dst = inst->analysisBuffer;
+  while (p_start_src < p_end_src) {
+    int16x8_t frame = vld1q_s16(p_start_src);
+    vst1q_s16(p_start_dst, frame);
+
+    p_start_src += 8;
+    p_start_dst += 8;
   }
 
   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
   //    + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
-  for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) {
-    // Loop unrolled once, so both pointers are incremented by 8 twice.
-    __asm__ __volatile__(
-      "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
-      "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
-      "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
-      "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
-      :[ptr_ana]"+r"(ptr_ana),
-       [ptr_out]"+r"(ptr_out)
-      :
-      :"d20", "d21", "d22", "d23"
-    );
+  p_start_src = new_speech;
+  p_end_src = new_speech + inst->blockLen10ms;
+  p_start_dst = inst->analysisBuffer + inst->anaLen - inst->blockLen10ms;
+  while (p_start_src < p_end_src) {
+    int16x8_t frame = vld1q_s16(p_start_src);
+    vst1q_s16(p_start_dst, frame);
+
+    p_start_src += 8;
+    p_start_dst += 8;
   }
 
-  // Window data before FFT
-  const int16_t* ptr_window = &inst->window[0];
-  ptr_out = &out[0];
-  ptr_ana = &inst->analysisBuffer[0];
-  for (; ptr_out < &out[inst->anaLen];) {
+  // Window data before FFT.
+  int16_t* p_start_window = (int16_t*) inst->window;
+  int16_t* p_start_buffer = inst->analysisBuffer;
+  int16_t* p_start_out = out;
+  const int16_t* p_end_out = out + inst->anaLen;
 
-    // Loop unrolled once, so all pointers are incremented by 4 twice.
-    __asm__ __volatile__(
-      "vld1.16 d20, [%[ptr_ana]]!\n\t"
-      "vld1.16 d21, [%[ptr_window]]!\n\t"
-      // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
-      //           inst->window[i], inst->analysisBuffer[i], 14); // Q0
-      "vmull.s16 q10, d20, d21\n\t"
-      "vrshrn.i32 d20, q10, #14\n\t"
-      "vst1.16 d20, [%[ptr_out]]!\n\t"
+  // Load the first element to reduce pipeline bubble.
+  int16x8_t window = vld1q_s16(p_start_window);
+  int16x8_t buffer = vld1q_s16(p_start_buffer);
+  p_start_window += 8;
+  p_start_buffer += 8;
 
-      "vld1.16 d22, [%[ptr_ana]]!\n\t"
-      "vld1.16 d23, [%[ptr_window]]!\n\t"
-      // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
-      //           inst->window[i], inst->analysisBuffer[i], 14); // Q0
-      "vmull.s16 q11, d22, d23\n\t"
-      "vrshrn.i32 d22, q11, #14\n\t"
-      "vst1.16 d22, [%[ptr_out]]!\n\t"
+  while (p_start_out < p_end_out) {
+    // Unroll loop.
+    int32x4_t tmp32_low = vmull_s16(vget_low_s16(window), vget_low_s16(buffer));
+    int32x4_t tmp32_high = vmull_s16(vget_high_s16(window),
+                                     vget_high_s16(buffer));
+    window = vld1q_s16(p_start_window);
+    buffer = vld1q_s16(p_start_buffer);
 
-      // Specify constraints.
-      :[ptr_ana]"+r"(ptr_ana),
-       [ptr_window]"+r"(ptr_window),
-       [ptr_out]"+r"(ptr_out)
-      :
-      :"d20", "d21", "d22", "d23", "q10", "q11"
-    );
+    int16x4_t result_low = vrshrn_n_s32(tmp32_low, 14);
+    int16x4_t result_high = vrshrn_n_s32(tmp32_high, 14);
+    vst1q_s16(p_start_out, vcombine_s16(result_low, result_high));
+
+    p_start_buffer += 8;
+    p_start_window += 8;
+    p_start_out += 8;
   }
 }