From 4ffc7341ca8d75d5256dcc6a90e819a397268aa1 Mon Sep 17 00:00:00 2001
From: "andrew@webrtc.org" <andrew@webrtc.org>
Date: Mon, 10 Nov 2014 17:27:53 +0000
Subject: [PATCH] replace inline assembly WebRtcAecm_ResetAdaptiveChannelNeon 
 by intrinsics.

The modification only uses the unique part of the ResetAdaptiveChannel
 function. Pass byte to byte conformance test both on ARM32 and ARM64,
 and the single function performance is similar with original assembly
 version on different platforms. If not specified, the code is compiled
 by GCC 4.6. The result is the "X version / C version" ratio, and the
 less is better.

| run 100k times             | cortex-a7 | cortex-a9 | cortex-a15 |
| use C as the base on each  |  (1.2Ghz) |  (1.0Ghz) |   (1.7Ghz) |
| CPU target                 |           |           |            |
|----------------------------+-----------+-----------+------------|
| Neon asm                   |       15% |       30% |        12% |
| Neon inline                |       21% |       30% |        12% |
| Neon intrinsics (GCC 4.6)  |       19% |       32% |        12% |
| Neon intrinsics (GCC 4.8)  |       20% |       32% |        12% |
| Neon intrinsics (LLVM 3.4) |       19% |       30% |        12% |

BUG=3580
R=andrew@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/29019004

Patch from Zhongwei Yao <zhongwei.yao@arm.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7672 4adac7df-926f-26a2-2b94-8c16560cd09d
---
 .../audio_processing/aecm/aecm_core_neon.c    | 45 ++++++++++++-------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
index 0a0b768e45..7908ae52a7 100644
--- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
+++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c
@@ -322,24 +322,39 @@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,
 }
 
 void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
-  int i;
-
   assert((uintptr_t)(aecm->channelStored) % 16 == 0);
   assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
   assert((uintptr_t)(aecm->channelAdapt32) % 32 == 0);
 
-  for (i = 0; i < PART_LEN - 7; i += 8) {
-    // aecm->channelAdapt16[i] = aecm->channelStored[i];
-    // aecm->channelAdapt32[i] = (int32_t)aecm->channelStored[i] << 16;
-    __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : :
-            "r"(&aecm->channelStored[i]) : "q12");
-    __asm __volatile("vst1.16 {d24, d25}, [%0, :128]" : :
-            "r"(&aecm->channelAdapt16[i]) : "q12");
-    __asm __volatile("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10");
-    __asm __volatile("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11");
-    __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
-            "r"(&aecm->channelAdapt32[i]): "q10", "q11");
+  // The C code of following optimized code.
+  // for (i = 0; i < PART_LEN1; i++) {
+  //   aecm->channelAdapt16[i] = aecm->channelStored[i];
+  //   aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
+  //              (int32_t)aecm->channelStored[i], 16);
+  // }
+
+  int16_t* start_stored_p = aecm->channelStored;
+  int16_t* start_adapt16_p = aecm->channelAdapt16;
+  int32_t* start_adapt32_p = aecm->channelAdapt32;
+  const int16_t* end_stored_p = start_stored_p + PART_LEN;
+
+  int16x8_t stored_v;
+  int32x4_t adapt32_v_low, adapt32_v_high;
+
+  while (start_stored_p < end_stored_p) {
+    stored_v = vld1q_s16(start_stored_p);
+    vst1q_s16(start_adapt16_p, stored_v);
+
+    adapt32_v_low = vshll_n_s16(vget_low_s16(stored_v), 16);
+    adapt32_v_high = vshll_n_s16(vget_high_s16(stored_v), 16);
+
+    vst1q_s32(start_adapt32_p, adapt32_v_low);
+    vst1q_s32(start_adapt32_p + 4, adapt32_v_high);
+
+    start_stored_p += 8;
+    start_adapt16_p += 8;
+    start_adapt32_p += 8;
   }
-  aecm->channelAdapt16[i] = aecm->channelStored[i];
-  aecm->channelAdapt32[i] = (int32_t)aecm->channelStored[i] << 16;
+  aecm->channelAdapt16[PART_LEN] = aecm->channelStored[PART_LEN];
+  aecm->channelAdapt32[PART_LEN] = (int32_t)aecm->channelStored[PART_LEN] << 16;
 }