From f809b9b38d6c12ae0e545354ad03ebc8c3136c67 Mon Sep 17 00:00:00 2001
From: Zhongwei Yao <zhongwei.yao@chromium.org>
Date: Wed, 1 Apr 2015 17:43:00 +0800
Subject: [PATCH] Fix bug in WebRtcIsacfix_FilterMaLoopNeon.

Pass content_browsertests in Chromium. Performance test result (lower is
better):
C version: 100%
old intrinsics Neon version (with bug): 16.5%
new intrinsics Neon version: 18.0%
asm Neon version: 23.3%

BUG=4002
R=andrew@webrtc.org, jridges@masque.com

Change-Id: Ia0a96ac237216b635fc528f67d39319cdf246281

Review URL: https://webrtc-codereview.appspot.com/46739004

Cr-Commit-Position: refs/heads/master@{#8907}
---
 .../codecs/isac/fix/source/lattice_neon.c     | 40 ++++++++++++++-----
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c
index 6bdaf1dfa5..9218a3a0e8 100644
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c
@@ -17,7 +17,7 @@
 // filter routine for iSAC codec, optimized for ARM Neon platform.
 // It does:
 //  for 0 <= n < HALF_SUBFRAMELEN - 1:
-//    *ptr2 = input2 * (*ptr2) + input0 * (*ptr0));
+//    *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0));
 //    *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
 // Output is not bit-exact with the reference C code, due to the replacement
 // of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
@@ -41,6 +41,7 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
   int32x4_t ptr0va, ptr1va, ptr2va;
   int32x4_t ptr0vb, ptr1vb, ptr2vb;
 
+  int64x2_t tmp2al_low, tmp2al_high, tmp2bl_low, tmp2bl_high;
   // Unroll to process 8 samples at once.
   for (n = 0; n < loop; n++) {
     ptr0va = vld1q_s32(ptr0);
@@ -61,12 +62,25 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
     // Calculate tmp2 = tmp0 + *(ptr2).
     tmp2a = vaddq_s32(tmp0a, ptr2va);
     tmp2b = vaddq_s32(tmp0b, ptr2vb);
-    tmp2a = vshlq_n_s32(tmp2a, 15);
-    tmp2b = vshlq_n_s32(tmp2b, 15);
 
     // Calculate *ptr2 = input2 * tmp2.
-    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);
-    ptr2vb = vqrdmulhq_s32(tmp2b, input2_v);
+    tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
+#if defined(WEBRTC_ARCH_ARM64)
+    tmp2al_high = vmull_high_s32(tmp2a, input2_v);
+#else
+    tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
+#endif
+    ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
+                          vrshrn_n_s64(tmp2al_high, 16));
+
+    tmp2bl_low = vmull_s32(vget_low_s32(tmp2b), vget_low_s32(input2_v));
+#if defined(WEBRTC_ARCH_ARM64)
+    tmp2bl_high = vmull_high_s32(tmp2b, input2_v);
+#else
+    tmp2bl_high = vmull_s32(vget_high_s32(tmp2b), vget_high_s32(input2_v));
+#endif
+    ptr2vb = vcombine_s32(vrshrn_n_s64(tmp2bl_low, 16),
+                          vrshrn_n_s64(tmp2bl_high, 16));
 
     vst1q_s32(ptr2, ptr2va);
     vst1q_s32(ptr2 + 4, ptr2vb);
@@ -99,10 +113,17 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
 
     // Calculate tmp2 = tmp0 + *(ptr2).
     tmp2a = vaddq_s32(tmp0a, ptr2va);
-    tmp2a = vshlq_n_s32(tmp2a, 15);
 
     // Calculate *ptr2 = input2 * tmp2.
-    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);
+    tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
+
+#if defined(WEBRTC_ARCH_ARM64)
+    tmp2al_high = vmull_high_s32(tmp2a, input2_v);
+#else
+    tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
+#endif
+    ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
+                          vrshrn_n_s64(tmp2al_high, 16));
 
     vst1q_s32(ptr2, ptr2va);
     ptr2 += 4;
@@ -121,6 +142,7 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
   if (loop_tail & 0x2) {
     int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail;
     int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail;
+    int64x2_t tmp2l_tail;
     ptr0v_tail = vld1_s32(ptr0);
     ptr2v_tail = vld1_s32(ptr2);
     ptr0 += 2;
@@ -133,10 +155,10 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
 
     // Calculate tmp2 = tmp0 + *(ptr2).
     tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail);
-    tmp2_tail = vshl_n_s32(tmp2_tail, 15);
 
     // Calculate *ptr2 = input2 * tmp2.
-    ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v));
+    tmp2l_tail = vmull_s32(tmp2_tail, vget_low_s32(input2_v));
+    ptr2v_tail = vrshrn_n_s64(tmp2l_tail, 16);
 
     vst1_s32(ptr2, ptr2v_tail);
     ptr2 += 2;