From 0e37b898f056f7d7a140636162d8e86e2222de98 Mon Sep 17 00:00:00 2001 From: "andrew@webrtc.org" Date: Tue, 11 Nov 2014 19:34:14 +0000 Subject: [PATCH] replace inline assembly WebRtcAecm_CalcLinearEnergiesNeon by intrinsics. The modification only uses the unique part of the CalcLinearEnergies function. Pass byte to byte conformance test both on ARMv7 and ARM64, and the single function performance is similar with original assembly version on different platforms. If not specified, the code is compiled by GCC 4.6. The result is the "X version / C version" ratio, and the less is better. | run 100k times | cortex-a7 | cortex-a9 | cortex-a15 | | use C as the base on each | (1.2Ghz) | (1.0Ghz) | (1.7Ghz) | | CPU target | | | | |----------------------------+-----------+-----------+------------| | Neon asm | 19.48% | 19.26% | 13.68% | | Neon inline | 27.90% | 28.87% | 17.79% | | Neon intrinsics (GCC 4.8) | 18.69% | 20.18% | 14.69% | | Neon intrinsics (LLVM 3.4) | 18.52% | 21.15% | 13.56% | BUG=3580 R=andrew@webrtc.org Review URL: https://webrtc-codereview.appspot.com/23349004 Patch from Zhongwei Yao . git-svn-id: http://webrtc.googlecode.com/svn/trunk@7686 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../audio_processing/aecm/aecm_core_neon.c | 113 ++++++++++-------- 1 file changed, 63 insertions(+), 50 deletions(-) diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c index 48d63f7d63..ff733b7f79 100644 --- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c +++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c @@ -227,72 +227,85 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, } } +static inline void AddLanes(uint32_t* ptr, uint32x4_t v) { +#if defined(__aarch64__) + *(ptr) = vaddvq_u32(v); +#else + uint32x2_t tmp_v; + tmp_v = vadd_u32(vget_low_u32(v), vget_high_u32(v)); + tmp_v = vpadd_u32(tmp_v, tmp_v); + *(ptr) = vget_lane_u32(tmp_v, 0); +#endif +} + void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, const uint16_t* far_spectrum, int32_t* echo_est, uint32_t* far_energy, uint32_t* echo_energy_adapt, uint32_t* echo_energy_stored) { - int i; + int16_t* start_stored_p = aecm->channelStored; + int16_t* start_adapt_p = aecm->channelAdapt16; + int32_t* echo_est_p = echo_est; + const int16_t* end_stored_p = aecm->channelStored + PART_LEN; + const uint16_t* far_spectrum_p = far_spectrum; + int16x8_t store_v, adapt_v, spectrum_v; + uint32x4_t echo_est_v_low, echo_est_v_high; + uint32x4_t far_energy_v, echo_energy_stored_v, echo_energy_adapt_v; - register uint32_t far_energy_r; - register uint32_t echo_energy_stored_r; - register uint32_t echo_energy_adapt_r; + far_energy_v = vdupq_n_u32(0); + echo_energy_adapt_v = vdupq_n_u32(0); + echo_energy_stored_v = vdupq_n_u32(0); - assert((uintptr_t)echo_est % 32 == 0); - assert((uintptr_t)(aecm->channelStored) % 16 == 0); - assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); - assert((uintptr_t)(aecm->channelStored) % 16 == 0); - assert((uintptr_t)(aecm->channelStored) % 16 == 0); + // Get energy for the delayed far end signal and estimated + // echo using both stored and adapted channels. + // The C code: + // for (i = 0; i < PART_LEN1; i++) { + // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + // far_spectrum[i]); + // (*far_energy) += (uint32_t)(far_spectrum[i]); + // *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i]; + // (*echo_energy_stored) += (uint32_t)echo_est[i]; + // } + while (start_stored_p < end_stored_p) { + spectrum_v = vld1q_u16(far_spectrum_p); + adapt_v = vld1q_s16(start_adapt_p); + store_v = vld1q_s16(start_stored_p); - __asm __volatile("vmov.i32 q14, #0" : : : "q14"); // far_energy - __asm __volatile("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored - __asm __volatile("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt + far_energy_v = vaddw_u16(far_energy_v, vget_low_s16(spectrum_v)); + far_energy_v = vaddw_u16(far_energy_v, vget_high_s16(spectrum_v)); - for (i = 0; i < PART_LEN - 7; i += 8) { - // far_energy += (uint32_t)(far_spectrum[i]); - __asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); - __asm __volatile("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); - __asm __volatile("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); + echo_est_v_low = vmull_u16(vget_low_s16(store_v), vget_low_s16(spectrum_v)); + echo_est_v_high = vmull_u16(vget_high_s16(store_v), + vget_high_s16(spectrum_v)); + vst1q_s32(echo_est_p, echo_est_v_low); + vst1q_s32(echo_est_p + 4, echo_est_v_high); - // Get estimated echo energies for adaptive channel and stored channel. - // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); - __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); - __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); - __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm __volatile("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]): - "q10", "q11"); + echo_energy_stored_v = vaddq_s32(echo_est_v_low, echo_energy_stored_v); + echo_energy_stored_v = vaddq_s32(echo_est_v_high, echo_energy_stored_v); - // echo_energy_stored += (uint32_t)echoEst[i]; - __asm __volatile("vadd.u32 q8, q10" : : : "q10", "q8"); - __asm __volatile("vadd.u32 q8, q11" : : : "q11", "q8"); + echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v, + vget_low_s16(adapt_v), + vget_low_s16(spectrum_v)); + echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v, + vget_high_s16(adapt_v), + vget_high_s16(spectrum_v)); - // echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i]; - __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); - __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); - __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm __volatile("vadd.u32 q9, q10" : : : "q9", "q15"); - __asm __volatile("vadd.u32 q9, q11" : : : "q9", "q11"); + start_stored_p += 8; + start_adapt_p += 8; + far_spectrum_p += 8; + echo_est_p += 8; } - __asm __volatile("vadd.u32 d28, d29" : : : "q14"); - __asm __volatile("vpadd.u32 d28, d28" : : : "q14"); - __asm __volatile("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14"); + AddLanes(far_energy, far_energy_v); + AddLanes(echo_energy_stored, echo_energy_stored_v); + AddLanes(echo_energy_adapt, echo_energy_adapt_v); - __asm __volatile("vadd.u32 d18, d19" : : : "q9"); - __asm __volatile("vpadd.u32 d18, d18" : : : "q9"); - __asm __volatile("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9"); - - __asm __volatile("vadd.u32 d16, d17" : : : "q8"); - __asm __volatile("vpadd.u32 d16, d16" : : : "q8"); - __asm __volatile("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); - - // Get estimated echo energies for adaptive channel and stored channel. - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); - *echo_energy_stored = echo_energy_stored_r + (uint32_t)echo_est[i]; - *far_energy = far_energy_r + (uint32_t)(far_spectrum[i]); - *echo_energy_adapt = echo_energy_adapt_r + - aecm->channelAdapt16[i] * far_spectrum[i]; + echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN], + far_spectrum[PART_LEN]); + *echo_energy_stored += (uint32_t)echo_est[PART_LEN]; + *far_energy += (uint32_t)far_spectrum[PART_LEN]; + *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN]; } void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm,