From 1751ee7d326677cb5843898228e9288c35f76682 Mon Sep 17 00:00:00 2001 From: "andrew@webrtc.org" Date: Tue, 2 Dec 2014 19:36:14 +0000 Subject: [PATCH] Remove -flax-vector-conversions flag for ARM NEON building. Pass compilation on both ARMv7 and ARM64. The generated binary (audioproc) is byte to byte (with symbol striped) same as before. The output of audioproc -aecm is also byte to byte same between C and NEON version on ARMv7 and ARM64. Change-Id: Ibdf40fe085f6bad1311f59bf9318bbcf37dd7ce5 BUG=3850 R=andrew@webrtc.org, jridges@masque.com Review URL: https://webrtc-codereview.appspot.com/30239004 Patch from Zhongwei Yao . git-svn-id: http://webrtc.googlecode.com/svn/trunk@7783 4adac7df-926f-26a2-2b94-8c16560cd09d --- webrtc/build/arm_neon.gypi | 3 - webrtc/common_audio/BUILD.gn | 1 - webrtc/modules/audio_coding/BUILD.gn | 2 - webrtc/modules/audio_processing/BUILD.gn | 5 +- .../audio_processing/aecm/aecm_core_neon.c | 57 ++++++++++--------- 5 files changed, 31 insertions(+), 37 deletions(-) diff --git a/webrtc/build/arm_neon.gypi b/webrtc/build/arm_neon.gypi index 037dd70fa4..9d8f71c8b1 100644 --- a/webrtc/build/arm_neon.gypi +++ b/webrtc/build/arm_neon.gypi @@ -23,9 +23,6 @@ 'cflags!': [ '-mfpu=vfpv3-d16', ], - 'cflags': [ - '-flax-vector-conversions', - ], 'conditions': [ # "-mfpu=neon" is not requried for arm64 in GCC. ['target_arch!="arm64"', { diff --git a/webrtc/common_audio/BUILD.gn b/webrtc/common_audio/BUILD.gn index ba1d17951d..5c14e51f1a 100644 --- a/webrtc/common_audio/BUILD.gn +++ b/webrtc/common_audio/BUILD.gn @@ -211,7 +211,6 @@ if (rtc_build_armv7_neon) { # Remove the -mfpu=vfpv3-d16 cflag. configs -= [ "//build/config/compiler:compiler_arm_fpu" ] cflags = [ - "-flax-vector-conversions", "-mfpu=neon", ] diff --git a/webrtc/modules/audio_coding/BUILD.gn b/webrtc/modules/audio_coding/BUILD.gn index 810fcf1933..0c087ecef3 100644 --- a/webrtc/modules/audio_coding/BUILD.gn +++ b/webrtc/modules/audio_coding/BUILD.gn @@ -501,7 +501,6 @@ source_set("isacfix") { # Remove the -mfpu=vfpv3-d16 cflag. configs -= [ "//build/config/compiler:compiler_arm_fpu" ] cflags = [ - "-flax-vector-conversions", "-mfpu=neon", ] @@ -572,7 +571,6 @@ if (rtc_build_armv7_neon) { # Remove the -mfpu=vfpv3-d16 cflag. configs -= [ "//build/config/compiler:compiler_arm_fpu" ] cflags = [ - "-flax-vector-conversions", "-mfpu=neon", ] diff --git a/webrtc/modules/audio_processing/BUILD.gn b/webrtc/modules/audio_processing/BUILD.gn index 3efc8ee789..fbc1e7a581 100644 --- a/webrtc/modules/audio_processing/BUILD.gn +++ b/webrtc/modules/audio_processing/BUILD.gn @@ -210,13 +210,10 @@ if (rtc_build_armv7_neon || cpu_arch == "arm64") { # //build/config/arm.gni instead, to reduce code duplication. # Remove the -mfpu=vfpv3-d16 cflag. configs -= [ "//build/config/compiler:compiler_arm_fpu" ] - cflags = [ - "-flax-vector-conversions", - ] # "-mfpu=neon" is not requried for arm64 in GCC. if (cpu_arch != "arm64") { - cflags += [ "-mfpu=neon" ] + cflags = [ "-mfpu=neon" ] } # Disable LTO in audio_processing_neon target due to compiler bug. diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c index 4efe5d84b5..1a0a6f54e4 100644 --- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.c +++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.c @@ -53,13 +53,14 @@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, int32_t* echo_est_p = echo_est; const int16_t* end_stored_p = aecm->channelStored + PART_LEN; const uint16_t* far_spectrum_p = far_spectrum; - int16x8_t store_v, adapt_v, spectrum_v; + int16x8_t store_v, adapt_v; + uint16x8_t spectrum_v; uint32x4_t echo_est_v_low, echo_est_v_high; - uint32x4_t far_energy_v, echo_energy_stored_v, echo_energy_adapt_v; + uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v; far_energy_v = vdupq_n_u32(0); - echo_energy_adapt_v = vdupq_n_u32(0); - echo_energy_stored_v = vdupq_n_u32(0); + echo_adapt_v = vdupq_n_u32(0); + echo_stored_v = vdupq_n_u32(0); // Get energy for the delayed far end signal and estimated // echo using both stored and adapted channels. @@ -76,24 +77,25 @@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, adapt_v = vld1q_s16(start_adapt_p); store_v = vld1q_s16(start_stored_p); - far_energy_v = vaddw_u16(far_energy_v, vget_low_s16(spectrum_v)); - far_energy_v = vaddw_u16(far_energy_v, vget_high_s16(spectrum_v)); + far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v)); + far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v)); - echo_est_v_low = vmull_u16(vget_low_s16(store_v), vget_low_s16(spectrum_v)); - echo_est_v_high = vmull_u16(vget_high_s16(store_v), - vget_high_s16(spectrum_v)); - vst1q_s32(echo_est_p, echo_est_v_low); - vst1q_s32(echo_est_p + 4, echo_est_v_high); + echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)), + vget_low_u16(spectrum_v)); + echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)), + vget_high_u16(spectrum_v)); + vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low)); + vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high)); - echo_energy_stored_v = vaddq_s32(echo_est_v_low, echo_energy_stored_v); - echo_energy_stored_v = vaddq_s32(echo_est_v_high, echo_energy_stored_v); + echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v); + echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v); - echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v, - vget_low_s16(adapt_v), - vget_low_s16(spectrum_v)); - echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v, - vget_high_s16(adapt_v), - vget_high_s16(spectrum_v)); + echo_adapt_v = vmlal_u16(echo_adapt_v, + vreinterpret_u16_s16(vget_low_s16(adapt_v)), + vget_low_u16(spectrum_v)); + echo_adapt_v = vmlal_u16(echo_adapt_v, + vreinterpret_u16_s16(vget_high_s16(adapt_v)), + vget_high_u16(spectrum_v)); start_stored_p += 8; start_adapt_p += 8; @@ -102,8 +104,8 @@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, } AddLanes(far_energy, far_energy_v); - AddLanes(echo_energy_stored, echo_energy_stored_v); - AddLanes(echo_energy_adapt, echo_energy_adapt_v); + AddLanes(echo_energy_stored, echo_stored_v); + AddLanes(echo_energy_adapt, echo_adapt_v); echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN], far_spectrum[PART_LEN]); @@ -143,8 +145,9 @@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm, const int16_t* end_stored_p = aecm->channelStored + PART_LEN; int32_t* echo_est_p = echo_est; - int16x8_t far_spectrum_v, adapt_v; - int32x4_t echo_est_v_low, echo_est_v_high; + uint16x8_t far_spectrum_v; + int16x8_t adapt_v; + uint32x4_t echo_est_v_low, echo_est_v_high; while (start_stored_p < end_stored_p) { far_spectrum_v = vld1q_u16(far_spectrum_p); @@ -153,12 +156,12 @@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm, vst1q_s16(start_stored_p, adapt_v); echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v), - vget_low_u16(adapt_v)); + vget_low_u16(vreinterpretq_u16_s16(adapt_v))); echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v), - vget_high_u16(adapt_v)); + vget_high_u16(vreinterpretq_u16_s16(adapt_v))); - vst1q_s32(echo_est_p, echo_est_v_low); - vst1q_s32(echo_est_p + 4, echo_est_v_high); + vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low)); + vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high)); far_spectrum_p += 8; start_adapt_p += 8;