From 12454028bce5af5ea3ed23df5b336db7e21dbf3e Mon Sep 17 00:00:00 2001 From: "kma@webrtc.org" Date: Wed, 7 Nov 2012 22:34:31 +0000 Subject: [PATCH] Fixed and enabled ARM assembly code in AECM and NS. Review URL: https://webrtc-codereview.appspot.com/860005 git-svn-id: http://webrtc.googlecode.com/svn/trunk@3060 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../modules/audio_processing/aecm/Android.mk | 17 ++- .../modules/audio_processing/aecm/aecm_core.h | 5 +- .../audio_processing/aecm/aecm_core_neon.S | 105 ++++++++---------- .../aecm/aecm_core_neon_offsets.c | 2 +- .../audio_processing/audio_processing.gypi | 57 +++++++--- webrtc/modules/audio_processing/ns/Android.mk | 18 +-- webrtc/modules/audio_processing/ns/nsx_core.h | 5 +- .../audio_processing/ns/nsx_core_neon.S | 3 +- 8 files changed, 113 insertions(+), 99 deletions(-) diff --git a/webrtc/modules/audio_processing/aecm/Android.mk b/webrtc/modules/audio_processing/aecm/Android.mk index d4a882cc40..d74ca5b629 100644 --- a/webrtc/modules/audio_processing/aecm/Android.mk +++ b/webrtc/modules/audio_processing/aecm/Android.mk @@ -56,20 +56,18 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := libwebrtc_aecm_neon LOCAL_MODULE_TAGS := optional -GEN := $(LOCAL_PATH)/aecm_core_neon_offsets.h +AECM_ASM_HEADER := $(intermediates)/aecm_core_neon_offsets.h +AECM_ASM_HEADER_DIR := $(intermediates) # Generate a header file aecm_core_neon_offsets.h which will be included in # assembly file aecm_core_neon.S, from file aecm_core_neon_offsets.c. 
-$(GEN): $(LOCAL_PATH)/../../../build/generate_asm_header.py \ - $(intermediates)/aecm_core_neon_offsets.S - @python $^ $@ offset_aecm_ - -$(intermediates)/aecm_core_neon_offsets.S: \ +$(AECM_ASM_HEADER): $(LOCAL_PATH)/../../../build/generate_asm_header.py \ $(LOCAL_PATH)/aecm_core_neon_offsets.c - @$(TARGET_CC) $(addprefix -I, $(LOCAL_INCLUDES)) $(addprefix -isystem ,\ - $(TARGET_C_INCLUDES)) -S -o $@ $^ + @python $^ --compiler=$(TARGET_CC) --options="$(addprefix -I, \ + $(LOCAL_INCLUDES)) $(addprefix -isystem , $(TARGET_C_INCLUDES)) -S" \ + --dir=$(AECM_ASM_HEADER_DIR) -LOCAL_GENERATED_SOURCES := $(GEN) +LOCAL_GENERATED_SOURCES := $(AECM_ASM_HEADER) LOCAL_SRC_FILES := aecm_core_neon.S # Flags passed to both C and C++ files. @@ -80,6 +78,7 @@ LOCAL_CFLAGS := \ -flax-vector-conversions LOCAL_C_INCLUDES := \ + $(AECM_ASM_HEADER_DIR) \ $(LOCAL_PATH)/include \ $(LOCAL_PATH)/../../.. \ $(LOCAL_PATH)/../../../common_audio/signal_processing/include diff --git a/webrtc/modules/audio_processing/aecm/aecm_core.h b/webrtc/modules/audio_processing/aecm/aecm_core.h index c4de06789c..a0fac5dd18 100644 --- a/webrtc/modules/audio_processing/aecm/aecm_core.h +++ b/webrtc/modules/audio_processing/aecm/aecm_core.h @@ -13,10 +13,9 @@ #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_ #define WEBRTC_MODULES_AUDIO_PROCESSING_AECM_AECM_CORE_H_ +#include "common_audio/signal_processing/include/signal_processing_library.h" +#include "modules/audio_processing/aecm/aecm_defines.h" #include "typedefs.h" -#include "signal_processing_library.h" - -#include "aecm_defines.h" #ifdef _MSC_VER // visual c++ #define ALIGN8_BEG __declspec(align(8)) diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon.S b/webrtc/modules/audio_processing/aecm/aecm_core_neon.S index 0708c5fd9a..412c173f6a 100644 --- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.S +++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.S @@ -26,66 +26,64 @@ .global WebRtcAecm_StoreAdaptiveChannelNeon .global 
WebRtcAecm_ResetAdaptiveChannelNeon -@ void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft, +@ void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm, +@ WebRtc_Word16* fft, @ const WebRtc_Word16* time_signal, @ complex16_t* freq_signal, @ int time_signal_scaling); .align 2 WebRtcAecm_WindowAndFFTNeon: .fnstart -.save {r4, r5, lr} - push {r4, r5, lr} +.save {r4, r5, r6, lr} + push {r4, r5, r6, lr} - vdup.16 d16, r3 - mov r5, r2 @ WebRtcSpl_ComplexIFFT changes r2. + ldr r12, [sp, #16] @ time_signal_scaling + vdup.16 d16, r12 vmov.i16 d21, #0 @ For imaginary parts of |fft|. vmov.i16 d27, #0 @ For imaginary parts of |fft|. - ldr r2, =WebRtcAecm_kSqrtHanning + ldr r5, =WebRtcAecm_kSqrtHanning adr lr, kSqrtHanningReversed - add r4, r0, #(PART_LEN2 * 2) @ &fft[PART_LEN2] - add r12, r1, #(PART_LEN * 2) @ time_signal[PART_LEN] - mov r3, #(PART_LEN / 4) @ Loop counter, unrolled by 4 + add r4, r1, #(PART_LEN2 * 2) @ &fft[PART_LEN2] + add r12, r2, #(PART_LEN * 2) @ time_signal[PART_LEN] + mov r6, #(PART_LEN / 4) @ Loop counter, unrolled by 4 LOOP_PART_LEN: - vld1.16 d0, [r1, :64]! @ time_signal[i] + vld1.16 d0, [r2, :64]! @ time_signal[i] vld1.16 d22, [r12, :64]! @ time_signal[i + PART_LEN] - vld1.16 d17, [r2, :64]! @ WebRtcAecm_kSqrtHanning[i] + vld1.16 d17, [r5, :64]! @ WebRtcAecm_kSqrtHanning[i] vld1.16 d23, [lr, :64]! @ kSqrtHanningReversed[i] vshl.s16 d18, d0, d16 vshl.s16 d22, d22, d16 vmull.s16 q9, d18, d17 vmull.s16 q12, d22, d23 - subs r3, #1 + subs r6, #1 vshrn.i32 d20, q9, #14 vshrn.i32 d26, q12, #14 - vst2.16 {d20, d21}, [r0, :128]! @ fft[j] + vst2.16 {d20, d21}, [r1, :128]! @ fft[j] vst2.16 {d26, d27}, [r4, :128]! @ fft[PART_LEN2 + j] bgt LOOP_PART_LEN - sub r4, r0, #(PART_LEN2 * 2) @ r4 points to fft[0] - mov r0, r4 - mov r1, #7 - bl WebRtcSpl_ComplexBitReverse + @ WebRtcSpl_RealForwardFFT(aecm->real_fft, fft, (int16_t*)freq_signal); + ldr r12, =offset_aecm_real_fft + sub r1, #(PART_LEN * 4) @ Get r1 back to &fft[0]. 
+ mov r2, r3 @ freq_signal + mov r4, r3 + ldr r0, [r0, r12] @ aecm->real_fft + bl WebRtcSpl_RealForwardFFTNeon - mov r0, r4 - mov r1, #7 - mov r2, #1 - bl WebRtcSpl_ComplexFFT - - mov r3, #(PART_LEN * 2 / 16) @ Loop counter, unrolled by 16. + mov r12, #(PART_LEN * 2 / 16) @ Loop counter, unrolled by 16. LOOP_PART_LEN2: - @ freq_signal[i].real = fft[j]; - @ freq_signal[i].imag = - fft[j+1]; - vld2.16 {d20, d21, d22, d23}, [r4, :256]! - subs r3, #1 + @ freq_signal[i].imag = - freq_signal[i].imag; + vld2.16 {d20, d21, d22, d23}, [r4, :256] + subs r12, #1 vneg.s16 d22, d22 vneg.s16 d23, d23 - vst2.16 {d20, d21, d22, d23}, [r5, :256]! + vst2.16 {d20, d21, d22, d23}, [r4, :256]! bgt LOOP_PART_LEN2 - pop {r4, r5, pc} + pop {r4, r5, r6, pc} .fnend @ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, @@ -123,29 +121,18 @@ LOOP_PRE_IFFT: @ fft[PART_LEN2] = efw[PART_LEN].real; @ fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; ldr r8, [r12] - ssub16 r2, r6, r8 - mov r1, #(PART_LEN2 * 2) - pkhbt r8, r8, r2 - str r8, [r4, r1] + ssub16 r12, r6, r8 + mov r3, #(PART_LEN2 * 2) + pkhbt r8, r8, r12 + str r8, [r4, r3] - mov r0, r4 - mov r1, #7 - bl WebRtcSpl_ComplexBitReverse - - mov r0, r4 - mov r1, #7 - mov r2, #1 - bl WebRtcSpl_ComplexIFFT - - mov r1, r4 - mov r2, r4 - mov r3, #(PART_LEN * 2 / 8) @ Loop counter, unrolled by 8. - -LOOP_GET_REAL_VALUES: - vld2.16 {q10, q11}, [r2, :256]! - subs r3, #1 - vst1.16 {q10}, [r1, :128]! - bgt LOOP_GET_REAL_VALUES + @ outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw); + ldr r12, =offset_aecm_real_fft + sub r1, #(PART_LEN * 4) @ Get r1 back to &fft[0]. + sub r2, #(PART_LEN * 4) @ Get r2 back to &efw[0]. + mov r4, r2 @ Keep efw in r4. 
+ ldr r0, [r0, r12] @ aecm->real_fft + bl WebRtcSpl_RealInverseFFTNeon ldr r6, =offset_aecm_outBuf ldr r12, =offset_aecm_dfaCleanQDomain @@ -156,24 +143,24 @@ LOOP_GET_REAL_VALUES: ldr r6, =WebRtcAecm_kSqrtHanning rsb r0, r2, r0 @ outCFFT - aecm->dfaCleanQDomain vdup.32 q9, r0 - add r0, r4, #(PART_LEN * 2) @ &fft[PART_LEN] - mov r3, #(PART_LEN / 4) @ Loop counter, unrolled by 4. + add r0, r4, #(PART_LEN * 4) @ &efw[PART_LEN] + mov r3, #(PART_LEN / 4) @ Loop counter, unrolled by 4 LOOP_POST_IFFT: - vld1.16 d16, [r4, :64] @ fft[i]; + vld2.16 {d4, d5}, [r4, :128] @ &efw[i]; vld1.16 d17, [r6, :64]! @ WebRtcAecm_kSqrtHanning[i] vld1.16 d20, [r8, :64] @ aecm->outBuf[i] - vmull.s16 q8, d16, d17 + vmull.s16 q8, d4, d17 vmovl.s16 q10, d20 vrshr.s32 q8, q8, #14 - vld1.16 d0, [r0, :64]! @ &fft[PART_LEN + i] + vld1.16 d0, [r0, :64]! @ &efw[PART_LEN + i] vshl.s32 q8, q8, q9 vld1.16 d1, [r12, :64]! @ kSqrtHanningReversed[i] vadd.i32 q8, q10 vmull.s16 q0, d0, d1 - vqshrn.s32 d16, q8, #0 + vqshrn.s32 d4, q8, #0 vshr.s32 q0, q0, #14 - vst1.16 d16, [r4, :64]! @ fft[i]; + vst2.16 {d4, d5}, [r4, :128]! @ &efw[i]; vshl.s32 q0, q0, q9 - vst1.16 d16, [r7, :64]! @ output[i] + vst1.16 d4, [r7, :64]! @ output[i] vqshrn.s32 d0, q0, #0 @@ -197,7 +184,7 @@ LOOP_COPY: vst1.16 {q12, q13}, [r1, :256]! bgt LOOP_COPY - ldr r2, [sp, #24] + ldr r2, [sp, #16] cmp r2, #0 @ Check if (nearendClean != NULL). 
beq END diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon_offsets.c b/webrtc/modules/audio_processing/aecm/aecm_core_neon_offsets.c index b614977871..b7bd48d10c 100644 --- a/webrtc/modules/audio_processing/aecm/aecm_core_neon_offsets.c +++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon_offsets.c @@ -23,4 +23,4 @@ int offset_aecm_dBufClean = offsetof(AecmCore_t, dBufClean); int offset_aecm_channelStored = offsetof(AecmCore_t, channelStored); int offset_aecm_channelAdapt16 = offsetof(AecmCore_t, channelAdapt16); int offset_aecm_channelAdapt32 = offsetof(AecmCore_t, channelAdapt32); - +int offset_aecm_real_fft = offsetof(AecmCore_t, real_fft); diff --git a/webrtc/modules/audio_processing/audio_processing.gypi b/webrtc/modules/audio_processing/audio_processing.gypi index 7a4ee9aae6..a9d8438e05 100644 --- a/webrtc/modules/audio_processing/audio_processing.gypi +++ b/webrtc/modules/audio_processing/audio_processing.gypi @@ -159,19 +159,50 @@ ], }], ['target_arch=="arm" and armv7==1', { - 'targets': [ - { - 'target_name': 'audio_processing_neon', - 'type': 'static_library', - 'includes': ['../../build/arm_neon.gypi',], - 'dependencies': [ - '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing', - ], - 'sources': [ - 'aecm/aecm_core_neon.c', - 'ns/nsx_core_neon.c', - ], - }, + 'targets': [{ + 'target_name': 'audio_processing_neon', + 'type': 'static_library', + 'includes': ['../../build/arm_neon.gypi',], + 'dependencies': [ + '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing', + ], + 'sources': [ + 'aecm/aecm_core_neon.c', + 'ns/nsx_core_neon.c', + ], + 'conditions': [ + ['OS=="android"', { + 'dependencies': [ + 'audio_processing_offsets', + ], + # TODO(kma): port this block from Android into other build systems. 
+ 'sources': [ + 'aecm/aecm_core_neon.S', + 'ns/nsx_core_neon.S', + ], + 'sources!': [ + 'aecm/aecm_core_neon.c', + 'ns/nsx_core_neon.c', + ], + 'includes!': ['../../build/arm_neon.gypi',], + }], + ], + }], + 'conditions': [ + ['OS=="android"', { + 'targets': [{ + 'target_name': 'audio_processing_offsets', + 'type': 'none', + 'sources': [ + 'aecm/aecm_core_neon_offsets.c', + 'ns/nsx_core_neon_offsets.c', + ], + 'variables': { + 'asm_header_dir': 'asm_offsets', + }, + 'includes': ['../../build/generate_asm_header.gypi',], + }], + }], ], }], ], diff --git a/webrtc/modules/audio_processing/ns/Android.mk b/webrtc/modules/audio_processing/ns/Android.mk index f083c50224..34c30e5eb1 100644 --- a/webrtc/modules/audio_processing/ns/Android.mk +++ b/webrtc/modules/audio_processing/ns/Android.mk @@ -57,19 +57,18 @@ LOCAL_ARM_MODE := arm LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_MODULE := libwebrtc_ns_neon LOCAL_MODULE_TAGS := optional -GEN := $(LOCAL_PATH)/nsx_core_neon_offsets.h +NS_ASM_HEADER := $(intermediates)/nsx_core_neon_offsets.h +NS_ASM_HEADER_DIR := $(intermediates) # Generate a header file nsx_core_neon_offsets.h which will be included in # assembly file nsx_core_neon.S, from file nsx_core_neon_offsets.c. -$(GEN): $(LOCAL_PATH)/../../../build/generate_asm_header.py \ - $(intermediates)/nsx_core_neon_offsets.S - @python $^ $@ offset_nsx_ +$(NS_ASM_HEADER): $(LOCAL_PATH)/../../../build/generate_asm_header.py \ + $(LOCAL_PATH)/nsx_core_neon_offsets.c + @python $^ --compiler=$(TARGET_CC) --options="$(addprefix -I, \ + $(LOCAL_INCLUDES)) $(addprefix -isystem , $(TARGET_C_INCLUDES)) -S" \ + --dir=$(NS_ASM_HEADER_DIR) -$(intermediates)/nsx_core_neon_offsets.S: $(LOCAL_PATH)/nsx_core_neon_offsets.c - @$(TARGET_CC) $(addprefix -I, $(LOCAL_INCLUDES)) $(addprefix -isystem ,\ - $(TARGET_C_INCLUDES)) -S -o $@ $^ - -LOCAL_GENERATED_SOURCES := $(GEN) +LOCAL_GENERATED_SOURCES := $(NS_ASM_HEADER) LOCAL_SRC_FILES := nsx_core_neon.S # Flags passed to both C and C++ files. 
@@ -80,6 +79,7 @@ LOCAL_CFLAGS := \ -flax-vector-conversions LOCAL_C_INCLUDES := \ + $(NS_ASM_HEADER_DIR) \ $(LOCAL_PATH)/include \ $(LOCAL_PATH)/../../.. \ $(LOCAL_PATH)/../../../common_audio/signal_processing/include diff --git a/webrtc/modules/audio_processing/ns/nsx_core.h b/webrtc/modules/audio_processing/ns/nsx_core.h index 503c6fb373..618193b281 100644 --- a/webrtc/modules/audio_processing/ns/nsx_core.h +++ b/webrtc/modules/audio_processing/ns/nsx_core.h @@ -11,10 +11,9 @@ #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_ #define WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_ +#include "common_audio/signal_processing/include/signal_processing_library.h" +#include "modules/audio_processing/ns/nsx_defines.h" #include "typedefs.h" -#include "signal_processing_library.h" - -#include "nsx_defines.h" #ifdef NS_FILEDEBUG #include diff --git a/webrtc/modules/audio_processing/ns/nsx_core_neon.S b/webrtc/modules/audio_processing/ns/nsx_core_neon.S index 31eea06a9a..4e15959cd6 100644 --- a/webrtc/modules/audio_processing/ns/nsx_core_neon.S +++ b/webrtc/modules/audio_processing/ns/nsx_core_neon.S @@ -335,7 +335,7 @@ UpdateNoiseEstimateNeon: mov r0, r4 mov r1, r6 - bl WebRtcSpl_MaxValueW16 + bl WebRtcSpl_MaxValueW16Neon sub r12, r6, #1 @ Loop counter: inst->magnLen - 1. @@ -351,7 +351,6 @@ UpdateNoiseEstimateNeon: vdup.32 q13, r0 str r0, [r5, r1] - LOOP_UPDATE: vld1.16 {d0, d1}, [r4]! @ &inst->noiseEstLogQuantile[offset + i] vmull.s16 q1, d0, d16