diff --git a/webrtc/modules/audio_processing/ns/nsx_core_neon.S b/webrtc/modules/audio_processing/ns/nsx_core_neon.S index cea75532a9..1eb46293c4 100644 --- a/webrtc/modules/audio_processing/ns/nsx_core_neon.S +++ b/webrtc/modules/audio_processing/ns/nsx_core_neon.S @@ -423,10 +423,12 @@ LOOP_MAGNLEN: strh r7, [r2] strh r8, [r4] - ldr r5, [r0, #offset_nsx_anaLen2] @ inst->anaLen2 + ldr r5, [r0, #offset_nsx_anaLen2] @ inst->anaLen2 ldr r7, [r0, #offset_nsx_anaLen] @ inst->anaLen - add r5, r3, r5, lsl #1 @ &inst->real[inst->anaLen2] + lsr r5, #3 @ inst->anaLen2 / 8 + sub r5, #1 @ Loop counter: inst->anaLen2 / 8 - 1. +@ Process and write the first 2 samples into freq_buf[]. ldrh r2, [r3], #2 @ inst->real[0] ldrh r0, [r9] @ inst->imag[0] strh r2, [r1], #2 @ Store to freq_buf[0] @@ -438,28 +440,52 @@ LOOP_MAGNLEN: mvn r12, #0x1F @ -32 -@ At the last iteration, &freq_buf[inst->anaLen + 1] will be written to by both -@ the vst1 instructions. Only the 2nd vst1 instruction has the correct value -@ (-inst->imag[inst->anaLen2]), so the order of the two vst1's is important. +@ Process and write (inst->anaLen2 * 4 - 32) samples into freq_buf[]. LOOP_ANALEN2: - vld1.16 {d0, d1}, [r3]! @ inst->real[], starting from inst->real[1] - vld1.16 {d2, d3}, [r6]! @ inst->imag[], starting from inst->imag[1] - vmov.s16 d4, d0 + vld1.16 d3, [r3]! @ inst->real[], starting from inst->real[1] + vld1.16 d1, [r3]! @ Next 4 values of inst->real[] + vmov.s16 d4, d3 + vld1.16 d2, [r6]! @ inst->imag[], starting from inst->imag[1] vmov.s16 d6, d1 vneg.s16 d5, d2 - vneg.s16 d7, d3 - vzip.16 d0, d2 - vzip.16 d1, d3 + vld1.16 d0, [r6]! @ Next 4 values of inst->imag[] + vneg.s16 d7, d0 + vzip.16 d1, d0 + vzip.16 d3, d2 vzip.16 d4, d5 + vrev64.32 q8, q0 + vrev64.32 q9, q1 vzip.16 d6, d7 - vrev64.32 d16, d3 - vrev64.32 d17, d1 - vrev64.32 d18, d2 - vrev64.32 d19, d0 - cmp r3, r5 + subs r5, #1 @ Decrement the loop counter; bgt below tests the flags. vst1.16 {d16, d17, d18, d19}, [r2], r12 vst1.16 {d4, d5, d6, d7}, [r1]! - bls LOOP_ANALEN2 + bgt LOOP_ANALEN2 + +@ Process and write 32 samples into freq_buf[]. 
We need to adjust the pointers +@ to overwrite the 2 starting samples in the back half of the buffer. + sub r0, r3, #2 @ r0 = r3 - 2 bytes (one 16-bit sample back) + sub r4, r6, #2 @ r4 = r6 - 2 bytes (one 16-bit sample back) + add r2, #4 @ Advance r2 by 4 bytes (2 samples) + vld1.16 d3, [r3]! @ inst->real[], starting from inst->real[1] + vld1.16 d1, [r3]! + vmov.s16 d4, d3 + vld1.16 d2, [r6]! @ inst->imag[], starting from inst->imag[1] + vmov.s16 d6, d1 + vld1.16 d0, [r6]! + vneg.s16 d5, d2 + vld1.16 d23, [r0]! @ Same inst->real[] block, shifted back one sample + vneg.s16 d7, d0 + vld1.16 d21, [r0] + vzip.16 d4, d5 + vld1.16 d22, [r4]! @ Same inst->imag[] block, shifted back one sample + vld1.16 d20, [r4] + vzip.16 d23, d22 + vzip.16 d21, d20 + vzip.16 d6, d7 + vrev64.32 q8, q10 + vrev64.32 q9, q11 + vst1.16 {d4, d5, d6, d7}, [r1] + vst1.16 {d16, d17, d18, d19}, [r2] pop {r4-r8} bx r14