Remove the -flax-vector-conversions flag for ARM NEON builds. Compilation passes on both ARMv7 and ARM64. The generated binary (audioproc) is byte-for-byte identical (with symbols stripped) to the previous one. The output of audioproc -aecm is also byte-for-byte identical between the C and NEON versions on ARMv7 and ARM64. Change-Id: Ibdf40fe085f6bad1311f59bf9318bbcf37dd7ce5 BUG=3850 R=andrew@webrtc.org, jridges@masque.com Review URL: https://webrtc-codereview.appspot.com/30239004 Patch from Zhongwei Yao <zhongwei.yao@arm.com>. git-svn-id: http://webrtc.googlecode.com/svn/trunk/webrtc@7783 4adac7df-926f-26a2-2b94-8c16560cd09d

commit: ba372c0bc14990ab5f93e0a2735c12cf039dd7f9 [log] [tgz]
author: andrew@webrtc.org <andrew@webrtc.org> Tue Dec 02 19:36:14 2014
committer: andrew@webrtc.org <andrew@webrtc.org> Tue Dec 02 19:36:14 2014
tree: 0edd4908083af34901639aaefa5b1d5103edd74f
parent: 422bafd9af282bf9810c13de8ddc577868e7b6fe [diff]
diff --git a/build/arm_neon.gypi b/build/arm_neon.gypi
index 037dd70..9d8f71c 100644
--- a/build/arm_neon.gypi
+++ b/build/arm_neon.gypi

@@ -23,9 +23,6 @@
   'cflags!': [
     '-mfpu=vfpv3-d16',
   ],
-  'cflags': [
-    '-flax-vector-conversions',
-  ],
   'conditions': [
     # "-mfpu=neon" is not requried for arm64 in GCC.
     ['target_arch!="arm64"', {

diff --git a/common_audio/BUILD.gn b/common_audio/BUILD.gn
index ba1d179..5c14e51 100644
--- a/common_audio/BUILD.gn
+++ b/common_audio/BUILD.gn

@@ -211,7 +211,6 @@
     # Remove the -mfpu=vfpv3-d16 cflag.
     configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
     cflags = [
-      "-flax-vector-conversions",
       "-mfpu=neon",
     ]
 

diff --git a/modules/audio_coding/BUILD.gn b/modules/audio_coding/BUILD.gn
index 810fcf1..0c087ec 100644
--- a/modules/audio_coding/BUILD.gn
+++ b/modules/audio_coding/BUILD.gn

@@ -501,7 +501,6 @@
     # Remove the -mfpu=vfpv3-d16 cflag.
     configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
     cflags = [
-      "-flax-vector-conversions",
       "-mfpu=neon",
     ]
 
@@ -572,7 +571,6 @@
     # Remove the -mfpu=vfpv3-d16 cflag.
     configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
     cflags = [
-      "-flax-vector-conversions",
       "-mfpu=neon",
     ]
 

diff --git a/modules/audio_processing/BUILD.gn b/modules/audio_processing/BUILD.gn
index 3efc8ee..fbc1e7a 100644
--- a/modules/audio_processing/BUILD.gn
+++ b/modules/audio_processing/BUILD.gn

@@ -210,13 +210,10 @@
     # //build/config/arm.gni instead, to reduce code duplication.
     # Remove the -mfpu=vfpv3-d16 cflag.
     configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
-    cflags = [
-      "-flax-vector-conversions",
-    ]
 
     # "-mfpu=neon" is not requried for arm64 in GCC.
     if (cpu_arch != "arm64") {
-       cflags += [ "-mfpu=neon" ]
+       cflags = [ "-mfpu=neon" ]
     }
 
     # Disable LTO in audio_processing_neon target due to compiler bug.

diff --git a/modules/audio_processing/aecm/aecm_core_neon.c b/modules/audio_processing/aecm/aecm_core_neon.c
index 4efe5d8..1a0a6f5 100644
--- a/modules/audio_processing/aecm/aecm_core_neon.c
+++ b/modules/audio_processing/aecm/aecm_core_neon.c

@@ -53,13 +53,14 @@
   int32_t* echo_est_p = echo_est;
   const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
   const uint16_t* far_spectrum_p = far_spectrum;
-  int16x8_t store_v, adapt_v, spectrum_v;
+  int16x8_t store_v, adapt_v;
+  uint16x8_t spectrum_v;
   uint32x4_t echo_est_v_low, echo_est_v_high;
-  uint32x4_t far_energy_v, echo_energy_stored_v, echo_energy_adapt_v;
+  uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v;
 
   far_energy_v = vdupq_n_u32(0);
-  echo_energy_adapt_v = vdupq_n_u32(0);
-  echo_energy_stored_v = vdupq_n_u32(0);
+  echo_adapt_v = vdupq_n_u32(0);
+  echo_stored_v = vdupq_n_u32(0);
 
   // Get energy for the delayed far end signal and estimated
   // echo using both stored and adapted channels.
@@ -76,24 +77,25 @@
     adapt_v = vld1q_s16(start_adapt_p);
     store_v = vld1q_s16(start_stored_p);
 
-    far_energy_v = vaddw_u16(far_energy_v, vget_low_s16(spectrum_v));
-    far_energy_v = vaddw_u16(far_energy_v, vget_high_s16(spectrum_v));
+    far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v));
+    far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v));
 
-    echo_est_v_low = vmull_u16(vget_low_s16(store_v), vget_low_s16(spectrum_v));
-    echo_est_v_high = vmull_u16(vget_high_s16(store_v),
-                                vget_high_s16(spectrum_v));
-    vst1q_s32(echo_est_p, echo_est_v_low);
-    vst1q_s32(echo_est_p + 4, echo_est_v_high);
+    echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)),
+                               vget_low_u16(spectrum_v));
+    echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)),
+                                vget_high_u16(spectrum_v));
+    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
+    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));
 
-    echo_energy_stored_v = vaddq_s32(echo_est_v_low, echo_energy_stored_v);
-    echo_energy_stored_v = vaddq_s32(echo_est_v_high, echo_energy_stored_v);
+    echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v);
+    echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v);
 
-    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
-                                    vget_low_s16(adapt_v),
-                                    vget_low_s16(spectrum_v));
-    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
-                                    vget_high_s16(adapt_v),
-                                    vget_high_s16(spectrum_v));
+    echo_adapt_v = vmlal_u16(echo_adapt_v,
+                             vreinterpret_u16_s16(vget_low_s16(adapt_v)),
+                             vget_low_u16(spectrum_v));
+    echo_adapt_v = vmlal_u16(echo_adapt_v,
+                             vreinterpret_u16_s16(vget_high_s16(adapt_v)),
+                             vget_high_u16(spectrum_v));
 
     start_stored_p += 8;
     start_adapt_p += 8;
@@ -102,8 +104,8 @@
   }
 
   AddLanes(far_energy, far_energy_v);
-  AddLanes(echo_energy_stored, echo_energy_stored_v);
-  AddLanes(echo_energy_adapt, echo_energy_adapt_v);
+  AddLanes(echo_energy_stored, echo_stored_v);
+  AddLanes(echo_energy_adapt, echo_adapt_v);
 
   echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                              far_spectrum[PART_LEN]);
@@ -143,8 +145,9 @@
   const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
   int32_t* echo_est_p = echo_est;
 
-  int16x8_t far_spectrum_v, adapt_v;
-  int32x4_t echo_est_v_low, echo_est_v_high;
+  uint16x8_t far_spectrum_v;
+  int16x8_t adapt_v;
+  uint32x4_t echo_est_v_low, echo_est_v_high;
 
   while (start_stored_p < end_stored_p) {
     far_spectrum_v = vld1q_u16(far_spectrum_p);
@@ -153,12 +156,12 @@
     vst1q_s16(start_stored_p, adapt_v);
 
     echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v),
-                               vget_low_u16(adapt_v));
+                               vget_low_u16(vreinterpretq_u16_s16(adapt_v)));
     echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v),
-                                vget_high_u16(adapt_v));
+                                vget_high_u16(vreinterpretq_u16_s16(adapt_v)));
 
-    vst1q_s32(echo_est_p, echo_est_v_low);
-    vst1q_s32(echo_est_p + 4, echo_est_v_high);
+    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
+    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));
 
     far_spectrum_p += 8;
     start_adapt_p += 8;
commit	ba372c0bc14990ab5f93e0a2735c12cf039dd7f9	[log] [tgz]
author	andrew@webrtc.org <andrew@webrtc.org>	Tue Dec 02 19:36:14 2014
committer	andrew@webrtc.org <andrew@webrtc.org>	Tue Dec 02 19:36:14 2014
tree	0edd4908083af34901639aaefa5b1d5103edd74f
parent	422bafd9af282bf9810c13de8ddc577868e7b6fe [diff]