Remove -flax-vector-conversions flag for ARM NEON building.

Pass compilation on both ARMv7 and ARM64. The generated
binary (audioproc) is byte to byte (with symbol striped) same as
before. The output of audioproc -aecm is also byte to byte same between
C and NEON version on ARMv7 and ARM64.

Change-Id: Ibdf40fe085f6bad1311f59bf9318bbcf37dd7ce5

BUG=3850
R=andrew@webrtc.org, jridges@masque.com

Review URL: https://webrtc-codereview.appspot.com/30239004

Patch from Zhongwei Yao <zhongwei.yao@arm.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk/webrtc@7783 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/build/arm_neon.gypi b/build/arm_neon.gypi
index 037dd70..9d8f71c 100644
--- a/build/arm_neon.gypi
+++ b/build/arm_neon.gypi
@@ -23,9 +23,6 @@
   'cflags!': [
     '-mfpu=vfpv3-d16',
   ],
-  'cflags': [
-    '-flax-vector-conversions',
-  ],
   'conditions': [
     # "-mfpu=neon" is not requried for arm64 in GCC.
     ['target_arch!="arm64"', {
diff --git a/common_audio/BUILD.gn b/common_audio/BUILD.gn
index ba1d179..5c14e51 100644
--- a/common_audio/BUILD.gn
+++ b/common_audio/BUILD.gn
@@ -211,7 +211,6 @@
     # Remove the -mfpu=vfpv3-d16 cflag.
     configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
     cflags = [
-      "-flax-vector-conversions",
       "-mfpu=neon",
     ]
 
diff --git a/modules/audio_coding/BUILD.gn b/modules/audio_coding/BUILD.gn
index 810fcf1..0c087ec 100644
--- a/modules/audio_coding/BUILD.gn
+++ b/modules/audio_coding/BUILD.gn
@@ -501,7 +501,6 @@
     # Remove the -mfpu=vfpv3-d16 cflag.
     configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
     cflags = [
-      "-flax-vector-conversions",
       "-mfpu=neon",
     ]
 
@@ -572,7 +571,6 @@
     # Remove the -mfpu=vfpv3-d16 cflag.
     configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
     cflags = [
-      "-flax-vector-conversions",
       "-mfpu=neon",
     ]
 
diff --git a/modules/audio_processing/BUILD.gn b/modules/audio_processing/BUILD.gn
index 3efc8ee..fbc1e7a 100644
--- a/modules/audio_processing/BUILD.gn
+++ b/modules/audio_processing/BUILD.gn
@@ -210,13 +210,10 @@
     # //build/config/arm.gni instead, to reduce code duplication.
     # Remove the -mfpu=vfpv3-d16 cflag.
     configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
-    cflags = [
-      "-flax-vector-conversions",
-    ]
 
     # "-mfpu=neon" is not requried for arm64 in GCC.
     if (cpu_arch != "arm64") {
-       cflags += [ "-mfpu=neon" ]
+       cflags = [ "-mfpu=neon" ]
     }
 
     # Disable LTO in audio_processing_neon target due to compiler bug.
diff --git a/modules/audio_processing/aecm/aecm_core_neon.c b/modules/audio_processing/aecm/aecm_core_neon.c
index 4efe5d8..1a0a6f5 100644
--- a/modules/audio_processing/aecm/aecm_core_neon.c
+++ b/modules/audio_processing/aecm/aecm_core_neon.c
@@ -53,13 +53,14 @@
   int32_t* echo_est_p = echo_est;
   const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
   const uint16_t* far_spectrum_p = far_spectrum;
-  int16x8_t store_v, adapt_v, spectrum_v;
+  int16x8_t store_v, adapt_v;
+  uint16x8_t spectrum_v;
   uint32x4_t echo_est_v_low, echo_est_v_high;
-  uint32x4_t far_energy_v, echo_energy_stored_v, echo_energy_adapt_v;
+  uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v;
 
   far_energy_v = vdupq_n_u32(0);
-  echo_energy_adapt_v = vdupq_n_u32(0);
-  echo_energy_stored_v = vdupq_n_u32(0);
+  echo_adapt_v = vdupq_n_u32(0);
+  echo_stored_v = vdupq_n_u32(0);
 
   // Get energy for the delayed far end signal and estimated
   // echo using both stored and adapted channels.
@@ -76,24 +77,25 @@
     adapt_v = vld1q_s16(start_adapt_p);
     store_v = vld1q_s16(start_stored_p);
 
-    far_energy_v = vaddw_u16(far_energy_v, vget_low_s16(spectrum_v));
-    far_energy_v = vaddw_u16(far_energy_v, vget_high_s16(spectrum_v));
+    far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v));
+    far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v));
 
-    echo_est_v_low = vmull_u16(vget_low_s16(store_v), vget_low_s16(spectrum_v));
-    echo_est_v_high = vmull_u16(vget_high_s16(store_v),
-                                vget_high_s16(spectrum_v));
-    vst1q_s32(echo_est_p, echo_est_v_low);
-    vst1q_s32(echo_est_p + 4, echo_est_v_high);
+    echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)),
+                               vget_low_u16(spectrum_v));
+    echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)),
+                                vget_high_u16(spectrum_v));
+    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
+    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));
 
-    echo_energy_stored_v = vaddq_s32(echo_est_v_low, echo_energy_stored_v);
-    echo_energy_stored_v = vaddq_s32(echo_est_v_high, echo_energy_stored_v);
+    echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v);
+    echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v);
 
-    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
-                                    vget_low_s16(adapt_v),
-                                    vget_low_s16(spectrum_v));
-    echo_energy_adapt_v = vmlal_u16(echo_energy_adapt_v,
-                                    vget_high_s16(adapt_v),
-                                    vget_high_s16(spectrum_v));
+    echo_adapt_v = vmlal_u16(echo_adapt_v,
+                             vreinterpret_u16_s16(vget_low_s16(adapt_v)),
+                             vget_low_u16(spectrum_v));
+    echo_adapt_v = vmlal_u16(echo_adapt_v,
+                             vreinterpret_u16_s16(vget_high_s16(adapt_v)),
+                             vget_high_u16(spectrum_v));
 
     start_stored_p += 8;
     start_adapt_p += 8;
@@ -102,8 +104,8 @@
   }
 
   AddLanes(far_energy, far_energy_v);
-  AddLanes(echo_energy_stored, echo_energy_stored_v);
-  AddLanes(echo_energy_adapt, echo_energy_adapt_v);
+  AddLanes(echo_energy_stored, echo_stored_v);
+  AddLanes(echo_energy_adapt, echo_adapt_v);
 
   echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                              far_spectrum[PART_LEN]);
@@ -143,8 +145,9 @@
   const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
   int32_t* echo_est_p = echo_est;
 
-  int16x8_t far_spectrum_v, adapt_v;
-  int32x4_t echo_est_v_low, echo_est_v_high;
+  uint16x8_t far_spectrum_v;
+  int16x8_t adapt_v;
+  uint32x4_t echo_est_v_low, echo_est_v_high;
 
   while (start_stored_p < end_stored_p) {
     far_spectrum_v = vld1q_u16(far_spectrum_p);
@@ -153,12 +156,12 @@
     vst1q_s16(start_stored_p, adapt_v);
 
     echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v),
-                               vget_low_u16(adapt_v));
+                               vget_low_u16(vreinterpretq_u16_s16(adapt_v)));
     echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v),
-                                vget_high_u16(adapt_v));
+                                vget_high_u16(vreinterpretq_u16_s16(adapt_v)));
 
-    vst1q_s32(echo_est_p, echo_est_v_low);
-    vst1q_s32(echo_est_p + 4, echo_est_v_high);
+    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
+    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));
 
     far_spectrum_p += 8;
     start_adapt_p += 8;