APM-QA annotations: fix incorrect array types and estimate levels on 1 ms frames.

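This CL fixes the dtype of the saved annotations (the unit test now
checks that level and speech level are float32 and that the VAD output
is uint8) and moves the level estimation from per-sample to
non-overlapping 1 ms frames: each frame keeps its peak absolute
amplitude, the attack/decay coefficients are derived from the 1 ms
frame duration, and the per-frame values are repeated so the saved
arrays still have one value per sample.

For reference, a minimal self-contained sketch of the new estimator
(the function name and signature are illustrative, not part of this
CL):

  import numpy as np

  def estimate_level(samples, frame_rate, frame_size_ms=1.0,
                     attack_ms=5.0, decay_ms=20.0):
    # Illustrative re-implementation of the per-frame estimator below.
    samples = np.asarray(samples, dtype=np.float32)
    one_db_reduction = np.power(10.0, -1.0 / 20.0)
    frame_size = int(frame_rate / 1000 * frame_size_ms)
    num_frames = len(samples) // frame_size
    # Peak absolute amplitude over each non-overlapping frame.
    level = np.max(np.abs(samples[:num_frames * frame_size]).reshape(
        num_frames, frame_size), axis=1)
    # Coefficients chosen so the estimate moves by 1 dB in
    # attack_ms/decay_ms.
    c_attack = 0.0 if attack_ms == 0 else one_db_reduction ** (
        frame_size_ms / attack_ms)
    c_decay = 0.0 if decay_ms == 0 else one_db_reduction ** (
        frame_size_ms / decay_ms)
    # Asymmetric one-pole smoothing: attack on rising, decay on falling.
    level[0] = (1 - c_attack) * level[0]
    for i in range(1, num_frames):
      k = c_attack if level[i] > level[i - 1] else c_decay
      level[i] = (1 - k) * level[i] + k * level[i - 1]
    # Expand back to one value per input sample.
    return np.repeat(level, frame_size)

With the defaults above, c_attack = 10^(-1/20)^(1/5) ~= 0.977 and
c_decay = 10^(-1/20)^(1/20) ~= 0.994 per 1 ms frame.
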
TBR=

Bug: webrtc:7494
Change-Id: I2d4432d5b135e70b9abb5f2794a28228ec6808ba
Reviewed-on: https://webrtc-review.googlesource.com/13621
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#20346}
diff --git a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py
index 81f6af4..55b3388 100644
--- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py
+++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py
@@ -31,11 +31,13 @@
   _VAD_FILENAME = 'vad.npy'
   _SPEECH_LEVEL_FILENAME = 'speech_level.npy'
 
-  # Level estimation params. The time constants in ms indicate the time it takes
-  # for the level estimate to go down/up by 1 db if the signal is zero.
+  # Level estimation params.
+  _ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0)
+  _LEVEL_FRAME_SIZE_MS = 1.0
+  # The time constants in ms indicate how long the level estimate takes to
+  # change by 1 dB, e.g. to decay by 1 dB when the signal is zero.
   _LEVEL_ATTACK_MS = 5.0
   _LEVEL_DECAY_MS = 20.0
-  _ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0)
 
   # VAD params.
   _VAD_THRESHOLD = 1
@@ -45,6 +47,7 @@
     self._level = None
     self._vad = None
     self._speech_level = None
+    self._level_frame_size = None
     self._c_attack = None
     self._c_decay = None
 
@@ -75,12 +78,15 @@
     if self._signal.channels != 1:
       raise NotImplementedError('multiple-channel annotations not implemented')
 
-    # Smoothing params.
-    sample_duration_ms = 1000.0 / self._signal.frame_rate
-    self._c_attack = 0 if self._LEVEL_ATTACK_MS == 0 else (
-        self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_ATTACK_MS))
-    self._c_decay = 0 if self._LEVEL_DECAY_MS == 0 else (
-        self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_DECAY_MS))
+    # Level estimation params.
+    self._level_frame_size = int(
+        self._signal.frame_rate / 1000 * self._LEVEL_FRAME_SIZE_MS)
+    self._c_attack = 0.0 if self._LEVEL_ATTACK_MS == 0 else (
+        self._ONE_DB_REDUCTION ** (
+            self._LEVEL_FRAME_SIZE_MS / self._LEVEL_ATTACK_MS))
+    self._c_decay = 0.0 if self._LEVEL_DECAY_MS == 0 else (
+        self._ONE_DB_REDUCTION ** (
+            self._LEVEL_FRAME_SIZE_MS / self._LEVEL_DECAY_MS))
 
     # Compute level.
     self._LevelEstimation()
@@ -95,6 +101,11 @@
     # Speech level based on VAD output.
     self._speech_level = self._level * self._vad
 
+    # Expand to one value per sample.
+    self._level = np.repeat(self._level, self._level_frame_size)
+    self._vad = np.repeat(self._vad, self._level_frame_size)
+    self._speech_level = np.repeat(self._speech_level, self._level_frame_size)
+
   def Save(self, output_path):
     np.save(os.path.join(output_path, self._LEVEL_FILENAME), self._level)
     np.save(os.path.join(output_path, self._VAD_FILENAME), self._vad)
@@ -104,16 +115,21 @@
   def _LevelEstimation(self):
     # Read samples.
     samples = signal_processing.SignalProcessingUtils.AudioSegmentToRawData(
-        self._signal)
-    num_samples = len(samples)
+        self._signal).astype(np.float32) / 32768.0
+    num_frames = len(samples) // self._level_frame_size
+    num_samples = num_frames * self._level_frame_size
 
     # Envelope.
-    self._level = np.abs(samples)
+    self._level = np.max(np.reshape(np.abs(samples[:num_samples]), (
+        num_frames, self._level_frame_size)), axis=1)
+    assert len(self._level) == num_frames
 
     # Envelope smoothing.
     smooth = lambda curr, prev, k: (1 - k) * curr  + k * prev
     self._level[0] = smooth(self._level[0], 0.0, self._c_attack)
-    for i in range(1, num_samples):
+    for i in range(1, num_frames):
       self._level[i] = smooth(
           self._level[i], self._level[i - 1], self._c_attack if (
               self._level[i] > self._level[i - 1]) else self._c_decay)
+
+    return self._level
diff --git a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py
index b59397c..bac3d21 100644
--- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py
+++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py
@@ -26,7 +26,7 @@
   """Unit tests for the annotations module.
   """
 
-  _CLEAN_TMP_OUTPUT = False
+  _CLEAN_TMP_OUTPUT = True
 
   def setUp(self):
     """Create temporary folder."""
@@ -56,12 +56,16 @@
     e = annotations.AudioAnnotationsExtractor()
     e.Extract(self._wav_file_path)
     e.Save(self._tmp_path)
-    np.testing.assert_array_equal(
-        e.GetLevel(),
-        np.load(os.path.join(self._tmp_path, e.GetLevelFileName())))
-    np.testing.assert_array_equal(
-        e.GetVad(),
-        np.load(os.path.join(self._tmp_path, e.GetVadFileName())))
-    np.testing.assert_array_equal(
-        e.GetSpeechLevel(),
-        np.load(os.path.join(self._tmp_path, e.GetSpeechLevelFileName())))
+
+    level = np.load(os.path.join(self._tmp_path, e.GetLevelFileName()))
+    np.testing.assert_array_equal(e.GetLevel(), level)
+    self.assertEqual(np.float32, level.dtype)
+
+    vad = np.load(os.path.join(self._tmp_path, e.GetVadFileName()))
+    np.testing.assert_array_equal(e.GetVad(), vad)
+    self.assertEqual(np.uint8, vad.dtype)
+
+    speech_level = np.load(os.path.join(
+        self._tmp_path, e.GetSpeechLevelFileName()))
+    np.testing.assert_array_equal(e.GetSpeechLevel(), speech_level)
+    self.assertEqual(np.float32, speech_level.dtype)