APM-QA annotations: incorrect type bugfix and level estimation with 1 ms frames.
TBR=
Bug: webrtc:7494
Change-Id: I2d4432d5b135e70b9abb5f2794a28228ec6808ba
Reviewed-on: https://webrtc-review.googlesource.com/13621
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#20346}
diff --git a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py
index 81f6af4..55b3388 100644
--- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py
+++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations.py
@@ -31,11 +31,13 @@
_VAD_FILENAME = 'vad.npy'
_SPEECH_LEVEL_FILENAME = 'speech_level.npy'
- # Level estimation params. The time constants in ms indicate the time it takes
- # for the level estimate to go down/up by 1 db if the signal is zero.
+ # Level estimation params.
+ _ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0)
+ _LEVEL_FRAME_SIZE_MS = 1.0
+ # The time constants in ms indicate the time it takes for the level estimate
+ # to go down/up by 1 dB if the signal is zero.
_LEVEL_ATTACK_MS = 5.0
_LEVEL_DECAY_MS = 20.0
- _ONE_DB_REDUCTION = np.power(10.0, -1.0 / 20.0)
# VAD params.
_VAD_THRESHOLD = 1
@@ -45,6 +47,7 @@
self._level = None
self._vad = None
self._speech_level = None
+ self._level_frame_size = None
self._c_attack = None
self._c_decay = None
@@ -75,12 +78,15 @@
if self._signal.channels != 1:
raise NotImplementedError('multiple-channel annotations not implemented')
- # Smoothing params.
- sample_duration_ms = 1000.0 / self._signal.frame_rate
- self._c_attack = 0 if self._LEVEL_ATTACK_MS == 0 else (
- self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_ATTACK_MS))
- self._c_decay = 0 if self._LEVEL_DECAY_MS == 0 else (
- self._ONE_DB_REDUCTION ** (sample_duration_ms / self._LEVEL_DECAY_MS))
+ # Level estimation params.
+ self._level_frame_size = int(self._signal.frame_rate / 1000 * (
+ self._LEVEL_FRAME_SIZE_MS))
+ self._c_attack = 0.0 if self._LEVEL_ATTACK_MS == 0 else (
+ self._ONE_DB_REDUCTION ** (
+ self._LEVEL_FRAME_SIZE_MS / self._LEVEL_ATTACK_MS))
+ self._c_decay = 0.0 if self._LEVEL_DECAY_MS == 0 else (
+ self._ONE_DB_REDUCTION ** (
+ self._LEVEL_FRAME_SIZE_MS / self._LEVEL_DECAY_MS))
# Compute level.
self._LevelEstimation()
@@ -95,6 +101,11 @@
# Speech level based on VAD output.
self._speech_level = self._level * self._vad
+ # Expand to one value per sample.
+ self._level = np.repeat(self._level, self._level_frame_size)
+ self._vad = np.repeat(self._vad, self._level_frame_size)
+ self._speech_level = np.repeat(self._speech_level, self._level_frame_size)
+
def Save(self, output_path):
np.save(os.path.join(output_path, self._LEVEL_FILENAME), self._level)
np.save(os.path.join(output_path, self._VAD_FILENAME), self._vad)
@@ -104,16 +115,21 @@
def _LevelEstimation(self):
# Read samples.
samples = signal_processing.SignalProcessingUtils.AudioSegmentToRawData(
- self._signal)
- num_samples = len(samples)
+ self._signal).astype(np.float32) / 32768.0
+ num_frames = len(samples) // self._level_frame_size
+ num_samples = num_frames * self._level_frame_size
# Envelope.
- self._level = np.abs(samples)
+ self._level = np.max(np.reshape(np.abs(samples[:num_samples]), (
+ num_frames, self._level_frame_size)), axis=1)
+ assert len(self._level) == num_frames
# Envelope smoothing.
smooth = lambda curr, prev, k: (1 - k) * curr + k * prev
self._level[0] = smooth(self._level[0], 0.0, self._c_attack)
- for i in range(1, num_samples):
+ for i in range(1, num_frames):
self._level[i] = smooth(
self._level[i], self._level[i - 1], self._c_attack if (
self._level[i] > self._level[i - 1]) else self._c_decay)
+
+ return self._level
diff --git a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py
index b59397c..bac3d21 100644
--- a/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py
+++ b/modules/audio_processing/test/py_quality_assessment/quality_assessment/annotations_unittest.py
@@ -26,7 +26,7 @@
"""Unit tests for the annotations module.
"""
- _CLEAN_TMP_OUTPUT = False
+ _CLEAN_TMP_OUTPUT = True
def setUp(self):
"""Create temporary folder."""
@@ -56,12 +56,16 @@
e = annotations.AudioAnnotationsExtractor()
e.Extract(self._wav_file_path)
e.Save(self._tmp_path)
- np.testing.assert_array_equal(
- e.GetLevel(),
- np.load(os.path.join(self._tmp_path, e.GetLevelFileName())))
- np.testing.assert_array_equal(
- e.GetVad(),
- np.load(os.path.join(self._tmp_path, e.GetVadFileName())))
- np.testing.assert_array_equal(
- e.GetSpeechLevel(),
- np.load(os.path.join(self._tmp_path, e.GetSpeechLevelFileName())))
+
+ level = np.load(os.path.join(self._tmp_path, e.GetLevelFileName()))
+ np.testing.assert_array_equal(e.GetLevel(), level)
+ self.assertEqual(np.float32, level.dtype)
+
+ vad = np.load(os.path.join(self._tmp_path, e.GetVadFileName()))
+ np.testing.assert_array_equal(e.GetVad(), vad)
+ self.assertEqual(np.uint8, vad.dtype)
+
+ speech_level = np.load(os.path.join(
+ self._tmp_path, e.GetSpeechLevelFileName()))
+ np.testing.assert_array_equal(e.GetSpeechLevel(), speech_level)
+ self.assertEqual(np.float32, speech_level.dtype)