Modify the _vadActivity member of the AudioFrame passed to AudioProcessing.
This saves the user from having to explicitly check stream_has_voice(). It also allows typing detection, which relies on this behaviour, to function.
Review URL: http://webrtc-codereview.appspot.com/144004
git-svn-id: http://webrtc.googlecode.com/svn/trunk@621 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/src/modules/audio_processing/main/interface/audio_processing.h b/src/modules/audio_processing/main/interface/audio_processing.h
index 350ef82..c8c8712 100644
--- a/src/modules/audio_processing/main/interface/audio_processing.h
+++ b/src/modules/audio_processing/main/interface/audio_processing.h
@@ -486,6 +486,7 @@
};
// An estimation component used to retrieve level metrics.
+// NOTE: currently unavailable. All methods return errors.
class LevelEstimator {
public:
virtual int Enable(bool enable) = 0;
@@ -539,6 +540,10 @@
// The voice activity detection (VAD) component analyzes the stream to
// determine if voice is present. A facility is also provided to pass in an
// external VAD decision.
+//
+// In addition to |stream_has_voice()| the VAD decision is provided through the
+// |AudioFrame| passed to |ProcessStream()|. The |_vadActivity| member will be
+// modified to reflect the current decision.
class VoiceDetection {
public:
virtual int Enable(bool enable) = 0;
diff --git a/src/modules/audio_processing/main/source/audio_buffer.cc b/src/modules/audio_processing/main/source/audio_buffer.cc
index 6b20fce..f7c55b4 100644
--- a/src/modules/audio_processing/main/source/audio_buffer.cc
+++ b/src/modules/audio_processing/main/source/audio_buffer.cc
@@ -10,8 +10,6 @@
#include "audio_buffer.h"
-#include "module_common_types.h"
-
namespace webrtc {
namespace {
@@ -64,21 +62,22 @@
WebRtc_Word32 synthesis_filter_state2[6];
};
-// TODO(am): check range of input parameters?
-AudioBuffer::AudioBuffer(WebRtc_Word32 max_num_channels,
- WebRtc_Word32 samples_per_channel)
- : max_num_channels_(max_num_channels),
- num_channels_(0),
- num_mixed_channels_(0),
- num_mixed_low_pass_channels_(0),
- samples_per_channel_(samples_per_channel),
- samples_per_split_channel_(samples_per_channel),
- reference_copied_(false),
- data_(NULL),
- channels_(NULL),
- split_channels_(NULL),
- mixed_low_pass_channels_(NULL),
- low_pass_reference_channels_(NULL) {
+// TODO(andrew): check range of input parameters?
+AudioBuffer::AudioBuffer(int max_num_channels,
+ int samples_per_channel)
+ : max_num_channels_(max_num_channels),
+ num_channels_(0),
+ num_mixed_channels_(0),
+ num_mixed_low_pass_channels_(0),
+ samples_per_channel_(samples_per_channel),
+ samples_per_split_channel_(samples_per_channel),
+ reference_copied_(false),
+ activity_(AudioFrame::kVadUnknown),
+ data_(NULL),
+ channels_(NULL),
+ split_channels_(NULL),
+ mixed_low_pass_channels_(NULL),
+ low_pass_reference_channels_(NULL) {
if (max_num_channels_ > 1) {
channels_ = new AudioChannel[max_num_channels_];
mixed_low_pass_channels_ = new AudioChannel[max_num_channels_];
@@ -109,7 +108,7 @@
}
}
-WebRtc_Word16* AudioBuffer::data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::data(int channel) const {
assert(channel >= 0 && channel < num_channels_);
if (data_ != NULL) {
return data_;
@@ -118,7 +117,7 @@
return channels_[channel].data;
}
-WebRtc_Word16* AudioBuffer::low_pass_split_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::low_pass_split_data(int channel) const {
assert(channel >= 0 && channel < num_channels_);
if (split_channels_ == NULL) {
return data(channel);
@@ -127,7 +126,7 @@
return split_channels_[channel].low_pass_data;
}
-WebRtc_Word16* AudioBuffer::high_pass_split_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::high_pass_split_data(int channel) const {
assert(channel >= 0 && channel < num_channels_);
if (split_channels_ == NULL) {
return NULL;
@@ -136,13 +135,13 @@
return split_channels_[channel].high_pass_data;
}
-WebRtc_Word16* AudioBuffer::mixed_low_pass_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::mixed_low_pass_data(int channel) const {
assert(channel >= 0 && channel < num_mixed_low_pass_channels_);
return mixed_low_pass_channels_[channel].data;
}
-WebRtc_Word16* AudioBuffer::low_pass_reference(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::low_pass_reference(int channel) const {
assert(channel >= 0 && channel < num_channels_);
if (!reference_copied_) {
return NULL;
@@ -151,58 +150,67 @@
return low_pass_reference_channels_[channel].data;
}
-WebRtc_Word32* AudioBuffer::analysis_filter_state1(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::analysis_filter_state1(int channel) const {
assert(channel >= 0 && channel < num_channels_);
return split_channels_[channel].analysis_filter_state1;
}
-WebRtc_Word32* AudioBuffer::analysis_filter_state2(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::analysis_filter_state2(int channel) const {
assert(channel >= 0 && channel < num_channels_);
return split_channels_[channel].analysis_filter_state2;
}
-WebRtc_Word32* AudioBuffer::synthesis_filter_state1(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::synthesis_filter_state1(int channel) const {
assert(channel >= 0 && channel < num_channels_);
return split_channels_[channel].synthesis_filter_state1;
}
-WebRtc_Word32* AudioBuffer::synthesis_filter_state2(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::synthesis_filter_state2(int channel) const {
assert(channel >= 0 && channel < num_channels_);
return split_channels_[channel].synthesis_filter_state2;
}
-WebRtc_Word32 AudioBuffer::num_channels() const {
+void AudioBuffer::set_activity(AudioFrame::VADActivity activity) {
+ activity_ = activity;
+}
+
+AudioFrame::VADActivity AudioBuffer::activity() {
+ return activity_;
+}
+
+int AudioBuffer::num_channels() const {
return num_channels_;
}
-WebRtc_Word32 AudioBuffer::samples_per_channel() const {
+int AudioBuffer::samples_per_channel() const {
return samples_per_channel_;
}
-WebRtc_Word32 AudioBuffer::samples_per_split_channel() const {
+int AudioBuffer::samples_per_split_channel() const {
return samples_per_split_channel_;
}
-// TODO(ajm): Do deinterleaving and mixing in one step?
-void AudioBuffer::DeinterleaveFrom(AudioFrame* audioFrame) {
- assert(audioFrame->_audioChannel <= max_num_channels_);
- assert(audioFrame->_payloadDataLengthInSamples == samples_per_channel_);
+// TODO(andrew): Do deinterleaving and mixing in one step?
+void AudioBuffer::DeinterleaveFrom(AudioFrame* frame) {
+ assert(frame->_audioChannel <= max_num_channels_);
+ assert(frame->_payloadDataLengthInSamples == samples_per_channel_);
- num_channels_ = audioFrame->_audioChannel;
+ num_channels_ = frame->_audioChannel;
num_mixed_channels_ = 0;
num_mixed_low_pass_channels_ = 0;
reference_copied_ = false;
+ activity_ = frame->_vadActivity;
if (num_channels_ == 1) {
// We can get away with a pointer assignment in this case.
- data_ = audioFrame->_payloadData;
+ data_ = frame->_payloadData;
return;
}
+ WebRtc_Word16* interleaved = frame->_payloadData;
for (int i = 0; i < num_channels_; i++) {
WebRtc_Word16* deinterleaved = channels_[i].data;
- WebRtc_Word16* interleaved = audioFrame->_payloadData;
- WebRtc_Word32 interleaved_idx = i;
+ int interleaved_idx = i;
for (int j = 0; j < samples_per_channel_; j++) {
deinterleaved[j] = interleaved[interleaved_idx];
interleaved_idx += num_channels_;
@@ -210,27 +218,28 @@
}
}
-void AudioBuffer::InterleaveTo(AudioFrame* audioFrame) const {
- assert(audioFrame->_audioChannel == num_channels_);
- assert(audioFrame->_payloadDataLengthInSamples == samples_per_channel_);
+void AudioBuffer::InterleaveTo(AudioFrame* frame) const {
+ assert(frame->_audioChannel == num_channels_);
+ assert(frame->_payloadDataLengthInSamples == samples_per_channel_);
+ frame->_vadActivity = activity_;
if (num_channels_ == 1) {
if (num_mixed_channels_ == 1) {
- memcpy(audioFrame->_payloadData,
+ memcpy(frame->_payloadData,
channels_[0].data,
sizeof(WebRtc_Word16) * samples_per_channel_);
} else {
// These should point to the same buffer in this case.
- assert(data_ == audioFrame->_payloadData);
+ assert(data_ == frame->_payloadData);
}
return;
}
+ WebRtc_Word16* interleaved = frame->_payloadData;
for (int i = 0; i < num_channels_; i++) {
WebRtc_Word16* deinterleaved = channels_[i].data;
- WebRtc_Word16* interleaved = audioFrame->_payloadData;
- WebRtc_Word32 interleaved_idx = i;
+ int interleaved_idx = i;
for (int j = 0; j < samples_per_channel_; j++) {
interleaved[interleaved_idx] = deinterleaved[j];
interleaved_idx += num_channels_;
@@ -238,9 +247,10 @@
}
}
-// TODO(ajm): would be good to support the no-mix case with pointer assignment.
-// TODO(ajm): handle mixing to multiple channels?
-void AudioBuffer::Mix(WebRtc_Word32 num_mixed_channels) {
+// TODO(andrew): would be good to support the no-mix case with pointer
+// assignment.
+// TODO(andrew): handle mixing to multiple channels?
+void AudioBuffer::Mix(int num_mixed_channels) {
// We currently only support the stereo to mono case.
assert(num_channels_ == 2);
assert(num_mixed_channels == 1);
@@ -254,7 +264,7 @@
num_mixed_channels_ = num_mixed_channels;
}
-void AudioBuffer::CopyAndMixLowPass(WebRtc_Word32 num_mixed_channels) {
+void AudioBuffer::CopyAndMixLowPass(int num_mixed_channels) {
// We currently only support the stereo to mono case.
assert(num_channels_ == 2);
assert(num_mixed_channels == 1);
diff --git a/src/modules/audio_processing/main/source/audio_buffer.h b/src/modules/audio_processing/main/source/audio_buffer.h
index 15f850b..1bdd3c7 100644
--- a/src/modules/audio_processing/main/source/audio_buffer.h
+++ b/src/modules/audio_processing/main/source/audio_buffer.h
@@ -11,55 +11,58 @@
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_SOURCE_AUDIO_BUFFER_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_SOURCE_AUDIO_BUFFER_H_
+#include "module_common_types.h"
#include "typedefs.h"
-
namespace webrtc {
struct AudioChannel;
struct SplitAudioChannel;
-class AudioFrame;
class AudioBuffer {
public:
- AudioBuffer(WebRtc_Word32 max_num_channels, WebRtc_Word32 samples_per_channel);
+ AudioBuffer(int max_num_channels, int samples_per_channel);
virtual ~AudioBuffer();
- WebRtc_Word32 num_channels() const;
- WebRtc_Word32 samples_per_channel() const;
- WebRtc_Word32 samples_per_split_channel() const;
+ int num_channels() const;
+ int samples_per_channel() const;
+ int samples_per_split_channel() const;
- WebRtc_Word16* data(WebRtc_Word32 channel) const;
- WebRtc_Word16* low_pass_split_data(WebRtc_Word32 channel) const;
- WebRtc_Word16* high_pass_split_data(WebRtc_Word32 channel) const;
- WebRtc_Word16* mixed_low_pass_data(WebRtc_Word32 channel) const;
- WebRtc_Word16* low_pass_reference(WebRtc_Word32 channel) const;
+ WebRtc_Word16* data(int channel) const;
+ WebRtc_Word16* low_pass_split_data(int channel) const;
+ WebRtc_Word16* high_pass_split_data(int channel) const;
+ WebRtc_Word16* mixed_low_pass_data(int channel) const;
+ WebRtc_Word16* low_pass_reference(int channel) const;
- WebRtc_Word32* analysis_filter_state1(WebRtc_Word32 channel) const;
- WebRtc_Word32* analysis_filter_state2(WebRtc_Word32 channel) const;
- WebRtc_Word32* synthesis_filter_state1(WebRtc_Word32 channel) const;
- WebRtc_Word32* synthesis_filter_state2(WebRtc_Word32 channel) const;
+ WebRtc_Word32* analysis_filter_state1(int channel) const;
+ WebRtc_Word32* analysis_filter_state2(int channel) const;
+ WebRtc_Word32* synthesis_filter_state1(int channel) const;
+ WebRtc_Word32* synthesis_filter_state2(int channel) const;
+
+ void set_activity(AudioFrame::VADActivity activity);
+ AudioFrame::VADActivity activity();
void DeinterleaveFrom(AudioFrame* audioFrame);
void InterleaveTo(AudioFrame* audioFrame) const;
- void Mix(WebRtc_Word32 num_mixed_channels);
- void CopyAndMixLowPass(WebRtc_Word32 num_mixed_channels);
+ void Mix(int num_mixed_channels);
+ void CopyAndMixLowPass(int num_mixed_channels);
void CopyLowPassToReference();
private:
- const WebRtc_Word32 max_num_channels_;
- WebRtc_Word32 num_channels_;
- WebRtc_Word32 num_mixed_channels_;
- WebRtc_Word32 num_mixed_low_pass_channels_;
- const WebRtc_Word32 samples_per_channel_;
- WebRtc_Word32 samples_per_split_channel_;
+ const int max_num_channels_;
+ int num_channels_;
+ int num_mixed_channels_;
+ int num_mixed_low_pass_channels_;
+ const int samples_per_channel_;
+ int samples_per_split_channel_;
bool reference_copied_;
+ AudioFrame::VADActivity activity_;
WebRtc_Word16* data_;
- // TODO(ajm): Prefer to make these vectors if permitted...
+ // TODO(andrew): use vectors here.
AudioChannel* channels_;
SplitAudioChannel* split_channels_;
- // TODO(ajm): improve this, we don't need the full 32 kHz space here.
+ // TODO(andrew): improve this, we don't need the full 32 kHz space here.
AudioChannel* mixed_low_pass_channels_;
AudioChannel* low_pass_reference_channels_;
};
diff --git a/src/modules/audio_processing/main/source/voice_detection_impl.cc b/src/modules/audio_processing/main/source/voice_detection_impl.cc
index 3eb446e..49aac2e 100644
--- a/src/modules/audio_processing/main/source/voice_detection_impl.cc
+++ b/src/modules/audio_processing/main/source/voice_detection_impl.cc
@@ -74,16 +74,16 @@
// TODO(ajm): concatenate data in frame buffer here.
- int vad_ret_val;
- vad_ret_val = WebRtcVad_Process(static_cast<Handle*>(handle(0)),
- apm_->split_sample_rate_hz(),
- mixed_data,
- frame_size_samples_);
-
- if (vad_ret_val == 0) {
+ int vad_ret = WebRtcVad_Process(static_cast<Handle*>(handle(0)),
+ apm_->split_sample_rate_hz(),
+ mixed_data,
+ frame_size_samples_);
+ if (vad_ret == 0) {
stream_has_voice_ = false;
- } else if (vad_ret_val == 1) {
+ audio->set_activity(AudioFrame::kVadPassive);
+ } else if (vad_ret == 1) {
stream_has_voice_ = true;
+ audio->set_activity(AudioFrame::kVadActive);
} else {
return apm_->kUnspecifiedError;
}
diff --git a/src/modules/audio_processing/main/test/unit_test/unit_test.cc b/src/modules/audio_processing/main/test/unit_test/unit_test.cc
index 0563fdf..5c9f5afd 100644
--- a/src/modules/audio_processing/main/test/unit_test/unit_test.cc
+++ b/src/modules/audio_processing/main/test/unit_test/unit_test.cc
@@ -555,6 +555,7 @@
&temp_data[0],
sizeof(WebRtc_Word16) * read_count);
}
+ frame_->_vadActivity = AudioFrame::kVadUnknown;
EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
@@ -571,6 +572,9 @@
}
if (apm_->voice_detection()->stream_has_voice()) {
has_voice_count++;
+ EXPECT_EQ(AudioFrame::kVadActive, frame_->_vadActivity);
+ } else {
+ EXPECT_EQ(AudioFrame::kVadPassive, frame_->_vadActivity);
}
frame_count++;
@@ -966,27 +970,27 @@
EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
EXPECT_FALSE(apm_->voice_detection()->is_enabled());
+ // Test that AudioFrame activity is maintained when VAD is disabled.
+ EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
+ AudioFrame::VADActivity activity[] = {
+ AudioFrame::kVadActive,
+ AudioFrame::kVadPassive,
+ AudioFrame::kVadUnknown
+ };
+ for (size_t i = 0; i < sizeof(activity)/sizeof(*activity); i++) {
+ frame_->_vadActivity = activity[i];
+ EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
+ EXPECT_EQ(activity[i], frame_->_vadActivity);
+ }
+
+ // Test that AudioFrame activity is set when VAD is enabled.
+ EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
+ frame_->_vadActivity = AudioFrame::kVadUnknown;
+ EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
+ EXPECT_NE(AudioFrame::kVadUnknown, frame_->_vadActivity);
+
// TODO(bjornv): Add tests for streamed voice; stream_has_voice()
}
-
-// Below are some ideas for tests from VPM.
-
-/*TEST_F(VideoProcessingModuleTest, GetVersionTest)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, HandleNullBuffer)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, HandleBadSize)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, IdenticalResultsAfterReset)
-{
-}
-*/
} // namespace
int main(int argc, char** argv) {