| /* |
| * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| |
| #include <math.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| |
| #include <algorithm> |
| #include <memory> |
| |
| #include "gflags/gflags.h" |
| #include "testing/gtest/include/gtest/gtest.h" |
| #include "webrtc/modules/audio_processing/agc/agc.h" |
| #include "webrtc/modules/audio_processing/agc/loudness_histogram.h" |
| #include "webrtc/modules/audio_processing/agc/utility.h" |
| #include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" |
| #include "webrtc/modules/audio_processing/vad/common.h" |
| #include "webrtc/modules/audio_processing/vad/pitch_based_vad.h" |
| #include "webrtc/modules/audio_processing/vad/standalone_vad.h" |
| #include "webrtc/modules/include/module_common_types.h" |
| |
| static const int kAgcAnalWindowSamples = 100; |
| static const double kDefaultActivityThreshold = 0.3; |
| |
| DEFINE_bool(standalone_vad, true, "enable stand-alone VAD"); |
| DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'" |
| " format"); |
| DEFINE_string(video_vad, "", "name of a file containing video VAD (activity" |
| " probabilities) in double format. One activity per 10ms is" |
| " required. If no file is given the video information is not" |
| " incorporated. Negative activity is interpreted as video is" |
| " not adapted and the statistics are not computed during" |
| " the learning phase. Note that the negative video activities" |
| " are ONLY allowed at the beginning."); |
| DEFINE_string(result, "", "name of a file to write the results. The results" |
| " will be appended to the end of the file. This is optional."); |
| DEFINE_string(audio_content, "", "name of a file where audio content is written" |
| " to, in double format."); |
| DEFINE_double(activity_threshold, kDefaultActivityThreshold, |
| "Activity threshold"); |
| |
| namespace webrtc { |
| |
| // TODO(turajs) A new CL will be committed soon where ExtractFeatures will |
| // notify the caller of "silence" input, instead of bailing out. We would not |
| // need the following function when such a change is made. |
| |
| // Add some dither to quiet frames. This avoids the ExtractFeatures skip a |
| // silence frame. Otherwise true VAD would drift with respect to the audio. |
| // We only consider mono inputs. |
| static void DitherSilence(AudioFrame* frame) { |
| ASSERT_EQ(1u, frame->num_channels_); |
| const double kRmsSilence = 5; |
| const double sum_squared_silence = kRmsSilence * kRmsSilence * |
| frame->samples_per_channel_; |
| double sum_squared = 0; |
| for (size_t n = 0; n < frame->samples_per_channel_; n++) |
| sum_squared += frame->data_[n] * frame->data_[n]; |
| if (sum_squared <= sum_squared_silence) { |
| for (size_t n = 0; n < frame->samples_per_channel_; n++) |
| frame->data_[n] = (rand() & 0xF) - 8; // NOLINT: ignore non-threadsafe. |
| } |
| } |
| |
| class AgcStat { |
| public: |
| AgcStat() |
| : video_index_(0), |
| activity_threshold_(kDefaultActivityThreshold), |
| audio_content_(LoudnessHistogram::Create(kAgcAnalWindowSamples)), |
| audio_processing_(new VadAudioProc()), |
| vad_(new PitchBasedVad()), |
| standalone_vad_(StandaloneVad::Create()), |
| audio_content_fid_(NULL) { |
| for (size_t n = 0; n < kMaxNumFrames; n++) |
| video_vad_[n] = 0.5; |
| } |
| |
| ~AgcStat() { |
| if (audio_content_fid_ != NULL) { |
| fclose(audio_content_fid_); |
| } |
| } |
| |
| void set_audio_content_file(FILE* audio_content_fid) { |
| audio_content_fid_ = audio_content_fid; |
| } |
| |
| int AddAudio(const AudioFrame& frame, double p_video, |
| int* combined_vad) { |
| if (frame.num_channels_ != 1 || |
| frame.samples_per_channel_ != |
| kSampleRateHz / 100 || |
| frame.sample_rate_hz_ != kSampleRateHz) |
| return -1; |
| video_vad_[video_index_++] = p_video; |
| AudioFeatures features; |
| audio_processing_->ExtractFeatures( |
| frame.data_, frame.samples_per_channel_, &features); |
| if (FLAGS_standalone_vad) { |
| standalone_vad_->AddAudio(frame.data_, |
| frame.samples_per_channel_); |
| } |
| if (features.num_frames > 0) { |
| double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; |
| if (FLAGS_standalone_vad) { |
| standalone_vad_->GetActivity(p, kMaxNumFrames); |
| } |
| // TODO(turajs) combining and limiting are used in the source files as |
| // well they can be moved to utility. |
| // Combine Video and stand-alone VAD. |
| for (size_t n = 0; n < features.num_frames; n++) { |
| double p_active = p[n] * video_vad_[n]; |
| double p_passive = (1 - p[n]) * (1 - video_vad_[n]); |
| p[n] = p_active / (p_active + p_passive); |
| // Limit probabilities. |
| p[n] = std::min(std::max(p[n], 0.01), 0.99); |
| } |
| if (vad_->VoicingProbability(features, p) < 0) |
| return -1; |
| for (size_t n = 0; n < features.num_frames; n++) { |
| audio_content_->Update(features.rms[n], p[n]); |
| double ac = audio_content_->AudioContent(); |
| if (audio_content_fid_ != NULL) { |
| fwrite(&ac, sizeof(ac), 1, audio_content_fid_); |
| } |
| if (ac > kAgcAnalWindowSamples * activity_threshold_) { |
| combined_vad[n] = 1; |
| } else { |
| combined_vad[n] = 0; |
| } |
| } |
| video_index_ = 0; |
| } |
| return static_cast<int>(features.num_frames); |
| } |
| |
| void Reset() { |
| audio_content_->Reset(); |
| } |
| |
| void SetActivityThreshold(double activity_threshold) { |
| activity_threshold_ = activity_threshold; |
| } |
| |
| private: |
| int video_index_; |
| double activity_threshold_; |
| double video_vad_[kMaxNumFrames]; |
| std::unique_ptr<LoudnessHistogram> audio_content_; |
| std::unique_ptr<VadAudioProc> audio_processing_; |
| std::unique_ptr<PitchBasedVad> vad_; |
| std::unique_ptr<StandaloneVad> standalone_vad_; |
| |
| FILE* audio_content_fid_; |
| }; |
| |
| |
| void void_main(int argc, char* argv[]) { |
| webrtc::AgcStat agc_stat; |
| |
| FILE* pcm_fid = fopen(argv[1], "rb"); |
| ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1]; |
| |
| if (argc < 2) { |
| fprintf(stderr, "\nNot Enough arguments\n"); |
| } |
| |
| FILE* true_vad_fid = NULL; |
| ASSERT_GT(FLAGS_true_vad.size(), 0u) << "Specify the file containing true " |
| "VADs using --true_vad flag."; |
| true_vad_fid = fopen(FLAGS_true_vad.c_str(), "rb"); |
| ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " << |
| FLAGS_true_vad; |
| |
| FILE* results_fid = NULL; |
| if (FLAGS_result.size() > 0) { |
| // True if this is the first time writing to this function and we add a |
| // header to the beginning of the file. |
| bool write_header; |
| // Open in the read mode. If it fails, the file doesn't exist and has to |
| // write a header for it. Otherwise no need to write a header. |
| results_fid = fopen(FLAGS_result.c_str(), "r"); |
| if (results_fid == NULL) { |
| write_header = true; |
| } else { |
| fclose(results_fid); |
| write_header = false; |
| } |
| // Open in append mode. |
| results_fid = fopen(FLAGS_result.c_str(), "a"); |
| ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " << |
| FLAGS_result << ", to write the results."; |
| // Write the header if required. |
| if (write_header) { |
| fprintf(results_fid, "%% Total Active, Misdetection, " |
| "Total inactive, False Positive, On-sets, Missed segments, " |
| "Average response\n"); |
| } |
| } |
| |
| FILE* video_vad_fid = NULL; |
| if (FLAGS_video_vad.size() > 0) { |
| video_vad_fid = fopen(FLAGS_video_vad.c_str(), "rb"); |
| ASSERT_TRUE(video_vad_fid != NULL) << "Cannot open the file, " << |
| FLAGS_video_vad << " to read video-based VAD decisions.\n"; |
| } |
| |
| // AgsStat will be the owner of this file and will close it at its |
| // destructor. |
| FILE* audio_content_fid = NULL; |
| if (FLAGS_audio_content.size() > 0) { |
| audio_content_fid = fopen(FLAGS_audio_content.c_str(), "wb"); |
| ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " << |
| FLAGS_audio_content << " to write audio-content.\n"; |
| agc_stat.set_audio_content_file(audio_content_fid); |
| } |
| |
| webrtc::AudioFrame frame; |
| frame.num_channels_ = 1; |
| frame.sample_rate_hz_ = 16000; |
| frame.samples_per_channel_ = frame.sample_rate_hz_ / 100; |
| const size_t kSamplesToRead = frame.num_channels_ * |
| frame.samples_per_channel_; |
| |
| agc_stat.SetActivityThreshold(FLAGS_activity_threshold); |
| |
| int ret_val = 0; |
| int num_frames = 0; |
| int agc_vad[kMaxNumFrames]; |
| uint8_t true_vad[kMaxNumFrames]; |
| double p_video = 0.5; |
| int total_active = 0; |
| int total_passive = 0; |
| int total_false_positive = 0; |
| int total_missed_detection = 0; |
| int onset_adaptation = 0; |
| int num_onsets = 0; |
| bool onset = false; |
| uint8_t previous_true_vad = 0; |
| int num_not_adapted = 0; |
| size_t true_vad_index = 0; |
| bool in_false_positive_region = false; |
| int total_false_positive_duration = 0; |
| bool video_adapted = false; |
| while (kSamplesToRead == fread(frame.data_, sizeof(int16_t), |
| kSamplesToRead, pcm_fid)) { |
| assert(true_vad_index < kMaxNumFrames); |
| ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1, |
| true_vad_fid)) |
| << "Size mismatch between True-VAD and the PCM file.\n"; |
| if (video_vad_fid != NULL) { |
| ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) << |
| "Not enough video-based VAD probabilities."; |
| } |
| |
| // Negative video activity indicates that the video-based VAD is not yet |
| // adapted. Disregards the learning phase in statistics. |
| if (p_video < 0) { |
| if (video_adapted) { |
| fprintf(stderr, "Negative video probabilities ONLY allowed at the " |
| "beginning of the sequence, not in the middle.\n"); |
| exit(1); |
| } |
| continue; |
| } else { |
| video_adapted = true; |
| } |
| |
| num_frames++; |
| uint8_t last_true_vad; |
| if (true_vad_index == 0) { |
| last_true_vad = previous_true_vad; |
| } else { |
| last_true_vad = true_vad[true_vad_index - 1]; |
| } |
| if (last_true_vad == 1 && true_vad[true_vad_index] == 0) { |
| agc_stat.Reset(); |
| } |
| true_vad_index++; |
| |
| DitherSilence(&frame); |
| |
| ret_val = agc_stat.AddAudio(frame, p_video, agc_vad); |
| ASSERT_GE(ret_val, 0); |
| |
| if (ret_val > 0) { |
| ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val)); |
| for (int n = 0; n < ret_val; n++) { |
| if (true_vad[n] == 1) { |
| total_active++; |
| if (previous_true_vad == 0) { |
| num_onsets++; |
| onset = true; |
| } |
| if (agc_vad[n] == 0) { |
| total_missed_detection++; |
| if (onset) |
| onset_adaptation++; |
| } else { |
| in_false_positive_region = false; |
| onset = false; |
| } |
| } else if (true_vad[n] == 0) { |
| // Check if |on_set| flag is still up. If so it means that we totally |
| // missed an active region |
| if (onset) |
| num_not_adapted++; |
| onset = false; |
| |
| total_passive++; |
| if (agc_vad[n] == 1) { |
| total_false_positive++; |
| in_false_positive_region = true; |
| } |
| if (in_false_positive_region) { |
| total_false_positive_duration++; |
| } |
| } else { |
| ASSERT_TRUE(false) << "Invalid value for true-VAD.\n"; |
| } |
| previous_true_vad = true_vad[n]; |
| } |
| true_vad_index = 0; |
| } |
| } |
| |
| if (results_fid != NULL) { |
| fprintf(results_fid, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", |
| total_active, |
| total_missed_detection, |
| total_passive, |
| total_false_positive, |
| num_onsets, |
| num_not_adapted, |
| static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), |
| static_cast<float>(total_false_positive_duration) / |
| (total_passive + 1e-12)); |
| } |
| fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", |
| total_active, |
| total_missed_detection, |
| total_passive, |
| total_false_positive, |
| num_onsets, |
| num_not_adapted, |
| static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), |
| static_cast<float>(total_false_positive_duration) / |
| (total_passive + 1e-12)); |
| |
| fclose(true_vad_fid); |
| fclose(pcm_fid); |
| if (video_vad_fid != NULL) { |
| fclose(video_vad_fid); |
| } |
| if (results_fid != NULL) { |
| fclose(results_fid); |
| } |
| } |
| |
| } // namespace webrtc |
| |
| int main(int argc, char* argv[]) { |
| char kUsage[] = |
| "\nCompute the number of misdetected and false-positive frames. Not\n" |
| " that for each frame of audio (10 ms) there should be one true\n" |
| " activity. If any video-based activity is given, there should also be\n" |
| " one probability per frame.\n" |
| "\nUsage:\n\n" |
| "activity_metric input_pcm [options]\n" |
| "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " |
| "format.\n\n"; |
| google::SetUsageMessage(kUsage); |
| google::ParseCommandLineFlags(&argc, &argv, true); |
| webrtc::void_main(argc, argv); |
| return 0; |
| } |