| /* | 
 |  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 
 |  * | 
 |  *  Use of this source code is governed by a BSD-style license | 
 |  *  that can be found in the LICENSE file in the root of the source | 
 |  *  tree. An additional intellectual property rights grant can be found | 
 |  *  in the file PATENTS.  All contributing project authors may | 
 |  *  be found in the AUTHORS file in the root of the source tree. | 
 |  */ | 
 |  | 
 | #include <assert.h> | 
 | #include <stdint.h> | 
 | #include <stdio.h> | 
 | #include <stdlib.h> | 
 | #include <string.h> | 
 | #include <iostream> | 
 | #include <memory> | 
 |  | 
 | #include "api/audio/audio_frame.h" | 
 | #include "modules/audio_processing/agc/loudness_histogram.h" | 
 | #include "modules/audio_processing/vad/common.h" | 
 | #include "modules/audio_processing/vad/pitch_based_vad.h" | 
 | #include "modules/audio_processing/vad/standalone_vad.h" | 
 | #include "modules/audio_processing/vad/vad_audio_proc.h" | 
 | #include "rtc_base/flags.h" | 
 | #include "rtc_base/numerics/safe_minmax.h" | 
 | #include "test/gtest.h" | 
 |  | 
 | static const int kAgcAnalWindowSamples = 100; | 
 | static const float kDefaultActivityThreshold = 0.3f; | 
 |  | 
 | WEBRTC_DEFINE_bool(standalone_vad, true, "enable stand-alone VAD"); | 
 | WEBRTC_DEFINE_string(true_vad, | 
 |                      "", | 
 |                      "name of a file containing true VAD in 'int'" | 
 |                      " format"); | 
 | WEBRTC_DEFINE_string( | 
 |     video_vad, | 
 |     "", | 
 |     "name of a file containing video VAD (activity" | 
 |     " probabilities) in double format. One activity per 10ms is" | 
 |     " required. If no file is given the video information is not" | 
 |     " incorporated. Negative activity is interpreted as video is" | 
 |     " not adapted and the statistics are not computed during" | 
 |     " the learning phase. Note that the negative video activities" | 
 |     " are ONLY allowed at the beginning."); | 
 | WEBRTC_DEFINE_string( | 
 |     result, | 
 |     "", | 
 |     "name of a file to write the results. The results" | 
 |     " will be appended to the end of the file. This is optional."); | 
 | WEBRTC_DEFINE_string(audio_content, | 
 |                      "", | 
 |                      "name of a file where audio content is written" | 
 |                      " to, in double format."); | 
 | WEBRTC_DEFINE_float(activity_threshold, | 
 |                     kDefaultActivityThreshold, | 
 |                     "Activity threshold"); | 
 | WEBRTC_DEFINE_bool(help, false, "prints this message"); | 
 |  | 
 | namespace webrtc { | 
 |  | 
 | // TODO(turajs) A new CL will be committed soon where ExtractFeatures will | 
 | // notify the caller of "silence" input, instead of bailing out. We would not | 
 | // need the following function when such a change is made. | 
 |  | 
 | // Add some dither to quiet frames. This avoids the ExtractFeatures skip a | 
 | // silence frame. Otherwise true VAD would drift with respect to the audio. | 
 | // We only consider mono inputs. | 
 | static void DitherSilence(AudioFrame* frame) { | 
 |   ASSERT_EQ(1u, frame->num_channels_); | 
 |   const double kRmsSilence = 5; | 
 |   const double sum_squared_silence = | 
 |       kRmsSilence * kRmsSilence * frame->samples_per_channel_; | 
 |   double sum_squared = 0; | 
 |   int16_t* frame_data = frame->mutable_data(); | 
 |   for (size_t n = 0; n < frame->samples_per_channel_; n++) | 
 |     sum_squared += frame_data[n] * frame_data[n]; | 
 |   if (sum_squared <= sum_squared_silence) { | 
 |     for (size_t n = 0; n < frame->samples_per_channel_; n++) | 
 |       frame_data[n] = (rand() & 0xF) - 8;  // NOLINT: ignore non-threadsafe. | 
 |   } | 
 | } | 
 |  | 
 | class AgcStat { | 
 |  public: | 
 |   AgcStat() | 
 |       : video_index_(0), | 
 |         activity_threshold_(kDefaultActivityThreshold), | 
 |         audio_content_(LoudnessHistogram::Create(kAgcAnalWindowSamples)), | 
 |         audio_processing_(new VadAudioProc()), | 
 |         vad_(new PitchBasedVad()), | 
 |         standalone_vad_(StandaloneVad::Create()), | 
 |         audio_content_fid_(NULL) { | 
 |     for (size_t n = 0; n < kMaxNumFrames; n++) | 
 |       video_vad_[n] = 0.5; | 
 |   } | 
 |  | 
 |   ~AgcStat() { | 
 |     if (audio_content_fid_ != NULL) { | 
 |       fclose(audio_content_fid_); | 
 |     } | 
 |   } | 
 |  | 
 |   void set_audio_content_file(FILE* audio_content_fid) { | 
 |     audio_content_fid_ = audio_content_fid; | 
 |   } | 
 |  | 
 |   int AddAudio(const AudioFrame& frame, double p_video, int* combined_vad) { | 
 |     if (frame.num_channels_ != 1 || | 
 |         frame.samples_per_channel_ != kSampleRateHz / 100 || | 
 |         frame.sample_rate_hz_ != kSampleRateHz) | 
 |       return -1; | 
 |     video_vad_[video_index_++] = p_video; | 
 |     AudioFeatures features; | 
 |     const int16_t* frame_data = frame.data(); | 
 |     audio_processing_->ExtractFeatures(frame_data, frame.samples_per_channel_, | 
 |                                        &features); | 
 |     if (FLAG_standalone_vad) { | 
 |       standalone_vad_->AddAudio(frame_data, frame.samples_per_channel_); | 
 |     } | 
 |     if (features.num_frames > 0) { | 
 |       double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; | 
 |       if (FLAG_standalone_vad) { | 
 |         standalone_vad_->GetActivity(p, kMaxNumFrames); | 
 |       } | 
 |       // TODO(turajs) combining and limiting are used in the source files as | 
 |       // well they can be moved to utility. | 
 |       // Combine Video and stand-alone VAD. | 
 |       for (size_t n = 0; n < features.num_frames; n++) { | 
 |         double p_active = p[n] * video_vad_[n]; | 
 |         double p_passive = (1 - p[n]) * (1 - video_vad_[n]); | 
 |         p[n] = rtc::SafeClamp(p_active / (p_active + p_passive), 0.01, 0.99); | 
 |       } | 
 |       if (vad_->VoicingProbability(features, p) < 0) | 
 |         return -1; | 
 |       for (size_t n = 0; n < features.num_frames; n++) { | 
 |         audio_content_->Update(features.rms[n], p[n]); | 
 |         double ac = audio_content_->AudioContent(); | 
 |         if (audio_content_fid_ != NULL) { | 
 |           fwrite(&ac, sizeof(ac), 1, audio_content_fid_); | 
 |         } | 
 |         if (ac > kAgcAnalWindowSamples * activity_threshold_) { | 
 |           combined_vad[n] = 1; | 
 |         } else { | 
 |           combined_vad[n] = 0; | 
 |         } | 
 |       } | 
 |       video_index_ = 0; | 
 |     } | 
 |     return static_cast<int>(features.num_frames); | 
 |   } | 
 |  | 
 |   void Reset() { audio_content_->Reset(); } | 
 |  | 
 |   void SetActivityThreshold(double activity_threshold) { | 
 |     activity_threshold_ = activity_threshold; | 
 |   } | 
 |  | 
 |  private: | 
 |   int video_index_; | 
 |   double activity_threshold_; | 
 |   double video_vad_[kMaxNumFrames]; | 
 |   std::unique_ptr<LoudnessHistogram> audio_content_; | 
 |   std::unique_ptr<VadAudioProc> audio_processing_; | 
 |   std::unique_ptr<PitchBasedVad> vad_; | 
 |   std::unique_ptr<StandaloneVad> standalone_vad_; | 
 |  | 
 |   FILE* audio_content_fid_; | 
 | }; | 
 |  | 
 | void void_main(int argc, char* argv[]) { | 
 |   webrtc::AgcStat agc_stat; | 
 |  | 
 |   FILE* pcm_fid = fopen(argv[1], "rb"); | 
 |   ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1]; | 
 |  | 
 |   if (argc < 2) { | 
 |     fprintf(stderr, "\nNot Enough arguments\n"); | 
 |   } | 
 |  | 
 |   FILE* true_vad_fid = NULL; | 
 |   ASSERT_GT(strlen(FLAG_true_vad), 0u) << "Specify the file containing true " | 
 |                                           "VADs using --true_vad flag."; | 
 |   true_vad_fid = fopen(FLAG_true_vad, "rb"); | 
 |   ASSERT_TRUE(true_vad_fid != NULL) | 
 |       << "Cannot open the active list " << FLAG_true_vad; | 
 |  | 
 |   FILE* results_fid = NULL; | 
 |   if (strlen(FLAG_result) > 0) { | 
 |     // True if this is the first time writing to this function and we add a | 
 |     // header to the beginning of the file. | 
 |     bool write_header; | 
 |     // Open in the read mode. If it fails, the file doesn't exist and has to | 
 |     // write a header for it. Otherwise no need to write a header. | 
 |     results_fid = fopen(FLAG_result, "r"); | 
 |     if (results_fid == NULL) { | 
 |       write_header = true; | 
 |     } else { | 
 |       fclose(results_fid); | 
 |       write_header = false; | 
 |     } | 
 |     // Open in append mode. | 
 |     results_fid = fopen(FLAG_result, "a"); | 
 |     ASSERT_TRUE(results_fid != NULL) | 
 |         << "Cannot open the file, " << FLAG_result << ", to write the results."; | 
 |     // Write the header if required. | 
 |     if (write_header) { | 
 |       fprintf(results_fid, | 
 |               "%% Total Active,  Misdetection,  " | 
 |               "Total inactive,  False Positive,  On-sets,  Missed segments,  " | 
 |               "Average response\n"); | 
 |     } | 
 |   } | 
 |  | 
 |   FILE* video_vad_fid = NULL; | 
 |   if (strlen(FLAG_video_vad) > 0) { | 
 |     video_vad_fid = fopen(FLAG_video_vad, "rb"); | 
 |     ASSERT_TRUE(video_vad_fid != NULL) | 
 |         << "Cannot open the file, " << FLAG_video_vad | 
 |         << " to read video-based VAD decisions.\n"; | 
 |   } | 
 |  | 
 |   // AgsStat will be the owner of this file and will close it at its | 
 |   // destructor. | 
 |   FILE* audio_content_fid = NULL; | 
 |   if (strlen(FLAG_audio_content) > 0) { | 
 |     audio_content_fid = fopen(FLAG_audio_content, "wb"); | 
 |     ASSERT_TRUE(audio_content_fid != NULL) | 
 |         << "Cannot open file, " << FLAG_audio_content | 
 |         << " to write audio-content.\n"; | 
 |     agc_stat.set_audio_content_file(audio_content_fid); | 
 |   } | 
 |  | 
 |   webrtc::AudioFrame frame; | 
 |   frame.num_channels_ = 1; | 
 |   frame.sample_rate_hz_ = 16000; | 
 |   frame.samples_per_channel_ = frame.sample_rate_hz_ / 100; | 
 |   const size_t kSamplesToRead = | 
 |       frame.num_channels_ * frame.samples_per_channel_; | 
 |  | 
 |   agc_stat.SetActivityThreshold(FLAG_activity_threshold); | 
 |  | 
 |   int ret_val = 0; | 
 |   int num_frames = 0; | 
 |   int agc_vad[kMaxNumFrames]; | 
 |   uint8_t true_vad[kMaxNumFrames]; | 
 |   double p_video = 0.5; | 
 |   int total_active = 0; | 
 |   int total_passive = 0; | 
 |   int total_false_positive = 0; | 
 |   int total_missed_detection = 0; | 
 |   int onset_adaptation = 0; | 
 |   int num_onsets = 0; | 
 |   bool onset = false; | 
 |   uint8_t previous_true_vad = 0; | 
 |   int num_not_adapted = 0; | 
 |   size_t true_vad_index = 0; | 
 |   bool in_false_positive_region = false; | 
 |   int total_false_positive_duration = 0; | 
 |   bool video_adapted = false; | 
 |   while (kSamplesToRead == fread(frame.mutable_data(), sizeof(int16_t), | 
 |                                  kSamplesToRead, pcm_fid)) { | 
 |     assert(true_vad_index < kMaxNumFrames); | 
 |     ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1, | 
 |                         true_vad_fid)) | 
 |         << "Size mismatch between True-VAD and the PCM file.\n"; | 
 |     if (video_vad_fid != NULL) { | 
 |       ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) | 
 |           << "Not enough video-based VAD probabilities."; | 
 |     } | 
 |  | 
 |     // Negative video activity indicates that the video-based VAD is not yet | 
 |     // adapted. Disregards the learning phase in statistics. | 
 |     if (p_video < 0) { | 
 |       if (video_adapted) { | 
 |         fprintf(stderr, | 
 |                 "Negative video probabilities ONLY allowed at the " | 
 |                 "beginning of the sequence, not in the middle.\n"); | 
 |         exit(1); | 
 |       } | 
 |       continue; | 
 |     } else { | 
 |       video_adapted = true; | 
 |     } | 
 |  | 
 |     num_frames++; | 
 |     uint8_t last_true_vad; | 
 |     if (true_vad_index == 0) { | 
 |       last_true_vad = previous_true_vad; | 
 |     } else { | 
 |       last_true_vad = true_vad[true_vad_index - 1]; | 
 |     } | 
 |     if (last_true_vad == 1 && true_vad[true_vad_index] == 0) { | 
 |       agc_stat.Reset(); | 
 |     } | 
 |     true_vad_index++; | 
 |  | 
 |     DitherSilence(&frame); | 
 |  | 
 |     ret_val = agc_stat.AddAudio(frame, p_video, agc_vad); | 
 |     ASSERT_GE(ret_val, 0); | 
 |  | 
 |     if (ret_val > 0) { | 
 |       ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val)); | 
 |       for (int n = 0; n < ret_val; n++) { | 
 |         if (true_vad[n] == 1) { | 
 |           total_active++; | 
 |           if (previous_true_vad == 0) { | 
 |             num_onsets++; | 
 |             onset = true; | 
 |           } | 
 |           if (agc_vad[n] == 0) { | 
 |             total_missed_detection++; | 
 |             if (onset) | 
 |               onset_adaptation++; | 
 |           } else { | 
 |             in_false_positive_region = false; | 
 |             onset = false; | 
 |           } | 
 |         } else if (true_vad[n] == 0) { | 
 |           // Check if |on_set| flag is still up. If so it means that we totally | 
 |           // missed an active region | 
 |           if (onset) | 
 |             num_not_adapted++; | 
 |           onset = false; | 
 |  | 
 |           total_passive++; | 
 |           if (agc_vad[n] == 1) { | 
 |             total_false_positive++; | 
 |             in_false_positive_region = true; | 
 |           } | 
 |           if (in_false_positive_region) { | 
 |             total_false_positive_duration++; | 
 |           } | 
 |         } else { | 
 |           ASSERT_TRUE(false) << "Invalid value for true-VAD.\n"; | 
 |         } | 
 |         previous_true_vad = true_vad[n]; | 
 |       } | 
 |       true_vad_index = 0; | 
 |     } | 
 |   } | 
 |  | 
 |   if (results_fid != NULL) { | 
 |     fprintf(results_fid, "%4d  %4d  %4d  %4d  %4d  %4d  %4.0f %4.0f\n", | 
 |             total_active, total_missed_detection, total_passive, | 
 |             total_false_positive, num_onsets, num_not_adapted, | 
 |             static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), | 
 |             static_cast<float>(total_false_positive_duration) / | 
 |                 (total_passive + 1e-12)); | 
 |   } | 
 |   fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", total_active, | 
 |           total_missed_detection, total_passive, total_false_positive, | 
 |           num_onsets, num_not_adapted, | 
 |           static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), | 
 |           static_cast<float>(total_false_positive_duration) / | 
 |               (total_passive + 1e-12)); | 
 |  | 
 |   fclose(true_vad_fid); | 
 |   fclose(pcm_fid); | 
 |   if (video_vad_fid != NULL) { | 
 |     fclose(video_vad_fid); | 
 |   } | 
 |   if (results_fid != NULL) { | 
 |     fclose(results_fid); | 
 |   } | 
 | } | 
 |  | 
 | }  // namespace webrtc | 
 |  | 
 | int main(int argc, char* argv[]) { | 
 |   if (argc == 1) { | 
 |     // Print usage information. | 
 |     std::cout | 
 |         << "\nCompute the number of misdetected and false-positive frames. " | 
 |            "Not\n" | 
 |            " that for each frame of audio (10 ms) there should be one true\n" | 
 |            " activity. If any video-based activity is given, there should also " | 
 |            "be\n" | 
 |            " one probability per frame.\n" | 
 |            "Run with --help for more details on available flags.\n" | 
 |            "\nUsage:\n\n" | 
 |            "activity_metric input_pcm [options]\n" | 
 |            "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " | 
 |            "format.\n\n"; | 
 |     return 0; | 
 |   } | 
 |   rtc::FlagList::SetFlagsFromCommandLine(&argc, argv, true); | 
 |   if (FLAG_help) { | 
 |     rtc::FlagList::Print(nullptr, false); | 
 |     return 0; | 
 |   } | 
 |   webrtc::void_main(argc, argv); | 
 |   return 0; | 
 | } |