Reland "Add tool for aliging video files"

This is a reland of b2c0e8f60fad10e2786e5e131136a0da1299d883

Original change's description:
> Add tool for aliging video files
>
> This class adds logic for aligning a test video to a reference video
> by an algorithm that maximizes SSIM between them. Aligned videos will be
> easier to run video quality metrics on. This is a generic way of
> aligning videos and can be replace the intrusive barcode stamping that
> we currently use. This will be done in a follow-up CL.
>
> Change-Id: I71cf1e2179c0f1e03eff9e4d8fc492fd5cfbbb1c
> Bug: webrtc:9642
> Reviewed-on: https://webrtc-review.googlesource.com/94773
> Commit-Queue: Magnus Jedvert <magjed@webrtc.org>
> Reviewed-by: Patrik Höglund <phoglund@webrtc.org>
> Reviewed-by: Paulina Hensman <phensman@webrtc.org>
> Cr-Commit-Position: refs/heads/master@{#24407}

TBR=phensman,phoglund

Bug: webrtc:9642
Change-Id: I35d6b0e598335b8d80fbfa37ba06d5c651bda4f6
Reviewed-on: https://webrtc-review.googlesource.com/98040
Commit-Queue: Magnus Jedvert <magjed@webrtc.org>
Reviewed-by: Magnus Jedvert <magjed@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#24580}
diff --git a/rtc_tools/BUILD.gn b/rtc_tools/BUILD.gn
index b2a97ea..294cc5a 100644
--- a/rtc_tools/BUILD.gn
+++ b/rtc_tools/BUILD.gn
@@ -77,12 +77,17 @@
   sources = [
     "frame_analyzer/video_quality_analysis.cc",
     "frame_analyzer/video_quality_analysis.h",
+    "frame_analyzer/video_temporal_aligner.cc",
+    "frame_analyzer/video_temporal_aligner.h",
   ]
   deps = [
     ":video_file_reader",
     "../api/video:video_frame_i420",
     "../common_video",
+    "../rtc_base:checks",
+    "../rtc_base:rtc_base_approved",
     "../test:perf_test",
+    "//third_party/abseil-cpp/absl/types:optional",
     "//third_party/libyuv",
   ]
 }
@@ -298,6 +303,7 @@
   }
 
   tools_unittests_resources = [
+    "../resources/foreman_128x96.yuv",
     "../resources/foreman_cif.yuv",
     "../resources/reference_less_video_test_file.y4m",
   ]
@@ -318,6 +324,7 @@
     sources = [
       "frame_analyzer/reference_less_video_analysis_unittest.cc",
       "frame_analyzer/video_quality_analysis_unittest.cc",
+      "frame_analyzer/video_temporal_aligner_unittest.cc",
       "frame_editing/frame_editing_unittest.cc",
       "sanitizers_unittest.cc",
       "simple_command_line_parser_unittest.cc",
diff --git a/rtc_tools/frame_analyzer/video_temporal_aligner.cc b/rtc_tools/frame_analyzer/video_temporal_aligner.cc
new file mode 100644
index 0000000..f106852
--- /dev/null
+++ b/rtc_tools/frame_analyzer/video_temporal_aligner.cc
@@ -0,0 +1,229 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "rtc_tools/frame_analyzer/video_temporal_aligner.h"
+
+#include <algorithm>
+#include <cmath>
+#include <deque>
+#include <limits>
+#include <vector>
+
+#include "api/video/i420_buffer.h"
+#include "rtc_base/checks.h"
+#include "rtc_base/refcountedobject.h"
+#include "rtc_tools/frame_analyzer/video_quality_analysis.h"
+#include "third_party/libyuv/include/libyuv/compare.h"
+
+namespace webrtc {
+namespace test {
+
+namespace {
+
+// This constant controls how many frames we look ahead while seeking for the
+// match for the next frame. Note that we may span bigger gaps than this number
+// since we reset the counter as soon as we find a better match. The seeking
+// will stop when there is no improvement in the next kNumberOfFramesLookAhead
+// frames. Typically, the SSIM will improve as we get closer and closer to the
+// real match.
+const int kNumberOfFramesLookAhead = 60;
+
+// Helper class that takes a video and generates an infinite looping video.
+class LoopingVideo : public rtc::RefCountedObject<Video> {
+ public:
+  explicit LoopingVideo(const rtc::scoped_refptr<Video>& video)
+      : video_(video) {}
+
+  int width() const override { return video_->width(); }
+  int height() const override { return video_->height(); }
+  size_t number_of_frames() const override {
+    return std::numeric_limits<size_t>::max();
+  }
+
+  rtc::scoped_refptr<I420BufferInterface> GetFrame(
+      size_t index) const override {
+    return video_->GetFrame(index % video_->number_of_frames());
+  }
+
+ private:
+  const rtc::scoped_refptr<Video> video_;
+};
+
+// Helper class that take a vector of frame indices and a video and produces a
+// new video where the frames have been reshuffled.
+class ReorderedVideo : public rtc::RefCountedObject<Video> {
+ public:
+  ReorderedVideo(const rtc::scoped_refptr<Video>& video,
+                 const std::vector<size_t>& indices)
+      : video_(video), indices_(indices) {}
+
+  int width() const override { return video_->width(); }
+  int height() const override { return video_->height(); }
+  size_t number_of_frames() const override { return indices_.size(); }
+
+  rtc::scoped_refptr<I420BufferInterface> GetFrame(
+      size_t index) const override {
+    return video_->GetFrame(indices_.at(index));
+  }
+
+ private:
+  const rtc::scoped_refptr<Video> video_;
+  const std::vector<size_t> indices_;
+};
+
+// Helper class that takes a video and produces a downscaled video.
+class DownscaledVideo : public rtc::RefCountedObject<Video> {
+ public:
+  DownscaledVideo(float scale_factor, const rtc::scoped_refptr<Video>& video)
+      : downscaled_width_(
+            static_cast<int>(round(scale_factor * video->width()))),
+        downscaled_height_(
+            static_cast<int>(round(scale_factor * video->height()))),
+        video_(video) {}
+
+  int width() const override { return downscaled_width_; }
+  int height() const override { return downscaled_height_; }
+  size_t number_of_frames() const override {
+    return video_->number_of_frames();
+  }
+
+  rtc::scoped_refptr<I420BufferInterface> GetFrame(
+      size_t index) const override {
+    const rtc::scoped_refptr<I420BufferInterface> frame =
+        video_->GetFrame(index);
+    rtc::scoped_refptr<I420Buffer> downscaled_frame =
+        I420Buffer::Create(downscaled_width_, downscaled_height_);
+    downscaled_frame->ScaleFrom(*frame);
+    return downscaled_frame;
+  }
+
+ private:
+  const int downscaled_width_;
+  const int downscaled_height_;
+  const rtc::scoped_refptr<Video> video_;
+};
+
+// Helper class that takes a video and caches the latest frame access. This
+// improves performance a lot since the original source is often from a file.
+class CachedVideo : public rtc::RefCountedObject<Video> {
+ public:
+  CachedVideo(int max_cache_size, const rtc::scoped_refptr<Video>& video)
+      : max_cache_size_(max_cache_size), video_(video) {}
+
+  int width() const override { return video_->width(); }
+  int height() const override { return video_->height(); }
+  size_t number_of_frames() const override {
+    return video_->number_of_frames();
+  }
+
+  rtc::scoped_refptr<I420BufferInterface> GetFrame(
+      size_t index) const override {
+    for (const CachedFrame& cached_frame : cache_) {
+      if (cached_frame.index == index)
+        return cached_frame.frame;
+    }
+
+    rtc::scoped_refptr<I420BufferInterface> frame = video_->GetFrame(index);
+    cache_.push_front({index, frame});
+    if (cache_.size() > max_cache_size_)
+      cache_.pop_back();
+
+    return frame;
+  }
+
+ private:
+  struct CachedFrame {
+    size_t index;
+    rtc::scoped_refptr<I420BufferInterface> frame;
+  };
+
+  const size_t max_cache_size_;
+  const rtc::scoped_refptr<Video> video_;
+  mutable std::deque<CachedFrame> cache_;
+};
+
+// Try matching the test frame against all frames in the reference video and
+// return the index of the best matching frame.
+size_t FindBestMatch(const rtc::scoped_refptr<I420BufferInterface>& test_frame,
+                     const Video& reference_video) {
+  std::vector<double> ssim;
+  for (const auto& ref_frame : reference_video)
+    ssim.push_back(Ssim(test_frame, ref_frame));
+  return std::distance(ssim.begin(),
+                       std::max_element(ssim.begin(), ssim.end()));
+}
+
+// Find and return the index of the frame matching the test frame. The search
+// starts at the starting index and continues until there is no better match
+// within the next kNumberOfFramesLookAhead frames.
+size_t FindNextMatch(const rtc::scoped_refptr<I420BufferInterface>& test_frame,
+                     const Video& reference_video,
+                     size_t start_index) {
+  const double start_ssim =
+      Ssim(test_frame, reference_video.GetFrame(start_index));
+  for (int i = 1; i < kNumberOfFramesLookAhead; ++i) {
+    const size_t next_index = start_index + i;
+    // If we find a better match, restart the search at that point.
+    if (start_ssim < Ssim(test_frame, reference_video.GetFrame(next_index)))
+      return FindNextMatch(test_frame, reference_video, next_index);
+  }
+  // The starting index was the best match.
+  return start_index;
+}
+
+}  // namespace
+
+std::vector<size_t> FindMatchingFrameIndices(
+    const rtc::scoped_refptr<Video>& reference_video,
+    const rtc::scoped_refptr<Video>& test_video) {
+  // This is done to get a 10x speedup. We don't need the full resolution in
+  // order to match frames, and we should limit file access and not read the
+  // same memory tens of times.
+  const float kScaleFactor = 0.25f;
+  const rtc::scoped_refptr<Video> cached_downscaled_reference_video =
+      new CachedVideo(kNumberOfFramesLookAhead,
+                      new DownscaledVideo(kScaleFactor, reference_video));
+  const rtc::scoped_refptr<Video> downscaled_test_video =
+      new DownscaledVideo(kScaleFactor, test_video);
+
+  // Assume the video is looping around.
+  const rtc::scoped_refptr<Video> looping_reference_video =
+      new LoopingVideo(cached_downscaled_reference_video);
+
+  std::vector<size_t> match_indices;
+  for (const rtc::scoped_refptr<I420BufferInterface>& test_frame :
+       *downscaled_test_video) {
+    if (match_indices.empty()) {
+      // First frame.
+      match_indices.push_back(
+          FindBestMatch(test_frame, *cached_downscaled_reference_video));
+    } else {
+      match_indices.push_back(FindNextMatch(
+          test_frame, *looping_reference_video, match_indices.back()));
+    }
+  }
+
+  return match_indices;
+}
+
+rtc::scoped_refptr<Video> ReorderVideo(const rtc::scoped_refptr<Video>& video,
+                                       const std::vector<size_t>& indices) {
+  return new ReorderedVideo(video, indices);
+}
+
+rtc::scoped_refptr<Video> GenerateAlignedReferenceVideo(
+    const rtc::scoped_refptr<Video>& reference_video,
+    const rtc::scoped_refptr<Video>& test_video) {
+  return ReorderVideo(new LoopingVideo(reference_video),
+                      FindMatchingFrameIndices(reference_video, test_video));
+}
+
+}  // namespace test
+}  // namespace webrtc
diff --git a/rtc_tools/frame_analyzer/video_temporal_aligner.h b/rtc_tools/frame_analyzer/video_temporal_aligner.h
new file mode 100644
index 0000000..6f2015a
--- /dev/null
+++ b/rtc_tools/frame_analyzer/video_temporal_aligner.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef RTC_TOOLS_FRAME_ANALYZER_VIDEO_TEMPORAL_ALIGNER_H_
+#define RTC_TOOLS_FRAME_ANALYZER_VIDEO_TEMPORAL_ALIGNER_H_
+
+#include <vector>
+
+#include "rtc_tools/video_file_reader.h"
+
+namespace webrtc {
+namespace test {
+
+// Returns a vector with the same size as the given test video. Each index
+// corresponds to what reference frame that test frame matches to. These
+// indices are strictly increasing and might loop around the reference video,
+// e.g. their values can be bigger than the number of frames in the reference
+// video and they should be interpreted modulo that size. The matching frames
+// will be determined by maximizing SSIM.
+std::vector<size_t> FindMatchingFrameIndices(
+    const rtc::scoped_refptr<Video>& reference_video,
+    const rtc::scoped_refptr<Video>& test_video);
+
+// Generate a new video using the frames from the original video. The returned
+// video will have the same number of frames as the size of |indices|, and
+// frame nr i in the returned video will point to frame nr indices[i] in the
+// original video.
+rtc::scoped_refptr<Video> ReorderVideo(const rtc::scoped_refptr<Video>& video,
+                                       const std::vector<size_t>& indices);
+
+// Returns a modified version of the reference video where the frames have
+// been aligned to the test video. The test video is assumed to be captured
+// during a quality measurement test where the reference video is the source.
+// The test video may start at an arbitrary position in the reference video
+// and there might be missing frames. The reference video is assumed to loop
+// over when it reaches the end. The returned result is a version of the
+// reference video where the missing frames are left out so it aligns to the
+// test video.
+rtc::scoped_refptr<Video> GenerateAlignedReferenceVideo(
+    const rtc::scoped_refptr<Video>& reference_video,
+    const rtc::scoped_refptr<Video>& test_video);
+
+}  // namespace test
+}  // namespace webrtc
+
+#endif  // RTC_TOOLS_FRAME_ANALYZER_VIDEO_TEMPORAL_ALIGNER_H_
diff --git a/rtc_tools/frame_analyzer/video_temporal_aligner_unittest.cc b/rtc_tools/frame_analyzer/video_temporal_aligner_unittest.cc
new file mode 100644
index 0000000..8fc8d19
--- /dev/null
+++ b/rtc_tools/frame_analyzer/video_temporal_aligner_unittest.cc
@@ -0,0 +1,127 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "rtc_tools/frame_analyzer/video_temporal_aligner.h"
+
+#include "rtc_tools/frame_analyzer/video_quality_analysis.h"
+#include "rtc_tools/video_file_reader.h"
+#include "test/gtest.h"
+#include "test/testsupport/fileutils.h"
+
+namespace webrtc {
+namespace test {
+
+class VideoTemporalAlignerTest : public ::testing::Test {
+ protected:
+  void SetUp() {
+    reference_video =
+        OpenYuvFile(ResourcePath("foreman_128x96", "yuv"), 128, 96);
+    ASSERT_TRUE(reference_video);
+  }
+
+  rtc::scoped_refptr<Video> reference_video;
+};
+
+TEST_F(VideoTemporalAlignerTest, FindMatchingFrameIndicesEmpty) {
+  rtc::scoped_refptr<Video> empty_test_video =
+      ReorderVideo(reference_video, std::vector<size_t>());
+
+  const std::vector<size_t> matched_indices =
+      FindMatchingFrameIndices(reference_video, empty_test_video);
+
+  EXPECT_TRUE(matched_indices.empty());
+}
+
+TEST_F(VideoTemporalAlignerTest, FindMatchingFrameIndicesIdentity) {
+  const std::vector<size_t> indices =
+      FindMatchingFrameIndices(reference_video, reference_video);
+
+  EXPECT_EQ(indices.size(), reference_video->number_of_frames());
+  for (size_t i = 0; i < indices.size(); ++i)
+    EXPECT_EQ(i, indices[i]);
+}
+
+TEST_F(VideoTemporalAlignerTest, FindMatchingFrameIndicesDuplicateFrames) {
+  const std::vector<size_t> indices = {2, 2, 2, 2};
+
+  // Generate a test video based on this sequence.
+  rtc::scoped_refptr<Video> test_video = ReorderVideo(reference_video, indices);
+
+  const std::vector<size_t> matched_indices =
+      FindMatchingFrameIndices(reference_video, test_video);
+
+  EXPECT_EQ(indices, matched_indices);
+}
+
+TEST_F(VideoTemporalAlignerTest, FindMatchingFrameIndicesLoopAround) {
+  std::vector<size_t> indices;
+  for (size_t i = 0; i < reference_video->number_of_frames() * 2; ++i)
+    indices.push_back(i % reference_video->number_of_frames());
+
+  // Generate a test video based on this sequence.
+  rtc::scoped_refptr<Video> test_video = ReorderVideo(reference_video, indices);
+
+  const std::vector<size_t> matched_indices =
+      FindMatchingFrameIndices(reference_video, test_video);
+
+  for (size_t i = 0; i < matched_indices.size(); ++i)
+    EXPECT_EQ(i, matched_indices[i]);
+}
+
+TEST_F(VideoTemporalAlignerTest, FindMatchingFrameIndicesStressTest) {
+  std::vector<size_t> indices;
+  // Arbitrary start index.
+  const size_t start_index = 12345;
+  // Generate some generic sequence of frames.
+  indices.push_back(start_index % reference_video->number_of_frames());
+  indices.push_back((start_index + 1) % reference_video->number_of_frames());
+  indices.push_back((start_index + 2) % reference_video->number_of_frames());
+  indices.push_back((start_index + 5) % reference_video->number_of_frames());
+  indices.push_back((start_index + 10) % reference_video->number_of_frames());
+  indices.push_back((start_index + 20) % reference_video->number_of_frames());
+  indices.push_back((start_index + 20) % reference_video->number_of_frames());
+  indices.push_back((start_index + 22) % reference_video->number_of_frames());
+  indices.push_back((start_index + 32) % reference_video->number_of_frames());
+
+  // Generate a test video based on this sequence.
+  rtc::scoped_refptr<Video> test_video = ReorderVideo(reference_video, indices);
+
+  const std::vector<size_t> matched_indices =
+      FindMatchingFrameIndices(reference_video, test_video);
+
+  EXPECT_EQ(indices, matched_indices);
+}
+
+TEST_F(VideoTemporalAlignerTest, GenerateAlignedReferenceVideo) {
+  // Arbitrary start index.
+  const size_t start_index = 12345;
+  std::vector<size_t> indices;
+  const size_t frame_step = 10;
+  for (size_t i = 0; i < reference_video->number_of_frames() / frame_step;
+       ++i) {
+    indices.push_back((start_index + i * frame_step) %
+                      reference_video->number_of_frames());
+  }
+
+  // Generate a test video based on this sequence.
+  rtc::scoped_refptr<Video> test_video = ReorderVideo(reference_video, indices);
+
+  rtc::scoped_refptr<Video> aligned_reference_video =
+      GenerateAlignedReferenceVideo(reference_video, test_video);
+
+  // Assume perfect match, i.e. ssim == 1, for all frames.
+  for (size_t i = 0; i < test_video->number_of_frames(); ++i) {
+    EXPECT_EQ(1.0, Ssim(test_video->GetFrame(i),
+                        aligned_reference_video->GetFrame(i)));
+  }
+}
+
+}  // namespace test
+}  // namespace webrtc