Add blob-encoding support for RTC event logs

A blob is a string of binary information whose length cannot necessarily
be determined by looking into the string itself. Therefore, concatenating
all blobs without explicitly including their lengths as part of their
encoding is not a viable option.

Bug: webrtc:8111
Change-Id: I89fdca660e89a6a71eff3ecb7b86416312b81f23
Reviewed-on: https://webrtc-review.googlesource.com/c/104201
Commit-Queue: Elad Alon <eladalon@webrtc.org>
Reviewed-by: Björn Terelius <terelius@webrtc.org>
Reviewed-by: Yves Gerey <yvesg@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#25278}
diff --git a/logging/BUILD.gn b/logging/BUILD.gn
index 91c560f..ac0b720 100644
--- a/logging/BUILD.gn
+++ b/logging/BUILD.gn
@@ -156,6 +156,8 @@
 rtc_static_library("rtc_event_log_impl_encoder") {
   visibility = [ "*" ]
   sources = [
+    "rtc_event_log/encoder/blob_encoding.cc",
+    "rtc_event_log/encoder/blob_encoding.h",
     "rtc_event_log/encoder/delta_encoding.cc",
     "rtc_event_log/encoder/delta_encoding.h",
     "rtc_event_log/encoder/rtc_event_log_encoder_legacy.cc",
@@ -180,6 +182,7 @@
     "../rtc_base:checks",
     "../rtc_base:rtc_base_approved",
     "//third_party/abseil-cpp/absl/memory",
+    "//third_party/abseil-cpp/absl/strings:strings",
   ]
 
   if (rtc_enable_protobuf) {
@@ -302,6 +305,7 @@
       assert(rtc_enable_protobuf)
       defines = [ "ENABLE_RTC_EVENT_LOG" ]
       sources = [
+        "rtc_event_log/encoder/blob_encoding_unittest.cc",
         "rtc_event_log/encoder/delta_encoding_unittest.cc",
         "rtc_event_log/encoder/rtc_event_log_encoder_unittest.cc",
         "rtc_event_log/output/rtc_event_log_output_file_unittest.cc",
diff --git a/logging/rtc_event_log/encoder/blob_encoding.cc b/logging/rtc_event_log/encoder/blob_encoding.cc
new file mode 100644
index 0000000..62d268b
--- /dev/null
+++ b/logging/rtc_event_log/encoder/blob_encoding.cc
@@ -0,0 +1,162 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "logging/rtc_event_log/encoder/blob_encoding.h"
+
+#include <algorithm>
+
+#include "rtc_base/logging.h"
+
+namespace webrtc {
+
+const size_t kMaxVarIntLengthBytes = 10;  // ceil(64 / 7.0); 7 bits per byte.
+
+namespace {
+
+// Serializes |input| as a varint. Starting from the least significant bits,
+// |input| is cut into groups of seven bits, each stored in the low bits of
+// one output byte. The highest bit of every byte is a continuation flag: it
+// is set if and only if non-zero bits of |input| are still to be written, in
+// which case another byte follows.
+// Notes: A zero |input| is encoded as a single byte. The maximal uint64_t
+// value is encoded as exactly kMaxVarIntLengthBytes bytes.
+std::string EncodeVarInt(uint64_t input) {
+  std::string encoded;
+  encoded.reserve(kMaxVarIntLengthBytes);
+
+  bool more = true;
+  while (more) {
+    const uint8_t group = static_cast<uint8_t>(input & 0x7f);
+    input >>= 7;
+    more = (input != 0);
+    // Flag every byte except for the last one.
+    encoded += static_cast<uint8_t>(more ? (group | 0x80) : group);
+  }
+
+  RTC_DCHECK_GE(encoded.size(), 1u);
+  RTC_DCHECK_LE(encoded.size(), kMaxVarIntLengthBytes);
+
+  return encoded;
+}
+
+// Inverse of EncodeVarInt().
+// On success, writes the decoded value into |output| and returns the
+// (non-zero) number of bytes consumed from the beginning of |input|.
+// On failure, returns 0 and leaves |output| unmodified.
+size_t DecodeVarInt(absl::string_view input, uint64_t* output) {
+  RTC_DCHECK(output);
+
+  const size_t limit = std::min(input.length(), kMaxVarIntLengthBytes);
+  uint64_t value = 0;
+  for (size_t pos = 0; pos < limit; ++pos) {
+    const uint64_t group = static_cast<uint64_t>(input[pos] & 0x7f);
+    value += group << (7 * pos);
+    if ((input[pos] & 0x80) == 0) {  // No continuation flag: last byte.
+      *output = value;
+      return pos + 1;
+    }
+  }
+
+  return 0;
+}
+
+}  // namespace
+
+std::string EncodeBlobs(const std::vector<std::string>& blobs) {
+  RTC_DCHECK(!blobs.empty());
+
+  // Upper bound on the output size: a maximal varint per blob, plus payloads.
+  size_t max_encoded_size = kMaxVarIntLengthBytes * blobs.size();
+  for (const std::string& blob : blobs) {
+    // Input long enough to make the bound wrap around is a caller error.
+    RTC_DCHECK_GE(max_encoded_size + blob.size(), max_encoded_size);
+    max_encoded_size += blob.size();
+  }
+
+  std::string encoded;
+  encoded.reserve(max_encoded_size);
+
+  // The varint-encoded lengths of all blobs come first...
+  for (const std::string& blob : blobs) {
+    encoded += EncodeVarInt(blob.size());
+  }
+  // ...followed by the contents of the blobs, in the same order.
+  for (const std::string& blob : blobs) {
+    encoded += blob;
+  }
+
+  RTC_DCHECK_LE(encoded.size(), max_encoded_size);
+  return encoded;
+}
+
+std::vector<absl::string_view> DecodeBlobs(absl::string_view encoded_blobs,
+                                           size_t num_of_blobs) {
+  if (encoded_blobs.empty()) {
+    RTC_LOG(LS_WARNING) << "Corrupt input; empty input.";
+    return std::vector<absl::string_view>();
+  }
+
+  if (num_of_blobs == 0u) {
+    RTC_LOG(LS_WARNING)
+        << "Corrupt input; number of blobs must be greater than 0.";
+    return std::vector<absl::string_view>();
+  }
+
+  // Every blob takes up at least one byte (its varint-encoded length), so
+  // reject impossible counts before they size the allocations below.
+  if (num_of_blobs > encoded_blobs.length()) {
+    RTC_LOG(LS_WARNING) << "Corrupt input; excessive number of blobs.";
+    return std::vector<absl::string_view>();
+  }
+
+  // Read the lengths of all blobs.
+  size_t read_idx = 0;
+  std::vector<uint64_t> lengths(num_of_blobs);
+  for (size_t i = 0; i < num_of_blobs; ++i) {
+    if (read_idx >= encoded_blobs.length()) {
+      RTC_DCHECK_EQ(read_idx, encoded_blobs.length());
+      RTC_LOG(LS_WARNING) << "Corrupt input; excessive number of blobs.";
+      return std::vector<absl::string_view>();
+    }
+    const size_t read_bytes =
+        DecodeVarInt(encoded_blobs.substr(read_idx), &lengths[i]);
+    if (read_bytes == 0) {
+      RTC_LOG(LS_WARNING) << "Corrupt input; varint decoding failed.";
+      return std::vector<absl::string_view>();
+    }
+    read_idx += read_bytes;
+    // Equality is possible on the last iteration, if all blobs are empty.
+    RTC_DCHECK_LE(read_idx, encoded_blobs.length());
+  }
+
+  // Read the blobs themselves.
+  std::vector<absl::string_view> blobs(num_of_blobs);
+  for (size_t i = 0; i < num_of_blobs; ++i) {
+    if (read_idx + lengths[i] < read_idx) {  // Wrap-around detection.
+      RTC_LOG(LS_WARNING) << "Corrupt input; unreasonably large blob sequence.";
+      return std::vector<absl::string_view>();
+    }
+    if (read_idx + lengths[i] > encoded_blobs.length()) {
+      RTC_LOG(LS_WARNING) << "Corrupt input; blob sizes exceed input size.";
+      return std::vector<absl::string_view>();
+    }
+    blobs[i] = encoded_blobs.substr(read_idx, lengths[i]);
+    read_idx += lengths[i];
+  }
+
+  if (read_idx != encoded_blobs.length()) {
+    RTC_LOG(LS_WARNING) << "Corrupt input; unrecognized trailer.";
+    return std::vector<absl::string_view>();
+  }
+
+  return blobs;
+}
+
+}  // namespace webrtc
diff --git a/logging/rtc_event_log/encoder/blob_encoding.h b/logging/rtc_event_log/encoder/blob_encoding.h
new file mode 100644
index 0000000..3087534
--- /dev/null
+++ b/logging/rtc_event_log/encoder/blob_encoding.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LOGGING_RTC_EVENT_LOG_ENCODER_BLOB_ENCODING_H_
+#define LOGGING_RTC_EVENT_LOG_ENCODER_BLOB_ENCODING_H_
+
+#include <string>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+
+namespace webrtc {
+
+extern const size_t kMaxVarIntLengthBytes;  // Max bytes per encoded varint.
+
+// Encode/decode a sequence of strings (blobs) whose lengths cannot necessarily
+// be discerned from the blobs themselves (i.e. without being transmitted
+// out-of-band), in a way that lets the decoding side separate them again.
+// The number of blobs is assumed to be transmitted out-of-band (OOB). For
+// example, if multiple sequences of different blobs are sent, but all of them
+// contain the same number of blobs, it is beneficial not to encode that number.
+//
+// EncodeBlobs() must be given a non-empty vector. The blobs themselves may
+// be equal to "", though.
+// EncodeBlobs() cannot fail.
+// EncodeBlobs() never returns the empty string.
+//
+// Calling DecodeBlobs() on an empty string, or with |num_of_blobs| set to 0,
+// is an error, and is reported like any other decoding failure.
+// DecodeBlobs() returns an empty vector if it fails, e.g. due to a mismatch
+// between |num_of_blobs| and |encoded_blobs|, which can happen if
+// |encoded_blobs| is corrupted.
+// When successful, DecodeBlobs() returns a vector of string_view objects,
+// which refer to the original input (|encoded_blobs|), and therefore may
+// not outlive it.
+//
+// Note that the std::string returned by EncodeBlobs() might have reserved
+// significantly more memory than it ends up using. A caller that intends to
+// store the result long-term should consider calling shrink_to_fit() on it.
+std::string EncodeBlobs(const std::vector<std::string>& blobs);
+std::vector<absl::string_view> DecodeBlobs(absl::string_view encoded_blobs,
+                                           size_t num_of_blobs);
+
+}  // namespace webrtc
+
+#endif  // LOGGING_RTC_EVENT_LOG_ENCODER_BLOB_ENCODING_H_
diff --git a/logging/rtc_event_log/encoder/blob_encoding_unittest.cc b/logging/rtc_event_log/encoder/blob_encoding_unittest.cc
new file mode 100644
index 0000000..3992445
--- /dev/null
+++ b/logging/rtc_event_log/encoder/blob_encoding_unittest.cc
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "logging/rtc_event_log/encoder/blob_encoding.h"
+
+#include <string>
+#include <vector>
+
+#include "rtc_base/checks.h"
+#include "test/gtest.h"
+
+using CharT = std::string::value_type;
+
+namespace webrtc {
+
+namespace {
+
+// Round-trips |blobs| through EncodeBlobs() and DecodeBlobs(), and asserts
+// that the decoded blobs equal the originals, element by element.
+void TestEncodingAndDecoding(const std::vector<std::string>& blobs) {
+  RTC_DCHECK(!blobs.empty());
+  const std::string serialized = EncodeBlobs(blobs);
+  ASSERT_FALSE(serialized.empty());
+
+  const size_t expected_count = blobs.size();
+  const auto parsed = DecodeBlobs(serialized, expected_count);
+  ASSERT_EQ(parsed.size(), expected_count);
+  for (size_t idx = 0; idx != expected_count; ++idx) {
+    ASSERT_EQ(parsed[idx], blobs[idx]);
+  }
+}
+
+// Expects DecodeBlobs() to report failure (an empty vector) for this input.
+void TestGracefulErrorHandling(absl::string_view encoded_blobs,
+                               size_t num_of_blobs) {
+  const auto result = DecodeBlobs(encoded_blobs, num_of_blobs);
+  EXPECT_TRUE(result.empty());
+}
+
+}  // namespace
+
+TEST(BlobEncoding, EmptyBlob) {
+  TestEncodingAndDecoding({""});  // A single, zero-length blob.
+}
+
+TEST(BlobEncoding, SingleCharacterBlob) {
+  TestEncodingAndDecoding({"a"});  // A single, one-character blob.
+}
+
+TEST(BlobEncoding, LongBlob) {
+  std::string long_blob;
+  for (size_t count = 1; count <= 100000; ++count) {
+    long_blob.append(std::to_string(count)).append(" Mississippi\n");
+  }
+  TestEncodingAndDecoding({long_blob});
+}
+
+TEST(BlobEncoding, BlobsOfVariousLengths) {
+  constexpr size_t kJump = 0xf032d;  // Arbitrary growth step.
+  constexpr size_t kMax = 0xffffff;  // Arbitrary bound on the loop counter.
+
+  std::string blob;
+  blob.reserve(kMax);
+
+  for (size_t grown_by = 0; grown_by < kMax; grown_by += kJump) {
+    blob.resize(blob.size() + kJump, 'x');
+    TestEncodingAndDecoding({blob});
+  }
+}
+
+TEST(BlobEncoding, MultipleBlobs) {
+  std::vector<std::string> blobs;
+  for (size_t count = 1; count <= 100000; ++count) {
+    blobs.emplace_back(std::to_string(count) + " Mississippi\n");
+  }
+  TestEncodingAndDecoding(blobs);
+}
+
+TEST(BlobEncoding, DecodeBlobsHandlesErrorsGracefullyEmptyInput) {
+  TestGracefulErrorHandling(/*encoded_blobs=*/"", /*num_of_blobs=*/1);
+}
+
+TEST(BlobEncoding, DecodeBlobsHandlesErrorsGracefullyZeroBlobs) {
+  const std::string encoded = EncodeBlobs({"a"});
+  ASSERT_FALSE(encoded.empty());
+  TestGracefulErrorHandling(encoded, /*num_of_blobs=*/0);
+}
+
+TEST(BlobEncoding, DecodeBlobsHandlesErrorsGracefullyBlobLengthTooSmall) {
+  std::string encoded = EncodeBlobs({"ab"});
+  ASSERT_FALSE(encoded.empty());
+  ASSERT_EQ(encoded[0], 0x02);  // The varint-encoded length of "ab".
+  encoded[0] = 0x01;  // Understate it; the second character is now a trailer.
+  TestGracefulErrorHandling(encoded, 1);
+}
+
+TEST(BlobEncoding, DecodeBlobsHandlesErrorsGracefullyBlobLengthTooLarge) {
+  std::string encoded = EncodeBlobs({"a"});
+  ASSERT_FALSE(encoded.empty());
+  ASSERT_EQ(encoded[0], 0x01);  // The varint-encoded length of "a".
+  encoded[0] = 0x02;  // Overstate it; the input is now one byte short.
+  TestGracefulErrorHandling(encoded, 1);
+}
+
+TEST(BlobEncoding,
+     DecodeBlobsHandlesErrorsGracefullyNumberOfBlobsIncorrectlyHigh) {
+  const std::vector<std::string> blobs = {"a", "b"};
+  const std::string encoded = EncodeBlobs(blobs);
+  // Test focus - two single-character blobs are encoded, but DecodeBlobs() is
+  // told to expect far more blobs than the encoding could possibly hold.
+  TestGracefulErrorHandling(encoded, 1000);
+
+  // Test sanity - show that DecodeBlobs() would have worked if it got the
+  // correct input.
+  TestEncodingAndDecoding(blobs);
+}
+
+TEST(BlobEncoding, DecodeBlobsHandlesErrorsGracefullyDefectiveVarInt) {
+  // Set the continuation flag on kMaxVarIntLengthBytes consecutive bytes, so
+  // that the varint is still unterminated once the length limit is reached.
+  ASSERT_LE(kMaxVarIntLengthBytes, 0xffu);
+  std::string defective_varint;
+  for (size_t i = 0; i < kMaxVarIntLengthBytes; ++i) {
+    defective_varint += static_cast<CharT>(static_cast<size_t>(0x80u) | i);
+  }
+  defective_varint += 0x01u;
+  const std::string defective_encoded = defective_varint + "whatever";
+  TestGracefulErrorHandling(defective_encoded, 1);
+}
+
+TEST(BlobEncoding, DecodeBlobsHandlesErrorsGracefullyLengthSumWrapAround) {
+  // Each varint decodes to a length that overflows when added to the offset.
+  std::string max_size_varint(kMaxVarIntLengthBytes - 1,
+                              static_cast<CharT>(0xffu));
+  max_size_varint += 0x7fu;
+
+  std::string defective_encoded = max_size_varint;
+  defective_encoded += max_size_varint;
+  defective_encoded += "whatever";
+
+  TestGracefulErrorHandling(defective_encoded, 2);
+}
+
+}  // namespace webrtc
diff --git a/logging/rtc_event_log/encoder/delta_encoding.h b/logging/rtc_event_log/encoder/delta_encoding.h
index 4f3b9a1..7ce5e4f 100644
--- a/logging/rtc_event_log/encoder/delta_encoding.h
+++ b/logging/rtc_event_log/encoder/delta_encoding.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2018 The WebRTC project authors. All Rights Reserved.
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source