Fix getStats() freeze bug affecting Chromium but not WebRTC standalone.

PeerConnection::Close() is, per-spec, a blocking operation.
Unfortunately, PeerConnection is implemented to own resources used by
the network thread, and Close() - on the signaling thread - destroys
these resources. As such, tasks run in parallel like getStats() get into
race conditions with Close() unless synchronized. The mechanism in-place
is RTCStatsCollector::WaitForPendingRequest(), it waits until the
network thread is done with the in-parallel stats request.

Prior to this CL, this was implemented by performing
rtc::Thread::ProcessMessages() in a loop until the network thread had
posted a task on the signaling thread to say that it was done which
would then get processed by ProcessMessages(). In WebRTC this works, and
the test is RTCStatsIntegrationTest.GetsStatsWhileClosingPeerConnection.

But because Chromium's thread wrapper does no support
ProcessMessages(), calling getStats() followed by close() in Chrome
resulted in waiting forever (https://crbug.com/850907).

In this CL, the process messages loop is removed. Instead, the shared
resources are guarded by an rtc::Event. WaitForPendingRequest() still
blocks the signaling thread, but only while shared resources are in use
by the network thread. After this CL, calling WaitForPendingRequest() no
longer has any unexpected side-effects since it no longer processes
other messages that might have been posted on the thread.

The resource ownership and threading model of WebRTC deserves to be
revisited, but this fixes a common Chromium crash without redesigning
PeerConnection, in a way that does not cause more blocking than what
the other PeerConnection methods are already doing.

Note: An alternative to using rtc::Event is to use resource locks and
to not perform the stats collection on the network thread if the
request was cancelled before the start of processing, but this has very
little benefit in terms of performance: once the network thread starts
collecting the stats, it would use the lock until collection is
completed, blocking the signaling thread trying to acquire that lock
anyway. This defeats the purpose and is a riskier change, since
cancelling partial collection in this inherently racy edge-case would
have observable differences from the returned stats, which may cause
more regressions.

Bug: chromium:850907
Change-Id: Idceeee0bddc0c9d5518b58a2b263abb2bbf47cff
Reviewed-on: https://webrtc-review.googlesource.com/c/121567
Commit-Queue: Henrik Boström <hbos@webrtc.org>
Reviewed-by: Steve Anton <steveanton@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#26707}
diff --git a/pc/BUILD.gn b/pc/BUILD.gn
index fab9632..75ee47f 100644
--- a/pc/BUILD.gn
+++ b/pc/BUILD.gn
@@ -217,6 +217,7 @@
     "../rtc_base:checks",
     "../rtc_base:rtc_base",
     "../rtc_base:rtc_base_approved",
+    "../rtc_base:rtc_post_message_with_functor",
     "../rtc_base/system:rtc_export",
     "../rtc_base/third_party/base64",
     "../rtc_base/third_party/sigslot",
diff --git a/pc/rtc_stats_collector.cc b/pc/rtc_stats_collector.cc
index a25a785..3287b92 100644
--- a/pc/rtc_stats_collector.cc
+++ b/pc/rtc_stats_collector.cc
@@ -25,6 +25,7 @@
 #include "pc/peer_connection.h"
 #include "pc/rtc_stats_traversal.h"
 #include "rtc_base/checks.h"
+#include "rtc_base/post_message_with_functor.h"
 #include "rtc_base/strings/string_builder.h"
 #include "rtc_base/time_utils.h"
 #include "rtc_base/trace_event.h"
@@ -753,6 +754,8 @@
       network_thread_(pc->network_thread()),
       num_pending_partial_reports_(0),
       partial_report_timestamp_us_(0),
+      network_report_event_(true /* manual_reset */,
+                            true /* initially_signaled */),
       cache_timestamp_us_(0),
       cache_lifetime_us_(cache_lifetime_us) {
   RTC_DCHECK(pc_);
@@ -799,7 +802,7 @@
     // reentrancy problems.
     std::vector<RequestInfo> requests;
     requests.swap(requests_);
-    invoker_.AsyncInvoke<void>(
+    rtc::PostMessageWithFunctor(
         RTC_FROM_HERE, signaling_thread_,
         rtc::Bind(&RTCStatsCollector::DeliverCachedReport, this, cached_report_,
                   std::move(requests)));
@@ -830,10 +833,14 @@
     // network thread, where it more naturally belongs.
     call_stats_ = pc_->GetCallStats();
 
-    invoker_.AsyncInvoke<void>(
+    // Don't touch |network_report_| on the signaling thread until
+    // ProducePartialResultsOnNetworkThread() has signaled the
+    // |network_report_event_|.
+    network_report_event_.Reset();
+    rtc::PostMessageWithFunctor(
         RTC_FROM_HERE, network_thread_,
         rtc::Bind(&RTCStatsCollector::ProducePartialResultsOnNetworkThread,
-                  rtc::scoped_refptr<RTCStatsCollector>(this), timestamp_us));
+                  this, timestamp_us));
     ProducePartialResultsOnSignalingThread(timestamp_us);
   }
 }
@@ -845,89 +852,117 @@
 
 void RTCStatsCollector::WaitForPendingRequest() {
   RTC_DCHECK(signaling_thread_->IsCurrent());
-  if (num_pending_partial_reports_) {
-    rtc::Thread::Current()->ProcessMessages(0);
-    while (num_pending_partial_reports_) {
-      rtc::Thread::Current()->SleepMs(1);
-      rtc::Thread::Current()->ProcessMessages(0);
-    }
-  }
+  // If a request is pending, blocks until the |network_report_event_| is
+  // signaled and then delivers the result. Otherwise this is a NO-OP.
+  MergeNetworkReport_s();
 }
 
 void RTCStatsCollector::ProducePartialResultsOnSignalingThread(
     int64_t timestamp_us) {
   RTC_DCHECK(signaling_thread_->IsCurrent());
-  rtc::scoped_refptr<RTCStatsReport> report = RTCStatsReport::Create(
-      timestamp_us);
+  partial_report_ = RTCStatsReport::Create(timestamp_us);
 
-  ProduceDataChannelStats_s(timestamp_us, report.get());
-  ProduceMediaStreamStats_s(timestamp_us, report.get());
-  ProduceMediaStreamTrackStats_s(timestamp_us, report.get());
-  ProducePeerConnectionStats_s(timestamp_us, report.get());
+  ProducePartialResultsOnSignalingThreadImpl(timestamp_us,
+                                             partial_report_.get());
 
-  AddPartialResults(report);
+  // ProducePartialResultsOnSignalingThread() is running synchronously on the
+  // signaling thread, so it is always the first partial result delivered on the
+  // signaling thread. The request is not complete until MergeNetworkReport_s()
+  // happens; we don't have to do anything here.
+  RTC_DCHECK_GT(num_pending_partial_reports_, 1);
+  --num_pending_partial_reports_;
+}
+
+void RTCStatsCollector::ProducePartialResultsOnSignalingThreadImpl(
+    int64_t timestamp_us,
+    RTCStatsReport* partial_report) {
+  RTC_DCHECK(signaling_thread_->IsCurrent());
+  ProduceDataChannelStats_s(timestamp_us, partial_report);
+  ProduceMediaStreamStats_s(timestamp_us, partial_report);
+  ProduceMediaStreamTrackStats_s(timestamp_us, partial_report);
+  ProducePeerConnectionStats_s(timestamp_us, partial_report);
 }
 
 void RTCStatsCollector::ProducePartialResultsOnNetworkThread(
     int64_t timestamp_us) {
   RTC_DCHECK(network_thread_->IsCurrent());
-  rtc::scoped_refptr<RTCStatsReport> report = RTCStatsReport::Create(
-      timestamp_us);
+  // Touching |network_report_| on this thread is safe by this method because
+  // |network_report_event_| is reset before this method is invoked.
+  network_report_ = RTCStatsReport::Create(timestamp_us);
 
   std::map<std::string, cricket::TransportStats> transport_stats_by_name =
       pc_->GetTransportStatsByNames(transport_names_);
-
   std::map<std::string, CertificateStatsPair> transport_cert_stats =
       PrepareTransportCertificateStats_n(transport_stats_by_name);
 
-  ProduceCertificateStats_n(timestamp_us, transport_cert_stats, report.get());
-  ProduceCodecStats_n(timestamp_us, transceiver_stats_infos_, report.get());
-  ProduceIceCandidateAndPairStats_n(timestamp_us, transport_stats_by_name,
-                                    call_stats_, report.get());
-  ProduceRTPStreamStats_n(timestamp_us, transceiver_stats_infos_, report.get());
-  ProduceTransportStats_n(timestamp_us, transport_stats_by_name,
-                          transport_cert_stats, report.get());
+  ProducePartialResultsOnNetworkThreadImpl(
+      timestamp_us, transport_stats_by_name, transport_cert_stats,
+      network_report_.get());
 
-  AddPartialResults(report);
+  // Signal that it is now safe to touch |network_report_| on the signaling
+  // thread, and post a task to merge it into the final results.
+  network_report_event_.Set();
+  rtc::PostMessageWithFunctor(
+      RTC_FROM_HERE, signaling_thread_,
+      rtc::Bind(&RTCStatsCollector::MergeNetworkReport_s, this));
 }
 
-void RTCStatsCollector::AddPartialResults(
-    const rtc::scoped_refptr<RTCStatsReport>& partial_report) {
-  if (!signaling_thread_->IsCurrent()) {
-    invoker_.AsyncInvoke<void>(RTC_FROM_HERE, signaling_thread_,
-        rtc::Bind(&RTCStatsCollector::AddPartialResults_s,
-                  rtc::scoped_refptr<RTCStatsCollector>(this),
-                  partial_report));
+void RTCStatsCollector::ProducePartialResultsOnNetworkThreadImpl(
+    int64_t timestamp_us,
+    const std::map<std::string, cricket::TransportStats>&
+        transport_stats_by_name,
+    const std::map<std::string, CertificateStatsPair>& transport_cert_stats,
+    RTCStatsReport* partial_report) {
+  RTC_DCHECK(network_thread_->IsCurrent());
+  ProduceCertificateStats_n(timestamp_us, transport_cert_stats, partial_report);
+  ProduceCodecStats_n(timestamp_us, transceiver_stats_infos_, partial_report);
+  ProduceIceCandidateAndPairStats_n(timestamp_us, transport_stats_by_name,
+                                    call_stats_, partial_report);
+  ProduceRTPStreamStats_n(timestamp_us, transceiver_stats_infos_,
+                          partial_report);
+  ProduceTransportStats_n(timestamp_us, transport_stats_by_name,
+                          transport_cert_stats, partial_report);
+}
+
+void RTCStatsCollector::MergeNetworkReport_s() {
+  RTC_DCHECK(signaling_thread_->IsCurrent());
+  // The |network_report_event_| must be signaled for it to be safe to touch
+  // |network_report_|. This is normally not blocking, but if
+  // WaitForPendingRequest() is called while a request is pending, we might have
+  // to wait until the network thread is done touching |network_report_|.
+  network_report_event_.Wait(rtc::Event::kForever);
+  if (!network_report_) {
+    // Normally, MergeNetworkReport_s() is executed because it is posted from
+    // the network thread. But if WaitForPendingRequest() is called while a
+    // request is pending, an early call to MergeNetworkReport_s() is made,
+    // merging the report and setting |network_report_| to null. If so, when the
+    // previously posted MergeNetworkReport_s() is later executed, the report is
+    // already null and nothing needs to be done here.
     return;
   }
-  AddPartialResults_s(partial_report);
-}
-
-void RTCStatsCollector::AddPartialResults_s(
-    rtc::scoped_refptr<RTCStatsReport> partial_report) {
-  RTC_DCHECK(signaling_thread_->IsCurrent());
   RTC_DCHECK_GT(num_pending_partial_reports_, 0);
-  if (!partial_report_)
-    partial_report_ = partial_report;
-  else
-    partial_report_->TakeMembersFrom(partial_report);
+  RTC_DCHECK(partial_report_);
+  partial_report_->TakeMembersFrom(network_report_);
+  network_report_ = nullptr;
   --num_pending_partial_reports_;
-  if (!num_pending_partial_reports_) {
-    cache_timestamp_us_ = partial_report_timestamp_us_;
-    cached_report_ = partial_report_;
-    partial_report_ = nullptr;
-    transceiver_stats_infos_.clear();
-    // Trace WebRTC Stats when getStats is called on Javascript.
-    // This allows access to WebRTC stats from trace logs. To enable them,
-    // select the "webrtc_stats" category when recording traces.
-    TRACE_EVENT_INSTANT1("webrtc_stats", "webrtc_stats", "report",
-                         cached_report_->ToJson());
+  // |network_report_| is currently the only partial report collected
+  // asynchronously, so |num_pending_partial_reports_| must now be 0 and we are
+  // ready to deliver the result.
+  RTC_DCHECK_EQ(num_pending_partial_reports_, 0);
+  cache_timestamp_us_ = partial_report_timestamp_us_;
+  cached_report_ = partial_report_;
+  partial_report_ = nullptr;
+  transceiver_stats_infos_.clear();
+  // Trace WebRTC Stats when getStats is called on Javascript.
+  // This allows access to WebRTC stats from trace logs. To enable them,
+  // select the "webrtc_stats" category when recording traces.
+  TRACE_EVENT_INSTANT1("webrtc_stats", "webrtc_stats", "report",
+                       cached_report_->ToJson());
 
-    // Deliver report and clear |requests_|.
-    std::vector<RequestInfo> requests;
-    requests.swap(requests_);
-    DeliverCachedReport(cached_report_, std::move(requests));
-  }
+  // Deliver report and clear |requests_|.
+  std::vector<RequestInfo> requests;
+  requests.swap(requests_);
+  DeliverCachedReport(cached_report_, std::move(requests));
 }
 
 void RTCStatsCollector::DeliverCachedReport(
diff --git a/pc/rtc_stats_collector.h b/pc/rtc_stats_collector.h
index a3f1593..4837fc0 100644
--- a/pc/rtc_stats_collector.h
+++ b/pc/rtc_stats_collector.h
@@ -27,7 +27,7 @@
 #include "pc/data_channel.h"
 #include "pc/peer_connection_internal.h"
 #include "pc/track_media_info_map.h"
-#include "rtc_base/async_invoker.h"
+#include "rtc_base/event.h"
 #include "rtc_base/ref_count.h"
 #include "rtc_base/ssl_identity.h"
 #include "rtc_base/third_party/sigslot/sigslot.h"
@@ -77,14 +77,21 @@
   RTCStatsCollector(PeerConnectionInternal* pc, int64_t cache_lifetime_us);
   ~RTCStatsCollector();
 
-  // Stats gathering on a particular thread. Calls |AddPartialResults| before
-  // returning. Virtual for the sake of testing.
-  virtual void ProducePartialResultsOnSignalingThread(int64_t timestamp_us);
-  virtual void ProducePartialResultsOnNetworkThread(int64_t timestamp_us);
+  struct CertificateStatsPair {
+    std::unique_ptr<rtc::SSLCertificateStats> local;
+    std::unique_ptr<rtc::SSLCertificateStats> remote;
+  };
 
-  // Can be called on any thread.
-  void AddPartialResults(
-      const rtc::scoped_refptr<RTCStatsReport>& partial_report);
+  // Stats gathering on a particular thread. Virtual for the sake of testing.
+  virtual void ProducePartialResultsOnSignalingThreadImpl(
+      int64_t timestamp_us,
+      RTCStatsReport* partial_report);
+  virtual void ProducePartialResultsOnNetworkThreadImpl(
+      int64_t timestamp_us,
+      const std::map<std::string, cricket::TransportStats>&
+          transport_stats_by_name,
+      const std::map<std::string, CertificateStatsPair>& transport_cert_stats,
+      RTCStatsReport* partial_report);
 
  private:
   class RequestInfo {
@@ -130,11 +137,6 @@
 
   void GetStatsReportInternal(RequestInfo request);
 
-  struct CertificateStatsPair {
-    std::unique_ptr<rtc::SSLCertificateStats> local;
-    std::unique_ptr<rtc::SSLCertificateStats> remote;
-  };
-
   // Structure for tracking stats about each RtpTransceiver managed by the
   // PeerConnection. This can either by a Plan B style or Unified Plan style
   // transceiver (i.e., can have 0 or many senders and receivers).
@@ -150,7 +152,6 @@
     std::unique_ptr<TrackMediaInfoMap> track_media_info_map;
   };
 
-  void AddPartialResults_s(rtc::scoped_refptr<RTCStatsReport> partial_report);
   void DeliverCachedReport(
       rtc::scoped_refptr<const RTCStatsReport> cached_report,
       std::vector<RequestInfo> requests);
@@ -211,6 +212,13 @@
   std::vector<RtpTransceiverStatsInfo> PrepareTransceiverStatsInfos_s() const;
   std::set<std::string> PrepareTransportNames_s() const;
 
+  // Stats gathering on a particular thread.
+  void ProducePartialResultsOnSignalingThread(int64_t timestamp_us);
+  void ProducePartialResultsOnNetworkThread(int64_t timestamp_us);
+  // Merges |network_report_| into |partial_report_| and completes the request.
+  // This is a NO-OP if |network_report_| is null.
+  void MergeNetworkReport_s();
+
   // Slots for signals (sigslot) that are wired up to |pc_|.
   void OnDataChannelCreated(DataChannel* channel);
   // Slots for signals (sigslot) that are wired up to |channel|.
@@ -221,12 +229,24 @@
   rtc::Thread* const signaling_thread_;
   rtc::Thread* const worker_thread_;
   rtc::Thread* const network_thread_;
-  rtc::AsyncInvoker invoker_;
 
   int num_pending_partial_reports_;
   int64_t partial_report_timestamp_us_;
+  // Reports that are produced on the signaling thread or the network thread are
+  // merged into this report. It is only touched on the signaling thread. Once
+  // all partial reports are merged this is the result of a request.
   rtc::scoped_refptr<RTCStatsReport> partial_report_;
   std::vector<RequestInfo> requests_;
+  // Holds the result of ProducePartialResultsOnNetworkThread(). It is merged
+  // into |partial_report_| on the signaling thread and then nulled by
+  // MergeNetworkReport_s(). Thread-safety is ensured by using
+  // |network_report_event_|.
+  rtc::scoped_refptr<RTCStatsReport> network_report_;
+  // If set, it is safe to touch the |network_report_| on the signaling thread.
+  // This is reset before async-invoking ProducePartialResultsOnNetworkThread()
+  // and set when ProducePartialResultsOnNetworkThread() is complete, after it
+  // has updated the value of |network_report_|.
+  rtc::Event network_report_event_;
 
   // Set in |GetStatsReport|, read in |ProducePartialResultsOnNetworkThread| and
   // |ProducePartialResultsOnSignalingThread|, reset after work is complete. Not
diff --git a/pc/rtc_stats_collector_unittest.cc b/pc/rtc_stats_collector_unittest.cc
index f628f82..3be4ae7 100644
--- a/pc/rtc_stats_collector_unittest.cc
+++ b/pc/rtc_stats_collector_unittest.cc
@@ -2249,7 +2249,9 @@
         worker_thread_(pc->worker_thread()),
         network_thread_(pc->network_thread()) {}
 
-  void ProducePartialResultsOnSignalingThread(int64_t timestamp_us) override {
+  void ProducePartialResultsOnSignalingThreadImpl(
+      int64_t timestamp_us,
+      RTCStatsReport* partial_report) override {
     EXPECT_TRUE(signaling_thread_->IsCurrent());
     {
       rtc::CritScope cs(&lock_);
@@ -2257,13 +2259,15 @@
       ++produced_on_signaling_thread_;
     }
 
-    rtc::scoped_refptr<RTCStatsReport> signaling_report =
-        RTCStatsReport::Create(0);
-    signaling_report->AddStats(std::unique_ptr<const RTCStats>(
+    partial_report->AddStats(std::unique_ptr<const RTCStats>(
         new RTCTestStats("SignalingThreadStats", timestamp_us)));
-    AddPartialResults(signaling_report);
   }
-  void ProducePartialResultsOnNetworkThread(int64_t timestamp_us) override {
+  void ProducePartialResultsOnNetworkThreadImpl(
+      int64_t timestamp_us,
+      const std::map<std::string, cricket::TransportStats>&
+          transport_stats_by_name,
+      const std::map<std::string, CertificateStatsPair>& transport_cert_stats,
+      RTCStatsReport* partial_report) override {
     EXPECT_TRUE(network_thread_->IsCurrent());
     {
       rtc::CritScope cs(&lock_);
@@ -2271,11 +2275,8 @@
       ++produced_on_network_thread_;
     }
 
-    rtc::scoped_refptr<RTCStatsReport> network_report =
-        RTCStatsReport::Create(0);
-    network_report->AddStats(std::unique_ptr<const RTCStats>(
+    partial_report->AddStats(std::unique_ptr<const RTCStats>(
         new RTCTestStats("NetworkThreadStats", timestamp_us)));
-    AddPartialResults(network_report);
   }
 
  private: