Optimize RoboCaller::AddReceiver() for code size

Essentially, instead of having the inlined UntypedFunction::Create(f)
return an UntypedFunction which is then passed as an argument to
non-inlined RoboCallerReceivers::AddReceiverImpl(), we let
UntypedFunction::PrepareArgs(f) return a few different kinds of
trivial structs (depending on what sort of type f has) which are
passed as arguments to non-inlined RoboCallerReceivers::AddReceiver()
(which then converts them to UntypedFunction by calling
UntypedFunction::Create()). These structs are smaller than
UntypedFunction and optimized for argument passing, so many fewer
instructions are needed.

Example code:

  struct Foo {
    void Receive(int, float, int, float);
    void TestAddLambdaReceiver();
    webrtc::RoboCaller<int, float, int, float> rc;
  };

  void Foo::TestAddLambdaReceiver() {
    rc.AddReceiver([this](int a, float b, int c, float d){
        Receive(a, b, c, d);});
  }

On arm32, we get before this CL:

  Foo::TestAddLambdaReceiver():
        push    {r11, lr}
        mov     r11, sp
        sub     sp, sp, #24
        ldr     r1, .LCPI0_0
        mov     r2, #0
        stm     sp, {r0, r2}
        add     r1, pc, r1
        str     r2, [sp, #20]
        str     r1, [sp, #16]
        mov     r1, sp
        bl      RoboCallerReceivers::AddReceiverImpl
        mov     sp, r11
        pop     {r11, pc}
  .LCPI0_0:
        .long   CallInlineStorage<Foo::TestAddLambdaReceiver()::$_0>
  CallInlineStorage<Foo::TestAddLambdaReceiver()::$_0>:
        ldr     r0, [r0]
        b       Foo::Receive(int, float, int, float)

After this CL:

  Foo::TestAddLambdaReceiver():
        ldr     r3, .LCPI0_0
        mov     r2, r0
        add     r3, pc, r3
        b       RoboCallerReceivers::AddReceiver<1u>
  .LCPI0_0:
        .long   CallInlineStorage<Foo::TestAddLambdaReceiver()::$_0>
  CallInlineStorage<Foo::TestAddLambdaReceiver()::$_0>:
        ldr     r0, [r0]
        b       Foo::Receive(int, float, int, float)

(Symbol names abbreviated so that they'll fit on one line.)

So a reduction from 64 to 28 bytes. The improvements on arm64 and
x86_64 are similar.

Bug: webrtc:11943
Change-Id: I93fbba083be0235051c3279d3e3f6852a4a9fdad
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/185960
Commit-Queue: Karl Wiberg <kwiberg@webrtc.org>
Reviewed-by: Mirko Bonadei <mbonadei@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#32244}
diff --git a/rtc_base/BUILD.gn b/rtc_base/BUILD.gn
index a09c06e..489a5c6 100644
--- a/rtc_base/BUILD.gn
+++ b/rtc_base/BUILD.gn
@@ -57,6 +57,7 @@
     ":untyped_function",
     "../api:function_view",
     "system:assume",
+    "system:inline",
   ]
 }
 
diff --git a/rtc_base/robo_caller.cc b/rtc_base/robo_caller.cc
index a97687a..a7c35cb 100644
--- a/rtc_base/robo_caller.cc
+++ b/rtc_base/robo_caller.cc
@@ -16,10 +16,6 @@
 RoboCallerReceivers::RoboCallerReceivers() = default;
 RoboCallerReceivers::~RoboCallerReceivers() = default;
 
-void RoboCallerReceivers::AddReceiverImpl(UntypedFunction* f) {
-  receivers_.push_back(std::move(*f));
-}
-
 void RoboCallerReceivers::Foreach(
     rtc::FunctionView<void(UntypedFunction&)> fv) {
   for (auto& r : receivers_) {
@@ -27,5 +23,18 @@
   }
 }
 
+template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::TrivialUntypedFunctionArgs<1>);
+template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::TrivialUntypedFunctionArgs<2>);
+template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::TrivialUntypedFunctionArgs<3>);
+template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::TrivialUntypedFunctionArgs<4>);
+template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::NontrivialUntypedFunctionArgs);
+template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::FunctionPointerUntypedFunctionArgs);
+
 }  // namespace robo_caller_impl
 }  // namespace webrtc
diff --git a/rtc_base/robo_caller.h b/rtc_base/robo_caller.h
index 573e7b6..9df6a48 100644
--- a/rtc_base/robo_caller.h
+++ b/rtc_base/robo_caller.h
@@ -16,6 +16,7 @@
 
 #include "api/function_view.h"
 #include "rtc_base/system/assume.h"
+#include "rtc_base/system/inline.h"
 #include "rtc_base/untyped_function.h"
 
 namespace webrtc {
@@ -30,20 +31,30 @@
   RoboCallerReceivers& operator=(RoboCallerReceivers&&) = delete;
   ~RoboCallerReceivers();
 
-  void AddReceiver(UntypedFunction&& f) {
-    AddReceiverImpl(&f);
-    // Assume that f was moved from and is now trivially destructible.
-    // This helps the compiler optimize away the destructor call.
-    RTC_ASSUME(f.IsTriviallyDestructible());
+  template <typename UntypedFunctionArgsT>
+  RTC_NO_INLINE void AddReceiver(UntypedFunctionArgsT args) {
+    receivers_.push_back(UntypedFunction::Create(args));
   }
 
   void Foreach(rtc::FunctionView<void(UntypedFunction&)> fv);
 
  private:
-  void AddReceiverImpl(UntypedFunction* f);
   std::vector<UntypedFunction> receivers_;
 };
 
+extern template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::TrivialUntypedFunctionArgs<1>);
+extern template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::TrivialUntypedFunctionArgs<2>);
+extern template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::TrivialUntypedFunctionArgs<3>);
+extern template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::TrivialUntypedFunctionArgs<4>);
+extern template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::NontrivialUntypedFunctionArgs);
+extern template void RoboCallerReceivers::AddReceiver(
+    UntypedFunction::FunctionPointerUntypedFunctionArgs);
+
 }  // namespace robo_caller_impl
 
 // A collection of receivers (callable objects) that can be called all at once.
@@ -71,7 +82,7 @@
   template <typename F>
   void AddReceiver(F&& f) {
     receivers_.AddReceiver(
-        UntypedFunction::Create<void(ArgT...)>(std::forward<F>(f)));
+        UntypedFunction::PrepareArgs<void(ArgT...)>(std::forward<F>(f)));
   }
 
   // Calls all receivers with the given arguments.
diff --git a/rtc_base/untyped_function.h b/rtc_base/untyped_function.h
index 9220d3f..16c5ba0 100644
--- a/rtc_base/untyped_function.h
+++ b/rtc_base/untyped_function.h
@@ -11,6 +11,8 @@
 #ifndef RTC_BASE_UNTYPED_FUNCTION_H_
 #define RTC_BASE_UNTYPED_FUNCTION_H_
 
+#include <cstddef>
+#include <cstring>
 #include <memory>
 #include <type_traits>
 #include <utility>
@@ -25,9 +27,17 @@
 union VoidUnion {
   void* void_ptr;
   FunVoid* fun_ptr;
-  typename std::aligned_storage<16>::type inline_storage;
+  typename std::aligned_storage<4 * sizeof(uintptr_t)>::type inline_storage;
 };
 
+// Returns the number of elements of the `inline_storage` array required to
+// store an object of type T.
+template <typename T>
+constexpr size_t InlineStorageSize() {
+  // sizeof(T) / sizeof(uintptr_t), but rounded up.
+  return (sizeof(T) + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
+}
+
 template <typename T>
 struct CallHelpers;
 template <typename RetT, typename... ArgT>
@@ -64,76 +74,155 @@
 // size.
 class UntypedFunction final {
  public:
-  // Create function for lambdas and other callables; it accepts every type of
-  // argument except those noted in its enable_if call.
+  // The *UntypedFunctionArgs structs are used to transfer arguments from
+  // PrepareArgs() to Create(). They are trivial, but may own heap allocations,
+  // so make sure to pass them to Create() exactly once!
+  //
+  // The point of doing Create(PrepareArgs(foo)) instead of just Create(foo) is
+  // to separate the code that has to be inlined (PrepareArgs) from the code
+  // that can be noninlined (Create); the *UntypedFunctionArgs types are
+  // designed to efficiently carry the required information from one to the
+  // other.
+  template <size_t N>
+  struct TrivialUntypedFunctionArgs {
+    // We use an uintptr_t array here instead of std::aligned_storage, because
+    // the former can be efficiently passed in registers when using
+    // TrivialUntypedFunctionArgs as a function argument. (We can't do the same
+    // in VoidUnion, because std::aligned_storage but not uintptr_t can be
+    // legally reinterpret_casted to arbitrary types.
+    // TrivialUntypedFunctionArgs, on the other hand, only needs to handle
+    // placement new and memcpy.)
+    alignas(std::max_align_t) uintptr_t inline_storage[N];
+    webrtc_function_impl::FunVoid* call;
+  };
+  struct NontrivialUntypedFunctionArgs {
+    void* void_ptr;
+    webrtc_function_impl::FunVoid* call;
+    void (*del)(webrtc_function_impl::VoidUnion*);
+  };
+  struct FunctionPointerUntypedFunctionArgs {
+    webrtc_function_impl::FunVoid* fun_ptr;
+    webrtc_function_impl::FunVoid* call;
+  };
+
+  // Create function for lambdas and other callables that are trivial and small;
+  // it accepts every type of argument except those noted in its enable_if call.
   template <
       typename Signature,
       typename F,
+      typename F_deref = typename std::remove_reference<F>::type,
       typename std::enable_if<
           // Not for function pointers; we have another overload for that below.
-          !std::is_function<typename std::remove_pointer<
-              typename std::remove_reference<F>::type>::type>::value &&
+          !std::is_function<
+              typename std::remove_pointer<F_deref>::type>::value &&
 
-          // Not for nullptr; we have another overload for that below.
+          // Not for nullptr; we have a constructor for that below.
           !std::is_same<std::nullptr_t,
                         typename std::remove_cv<F>::type>::value &&
 
-          // Not for UntypedFunction objects; we have another overload for that.
+          // Not for UntypedFunction objects; use move construction or
+          // assignment.
           !std::is_same<UntypedFunction,
-                        typename std::remove_cv<typename std::remove_reference<
-                            F>::type>::type>::value>::type* = nullptr>
-  static UntypedFunction Create(F&& f) {
-    using F_deref = typename std::remove_reference<F>::type;
-    // TODO(C++17): Use `constexpr if` here. The compiler appears to do the
-    // right thing anyway w.r.t. resolving the branch statically and
-    // eliminating dead code, but it would be good for readability.
-    if (std::is_trivially_move_constructible<F_deref>::value &&
-        std::is_trivially_destructible<F_deref>::value &&
-        sizeof(F_deref) <=
-            sizeof(webrtc_function_impl::VoidUnion::inline_storage)) {
-      // The callable is trivial and small enough, so we just store its bytes
-      // in the inline storage.
-      webrtc_function_impl::VoidUnion vu;
-      new (&vu.inline_storage) F_deref(std::forward<F>(f));
-      return UntypedFunction(
-          vu,
-          reinterpret_cast<webrtc_function_impl::FunVoid*>(
-              webrtc_function_impl::CallHelpers<
-                  Signature>::template CallInlineStorage<F_deref>),
-          nullptr);
-    } else {
-      // The callable is either nontrivial or too large, so we can't keep it
-      // in the inline storage; use the heap instead.
-      webrtc_function_impl::VoidUnion vu;
-      vu.void_ptr = new F_deref(std::forward<F>(f));
-      return UntypedFunction(
-          vu,
-          reinterpret_cast<webrtc_function_impl::FunVoid*>(
-              webrtc_function_impl::CallHelpers<
-                  Signature>::template CallVoidPtr<F_deref>),
-          static_cast<void (*)(webrtc_function_impl::VoidUnion*)>(
-              [](webrtc_function_impl::VoidUnion* vu) {
-                // Assuming that this pointer isn't null allows the
-                // compiler to eliminate a null check in the (inlined)
-                // delete operation.
-                RTC_ASSUME(vu->void_ptr != nullptr);
-                delete reinterpret_cast<F_deref*>(vu->void_ptr);
-              }));
-    }
+                        typename std::remove_cv<F_deref>::type>::value &&
+
+          // Only for trivial callables that will fit in
+          // VoidUnion::inline_storage.
+          std::is_trivially_move_constructible<F_deref>::value &&
+          std::is_trivially_destructible<F_deref>::value &&
+          sizeof(F_deref) <=
+              sizeof(webrtc_function_impl::VoidUnion::inline_storage)>::type* =
+          nullptr,
+      size_t InlineSize = webrtc_function_impl::InlineStorageSize<F_deref>()>
+  static TrivialUntypedFunctionArgs<InlineSize> PrepareArgs(F&& f) {
+    // The callable is trivial and small enough, so we just store its bytes
+    // in the inline storage.
+    TrivialUntypedFunctionArgs<InlineSize> args;
+    new (&args.inline_storage) F_deref(std::forward<F>(f));
+    args.call = reinterpret_cast<webrtc_function_impl::FunVoid*>(
+        webrtc_function_impl::CallHelpers<
+            Signature>::template CallInlineStorage<F_deref>);
+    return args;
+  }
+  template <size_t InlineSize>
+  static UntypedFunction Create(TrivialUntypedFunctionArgs<InlineSize> args) {
+    webrtc_function_impl::VoidUnion vu;
+    std::memcpy(&vu.inline_storage, args.inline_storage,
+                sizeof(args.inline_storage));
+    return UntypedFunction(vu, args.call, nullptr);
+  }
+
+  // Create function for lambdas and other callables that are nontrivial or
+  // large; it accepts every type of argument except those noted in its
+  // enable_if call.
+  template <
+      typename Signature,
+      typename F,
+      typename F_deref = typename std::remove_reference<F>::type,
+      typename std::enable_if<
+          // Not for function pointers; we have another overload for that below.
+          !std::is_function<
+              typename std::remove_pointer<F_deref>::type>::value &&
+
+          // Not for nullptr; we have a constructor for that below.
+          !std::is_same<std::nullptr_t,
+                        typename std::remove_cv<F>::type>::value &&
+
+          // Not for UntypedFunction objects; use move construction or
+          // assignment.
+          !std::is_same<UntypedFunction,
+                        typename std::remove_cv<F_deref>::type>::value &&
+
+          // Only for nontrivial callables, or callables that won't fit in
+          // VoidUnion::inline_storage.
+          !(std::is_trivially_move_constructible<F_deref>::value &&
+            std::is_trivially_destructible<F_deref>::value &&
+            sizeof(F_deref) <= sizeof(webrtc_function_impl::VoidUnion::
+                                          inline_storage))>::type* = nullptr>
+  static NontrivialUntypedFunctionArgs PrepareArgs(F&& f) {
+    // The callable is either nontrivial or too large, so we can't keep it
+    // in the inline storage; use the heap instead.
+    NontrivialUntypedFunctionArgs args;
+    args.void_ptr = new F_deref(std::forward<F>(f));
+    args.call = reinterpret_cast<webrtc_function_impl::FunVoid*>(
+        webrtc_function_impl::CallHelpers<Signature>::template CallVoidPtr<
+            F_deref>);
+    args.del = static_cast<void (*)(webrtc_function_impl::VoidUnion*)>(
+        [](webrtc_function_impl::VoidUnion* vu) {
+          // Assuming that this pointer isn't null allows the
+          // compiler to eliminate a null check in the (inlined)
+          // delete operation.
+          RTC_ASSUME(vu->void_ptr != nullptr);
+          delete reinterpret_cast<F_deref*>(vu->void_ptr);
+        });
+    return args;
+  }
+  static UntypedFunction Create(NontrivialUntypedFunctionArgs args) {
+    webrtc_function_impl::VoidUnion vu;
+    vu.void_ptr = args.void_ptr;
+    return UntypedFunction(vu, args.call, args.del);
   }
 
   // Create function that accepts function pointers. If the argument is null,
   // the result is an empty UntypedFunction.
   template <typename Signature>
-  static UntypedFunction Create(Signature* f) {
+  static FunctionPointerUntypedFunctionArgs PrepareArgs(Signature* f) {
+    FunctionPointerUntypedFunctionArgs args;
+    args.fun_ptr = reinterpret_cast<webrtc_function_impl::FunVoid*>(f);
+    args.call = reinterpret_cast<webrtc_function_impl::FunVoid*>(
+        webrtc_function_impl::CallHelpers<Signature>::CallFunPtr);
+    return args;
+  }
+  static UntypedFunction Create(FunctionPointerUntypedFunctionArgs args) {
     webrtc_function_impl::VoidUnion vu;
-    vu.fun_ptr = reinterpret_cast<webrtc_function_impl::FunVoid*>(f);
-    return UntypedFunction(
-        vu,
-        f ? reinterpret_cast<webrtc_function_impl::FunVoid*>(
-                webrtc_function_impl::CallHelpers<Signature>::CallFunPtr)
-          : nullptr,
-        nullptr);
+    vu.fun_ptr = args.fun_ptr;
+    return UntypedFunction(vu, args.fun_ptr == nullptr ? nullptr : args.call,
+                           nullptr);
+  }
+
+  // Prepares arguments and creates an UntypedFunction in one go.
+  template <typename Signature, typename F>
+  static UntypedFunction Create(F&& f) {
+    return Create(PrepareArgs<Signature>(std::forward<F>(f)));
   }
 
   // Default constructor. Creates an empty UntypedFunction.