PipeWire capturer: import DMABufs directly into desktop frame

Originally DMABufs were imported into a temporary buffer followed by a
copy operation into the desktop frame itself. This is not needed as we
can import them directly into desktop frames and avoid this overhead.

Also drop support for MemPtr buffers as both Mutter and KWin don't seem
to support them and they are going to be too slow anyway.

Testing with latest Chromium, I could see two processes with usage around 20% and 40% without this change going down to 10% and 20% with
this change applied.

Bug: webrtc:13429
Bug: chrome:1378258
Change-Id: Ice3292528ff56300931c8638f8e03d4883d5e331
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/297501
Reviewed-by: Alexander Cooper <alcooper@chromium.org>
Commit-Queue: Jan Grulich <grulja@gmail.com>
Cr-Commit-Position: refs/heads/main@{#39594}
diff --git a/modules/desktop_capture/linux/wayland/egl_dmabuf.cc b/modules/desktop_capture/linux/wayland/egl_dmabuf.cc
index 5bbd5d7..b529077 100644
--- a/modules/desktop_capture/linux/wayland/egl_dmabuf.cc
+++ b/modules/desktop_capture/linux/wayland/egl_dmabuf.cc
@@ -101,11 +101,23 @@
 typedef void (*glGenTextures_func)(GLsizei n, GLuint* textures);
 typedef GLenum (*glGetError_func)(void);
 typedef const GLubyte* (*glGetString_func)(GLenum name);
-typedef void (*glGetTexImage_func)(GLenum target,
-                                   GLint level,
-                                   GLenum format,
-                                   GLenum type,
-                                   void* pixels);
+typedef void (*glReadPixels_func)(GLint x,
+                                  GLint y,
+                                  GLsizei width,
+                                  GLsizei height,
+                                  GLenum format,
+                                  GLenum type,
+                                  void* data);
+typedef void (*glGenFramebuffers_func)(GLsizei n, GLuint* ids);
+typedef void (*glDeleteFramebuffers_func)(GLsizei n,
+                                          const GLuint* framebuffers);
+typedef void (*glBindFramebuffer_func)(GLenum target, GLuint framebuffer);
+typedef void (*glFramebufferTexture2D_func)(GLenum target,
+                                            GLenum attachment,
+                                            GLenum textarget,
+                                            GLuint texture,
+                                            GLint level);
+typedef GLenum (*glCheckFramebufferStatus_func)(GLenum target);
 typedef void (*glTexParameteri_func)(GLenum target, GLenum pname, GLint param);
 typedef void* (*glXGetProcAddressARB_func)(const char*);
 
@@ -118,7 +130,12 @@
 glGenTextures_func GlGenTextures = nullptr;
 glGetError_func GlGetError = nullptr;
 glGetString_func GlGetString = nullptr;
-glGetTexImage_func GlGetTexImage = nullptr;
+glReadPixels_func GlReadPixels = nullptr;
+glGenFramebuffers_func GlGenFramebuffers = nullptr;
+glDeleteFramebuffers_func GlDeleteFramebuffers = nullptr;
+glBindFramebuffer_func GlBindFramebuffer = nullptr;
+glFramebufferTexture2D_func GlFramebufferTexture2D = nullptr;
+glCheckFramebufferStatus_func GlCheckFramebufferStatus = nullptr;
 glTexParameteri_func GlTexParameteri = nullptr;
 glXGetProcAddressARB_func GlXGetProcAddressARB = nullptr;
 
@@ -279,12 +296,26 @@
         (glDeleteTextures_func)GlXGetProcAddressARB("glDeleteTextures");
     GlGenTextures = (glGenTextures_func)GlXGetProcAddressARB("glGenTextures");
     GlGetError = (glGetError_func)GlXGetProcAddressARB("glGetError");
-    GlGetTexImage = (glGetTexImage_func)GlXGetProcAddressARB("glGetTexImage");
+    GlReadPixels = (glReadPixels_func)GlXGetProcAddressARB("glReadPixels");
+    GlGenFramebuffers =
+        (glGenFramebuffers_func)GlXGetProcAddressARB("glGenFramebuffers");
+    GlDeleteFramebuffers =
+        (glDeleteFramebuffers_func)GlXGetProcAddressARB("glDeleteFramebuffers");
+    GlBindFramebuffer =
+        (glBindFramebuffer_func)GlXGetProcAddressARB("glBindFramebuffer");
+    GlFramebufferTexture2D = (glFramebufferTexture2D_func)GlXGetProcAddressARB(
+        "glFramebufferTexture2D");
+    GlCheckFramebufferStatus =
+        (glCheckFramebufferStatus_func)GlXGetProcAddressARB(
+            "glCheckFramebufferStatus");
+
     GlTexParameteri =
         (glTexParameteri_func)GlXGetProcAddressARB("glTexParameteri");
 
     return GlBindTexture && GlDeleteTextures && GlGenTextures && GlGetError &&
-           GlGetTexImage && GlTexParameteri;
+           GlReadPixels && GlGenFramebuffers && GlDeleteFramebuffers &&
+           GlBindFramebuffer && GlFramebufferTexture2D &&
+           GlCheckFramebufferStatus && GlTexParameteri;
   }
 
   return false;
@@ -435,6 +466,14 @@
     EglTerminate(egl_.display);
   }
 
+  if (fbo_) {
+    GlDeleteFramebuffers(1, &fbo_);
+  }
+
+  if (texture_) {
+    GlDeleteTextures(1, &texture_);
+  }
+
   // BUG: crbug.com/1290566
   // Closing libEGL.so.1 when using NVidia drivers causes a crash
   // when EglGetPlatformDisplayEXT() is used, at least this one is enough
@@ -466,20 +505,20 @@
 }
 
 RTC_NO_SANITIZE("cfi-icall")
-std::unique_ptr<uint8_t[]> EglDmaBuf::ImageFromDmaBuf(
-    const DesktopSize& size,
-    uint32_t format,
-    const std::vector<PlaneData>& plane_datas,
-    uint64_t modifier) {
-  std::unique_ptr<uint8_t[]> src;
-
+bool EglDmaBuf::ImageFromDmaBuf(const DesktopSize& size,
+                                uint32_t format,
+                                const std::vector<PlaneData>& plane_datas,
+                                uint64_t modifier,
+                                const DesktopVector& offset,
+                                const DesktopSize& buffer_size,
+                                uint8_t* data) {
   if (!egl_initialized_) {
-    return src;
+    return false;
   }
 
   if (plane_datas.size() <= 0) {
     RTC_LOG(LS_ERROR) << "Failed to process buffer: invalid number of planes";
-    return src;
+    return false;
   }
 
   EGLint attribs[47];
@@ -568,20 +607,32 @@
   if (image == EGL_NO_IMAGE) {
     RTC_LOG(LS_ERROR) << "Failed to record frame: Error creating EGLImage - "
                       << FormatEGLError(EglGetError());
-    return src;
+    return false;
   }
 
   // create GL 2D texture for framebuffer
-  GLuint texture;
-  GlGenTextures(1, &texture);
-  GlTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-  GlTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-  GlTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
-  GlTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-  GlBindTexture(GL_TEXTURE_2D, texture);
+  if (!texture_) {
+    GlGenTextures(1, &texture_);
+    GlTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+    GlTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+    GlTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+    GlTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+  }
+  GlBindTexture(GL_TEXTURE_2D, texture_);
   GlEGLImageTargetTexture2DOES(GL_TEXTURE_2D, image);
 
-  src = std::make_unique<uint8_t[]>(plane_datas[0].stride * size.height());
+  if (!fbo_) {
+    GlGenFramebuffers(1, &fbo_);
+  }
+
+  GlBindFramebuffer(GL_FRAMEBUFFER, fbo_);
+  GlFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
+                         texture_, 0);
+  if (GlCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+    RTC_LOG(LS_ERROR) << "Failed to bind DMA buf framebuffer";
+    EglDestroyImageKHR(egl_.display, image);
+    return false;
+  }
 
   GLenum gl_format = GL_BGRA;
   switch (format) {
@@ -598,17 +649,18 @@
       gl_format = GL_BGRA;
       break;
   }
-  GlGetTexImage(GL_TEXTURE_2D, 0, gl_format, GL_UNSIGNED_BYTE, src.get());
 
-  if (GlGetError()) {
+  GlReadPixels(offset.x(), offset.y(), buffer_size.width(),
+               buffer_size.height(), gl_format, GL_UNSIGNED_BYTE, data);
+
+  const GLenum error = GlGetError();
+  if (error) {
     RTC_LOG(LS_ERROR) << "Failed to get image from DMA buffer.";
-    return src;
   }
 
-  GlDeleteTextures(1, &texture);
   EglDestroyImageKHR(egl_.display, image);
 
-  return src;
+  return !error;
 }
 
 RTC_NO_SANITIZE("cfi-icall")
diff --git a/modules/desktop_capture/linux/wayland/egl_dmabuf.h b/modules/desktop_capture/linux/wayland/egl_dmabuf.h
index f1d96b2..22a8f5a 100644
--- a/modules/desktop_capture/linux/wayland/egl_dmabuf.h
+++ b/modules/desktop_capture/linux/wayland/egl_dmabuf.h
@@ -41,11 +41,15 @@
   EglDmaBuf();
   ~EglDmaBuf();
 
-  std::unique_ptr<uint8_t[]> ImageFromDmaBuf(
-      const DesktopSize& size,
-      uint32_t format,
-      const std::vector<PlaneData>& plane_datas,
-      uint64_t modifiers);
+  // Returns whether the image was successfully imported from
+  // given DmaBuf and its parameters
+  bool ImageFromDmaBuf(const DesktopSize& size,
+                       uint32_t format,
+                       const std::vector<PlaneData>& plane_datas,
+                       uint64_t modifiers,
+                       const DesktopVector& offset,
+                       const DesktopSize& buffer_size,
+                       uint8_t* data);
   std::vector<uint64_t> QueryDmaBufModifiers(uint32_t format);
 
   bool IsEglInitialized() const { return egl_initialized_; }
@@ -58,6 +62,8 @@
   int32_t drm_fd_ = -1;               // for GBM buffer mmap
   gbm_device* gbm_device_ = nullptr;  // for passed GBM buffer retrieval
 
+  GLuint fbo_ = 0;
+  GLuint texture_ = 0;
   EGLStruct egl_;
 
   absl::optional<std::string> GetRenderNode();
diff --git a/modules/desktop_capture/linux/wayland/shared_screencast_stream.cc b/modules/desktop_capture/linux/wayland/shared_screencast_stream.cc
index d31f867..37de3a5 100644
--- a/modules/desktop_capture/linux/wayland/shared_screencast_stream.cc
+++ b/modules/desktop_capture/linux/wayland/shared_screencast_stream.cc
@@ -157,6 +157,12 @@
   struct spa_video_info_raw spa_video_format_;
 
   void ProcessBuffer(pw_buffer* buffer);
+  bool ProcessMemFDBuffer(pw_buffer* buffer,
+                          DesktopFrame& frame,
+                          const DesktopVector& offset);
+  bool ProcessDMABuffer(pw_buffer* buffer,
+                        DesktopFrame& frame,
+                        const DesktopVector& offset);
   void ConvertRGBxToBGRx(uint8_t* frame, uint32_t size);
   void UpdateFrameUpdatedRegions(const spa_buffer* spa_buffer,
                                  DesktopFrame& frame);
@@ -291,9 +297,8 @@
   std::vector<const spa_pod*> params;
   const int buffer_types =
       has_modifier || (that->pw_server_version_ >= kDmaBufMinVersion)
-          ? (1 << SPA_DATA_DmaBuf) | (1 << SPA_DATA_MemFd) |
-                (1 << SPA_DATA_MemPtr)
-          : (1 << SPA_DATA_MemFd) | (1 << SPA_DATA_MemPtr);
+          ? (1 << SPA_DATA_DmaBuf) | (1 << SPA_DATA_MemFd)
+          : (1 << SPA_DATA_MemFd);
 
   params.push_back(reinterpret_cast<spa_pod*>(spa_pod_builder_add_object(
       &builder, SPA_TYPE_OBJECT_ParamBuffers, SPA_PARAM_Buffers,
@@ -697,9 +702,6 @@
   }
 
   spa_buffer* spa_buffer = buffer->buffer;
-  ScopedBuf map;
-  std::unique_ptr<uint8_t[]> src_unique_ptr;
-  uint8_t* src = nullptr;
 
   // Try to update the mouse cursor first, because it can be the only
   // information carried by the buffer
@@ -741,79 +743,6 @@
     return;
   }
 
-  if (spa_buffer->datas[0].type == SPA_DATA_MemFd) {
-    map.initialize(
-        static_cast<uint8_t*>(
-            mmap(nullptr,
-                 spa_buffer->datas[0].maxsize + spa_buffer->datas[0].mapoffset,
-                 PROT_READ, MAP_PRIVATE, spa_buffer->datas[0].fd, 0)),
-        spa_buffer->datas[0].maxsize + spa_buffer->datas[0].mapoffset,
-        spa_buffer->datas[0].fd);
-
-    if (!map) {
-      RTC_LOG(LS_ERROR) << "Failed to mmap the memory: "
-                        << std::strerror(errno);
-      return;
-    }
-
-    src = SPA_MEMBER(map.get(), spa_buffer->datas[0].mapoffset, uint8_t);
-  } else if (spa_buffer->datas[0].type == SPA_DATA_DmaBuf) {
-    const uint n_planes = spa_buffer->n_datas;
-
-    if (!n_planes) {
-      return;
-    }
-
-    std::vector<EglDmaBuf::PlaneData> plane_datas;
-    for (uint32_t i = 0; i < n_planes; ++i) {
-      EglDmaBuf::PlaneData data = {
-          static_cast<int32_t>(spa_buffer->datas[i].fd),
-          static_cast<uint32_t>(spa_buffer->datas[i].chunk->stride),
-          static_cast<uint32_t>(spa_buffer->datas[i].chunk->offset)};
-      plane_datas.push_back(data);
-    }
-
-    // When importing DMA-BUFs, we use the stride (number of bytes from one row
-    // of pixels in the buffer) provided by PipeWire. The stride from PipeWire
-    // is given by the graphics driver and some drivers might add some
-    // additional padding for memory layout optimizations so not everytime the
-    // stride is equal to BYTES_PER_PIXEL x WIDTH. This is fine, because during
-    // the import we will use OpenGL and same graphics driver so it will be able
-    // to work with the stride it provided, but later on when we work with
-    // images we get from DMA-BUFs we will need to update the stride to be equal
-    // to BYTES_PER_PIXEL x WIDTH as that's the size of the DesktopFrame we
-    // allocate for each captured frame.
-    src_unique_ptr = egl_dmabuf_->ImageFromDmaBuf(
-        stream_size_, spa_video_format_.format, plane_datas, modifier_);
-    if (src_unique_ptr) {
-      src = src_unique_ptr.get();
-    } else {
-      RTC_LOG(LS_ERROR) << "Dropping DMA-BUF modifier: " << modifier_
-                        << " and trying to renegotiate stream parameters";
-
-      if (pw_server_version_ >= kDropSingleModifierMinVersion) {
-        modifiers_.erase(
-            std::remove(modifiers_.begin(), modifiers_.end(), modifier_),
-            modifiers_.end());
-      } else {
-        modifiers_.clear();
-      }
-
-      pw_loop_signal_event(pw_thread_loop_get_loop(pw_main_loop_),
-                           renegotiate_);
-      return;
-    }
-  } else if (spa_buffer->datas[0].type == SPA_DATA_MemPtr) {
-    src = static_cast<uint8_t*>(spa_buffer->datas[0].data);
-  }
-
-  if (!src) {
-    if (observer_) {
-      observer_->OnFailedToProcessBuffer();
-    }
-    return;
-  }
-
   // Use SPA_META_VideoCrop metadata to get the frame size. KDE and GNOME do
   // handle screen/window sharing differently. KDE/KWin doesn't use
   // SPA_META_VideoCrop metadata and when sharing a window, it always sets
@@ -871,8 +800,8 @@
   }
 
   // Get the position of the video crop within the stream. Just double-check
-  // that the position doesn't exceed the size of the stream itself. NOTE:
-  // Currently it looks there is no implementation using this.
+  // that the position doesn't exceed the size of the stream itself.
+  // NOTE: Currently it looks there is no implementation using this.
   uint32_t y_offset =
       videocrop_metadata_use &&
               (videocrop_metadata->region.position.y + frame_size_.height() <=
@@ -885,22 +814,7 @@
                stream_size_.width())
           ? videocrop_metadata->region.position.x
           : 0;
-
-  const uint32_t stream_stride = kBytesPerPixel * stream_size_.width();
-  uint32_t buffer_stride = spa_buffer->datas[0].chunk->stride;
-  uint32_t src_stride = buffer_stride;
-
-  if (spa_buffer->datas[0].type == SPA_DATA_DmaBuf &&
-      buffer_stride > stream_stride) {
-    // When DMA-BUFs are used, sometimes spa_buffer->stride we get might
-    // contain additional padding, but after we import the buffer, the stride
-    // we used is no longer relevant and we should just calculate it based on
-    // the stream width. For more context see https://crbug.com/1333304.
-    src_stride = stream_stride;
-  }
-
-  uint8_t* updated_src =
-      src + (src_stride * y_offset) + (kBytesPerPixel * x_offset);
+  DesktopVector offset = DesktopVector(x_offset, y_offset);
 
   webrtc::MutexLock lock(&queue_lock_);
 
@@ -920,9 +834,20 @@
     queue_.ReplaceCurrentFrame(SharedDesktopFrame::Wrap(std::move(frame)));
   }
 
-  queue_.current_frame()->CopyPixelsFrom(
-      updated_src, (src_stride - (kBytesPerPixel * x_offset)),
-      DesktopRect::MakeWH(frame_size_.width(), frame_size_.height()));
+  bool bufferProcessed = false;
+  if (spa_buffer->datas[0].type == SPA_DATA_MemFd) {
+    bufferProcessed =
+        ProcessMemFDBuffer(buffer, *queue_.current_frame(), offset);
+  } else if (spa_buffer->datas[0].type == SPA_DATA_DmaBuf) {
+    bufferProcessed = ProcessDMABuffer(buffer, *queue_.current_frame(), offset);
+  }
+
+  if (!bufferProcessed) {
+    if (observer_) {
+      observer_->OnFailedToProcessBuffer();
+    }
+    return;
+  }
 
   if (spa_video_format_.format == SPA_VIDEO_FORMAT_RGBx ||
       spa_video_format_.format == SPA_VIDEO_FORMAT_RGBA) {
@@ -951,6 +876,87 @@
   }
 }
 
+RTC_NO_SANITIZE("cfi-icall")
+bool SharedScreenCastStreamPrivate::ProcessMemFDBuffer(
+    pw_buffer* buffer,
+    DesktopFrame& frame,
+    const DesktopVector& offset) {
+  spa_buffer* spa_buffer = buffer->buffer;
+  ScopedBuf map;
+  uint8_t* src = nullptr;
+
+  map.initialize(
+      static_cast<uint8_t*>(
+          mmap(nullptr,
+               spa_buffer->datas[0].maxsize + spa_buffer->datas[0].mapoffset,
+               PROT_READ, MAP_PRIVATE, spa_buffer->datas[0].fd, 0)),
+      spa_buffer->datas[0].maxsize + spa_buffer->datas[0].mapoffset,
+      spa_buffer->datas[0].fd);
+
+  if (!map) {
+    RTC_LOG(LS_ERROR) << "Failed to mmap the memory: " << std::strerror(errno);
+    return false;
+  }
+
+  src = SPA_MEMBER(map.get(), spa_buffer->datas[0].mapoffset, uint8_t);
+
+  uint32_t buffer_stride = spa_buffer->datas[0].chunk->stride;
+  uint32_t src_stride = buffer_stride;
+
+  uint8_t* updated_src =
+      src + (src_stride * offset.y()) + (kBytesPerPixel * offset.x());
+
+  frame.CopyPixelsFrom(
+      updated_src, (src_stride - (kBytesPerPixel * offset.x())),
+      DesktopRect::MakeWH(frame.size().width(), frame.size().height()));
+
+  return true;
+}
+
+RTC_NO_SANITIZE("cfi-icall")
+bool SharedScreenCastStreamPrivate::ProcessDMABuffer(
+    pw_buffer* buffer,
+    DesktopFrame& frame,
+    const DesktopVector& offset) {
+  spa_buffer* spa_buffer = buffer->buffer;
+
+  const uint n_planes = spa_buffer->n_datas;
+
+  if (!n_planes) {
+    return false;
+  }
+
+  std::vector<EglDmaBuf::PlaneData> plane_datas;
+  for (uint32_t i = 0; i < n_planes; ++i) {
+    EglDmaBuf::PlaneData data = {
+        static_cast<int32_t>(spa_buffer->datas[i].fd),
+        static_cast<uint32_t>(spa_buffer->datas[i].chunk->stride),
+        static_cast<uint32_t>(spa_buffer->datas[i].chunk->offset)};
+    plane_datas.push_back(data);
+  }
+
+  const bool imported = egl_dmabuf_->ImageFromDmaBuf(
+      stream_size_, spa_video_format_.format, plane_datas, modifier_, offset,
+      frame.size(), frame.data());
+  if (!imported) {
+    RTC_LOG(LS_ERROR) << "Dropping DMA-BUF modifier: " << modifier_
+                      << " and trying to renegotiate stream parameters";
+
+    if (pw_server_version_ >= kDropSingleModifierMinVersion) {
+      modifiers_.erase(
+          std::remove(modifiers_.begin(), modifiers_.end(), modifier_),
+          modifiers_.end());
+    } else {
+      modifiers_.clear();
+    }
+
+    pw_loop_signal_event(pw_thread_loop_get_loop(pw_main_loop_), renegotiate_);
+    return false;
+  }
+
+  return true;
+}
+
 void SharedScreenCastStreamPrivate::ConvertRGBxToBGRx(uint8_t* frame,
                                                       uint32_t size) {
   for (uint32_t i = 0; i < size; i += 4) {