External denoiser based on noise estimation and moving object detection.

Improve the existing external denoiser in WebRTC: the filter strength
now adapts to the noise level of the whole frame and to the moving
object detection result. The adaptive filter effectively removes the
artifacts seen in the previous version, such as trailing and blockiness
on moving objects.
The external denoiser is off by default for now.

BUG=

Review URL: https://codereview.webrtc.org/1822333003

Cr-Original-Commit-Position: refs/heads/master@{#12198}
Cr-Mirrored-From: https://chromium.googlesource.com/external/webrtc
Cr-Mirrored-Commit: fa0befe13bb030adb61eb55c5557df1bf6e29205
diff --git a/modules/video_processing/video_denoiser.cc b/modules/video_processing/video_denoiser.cc
index 3951381..b00da5c 100644
--- a/modules/video_processing/video_denoiser.cc
+++ b/modules/video_processing/video_denoiser.cc
@@ -16,50 +16,144 @@
 VideoDenoiser::VideoDenoiser(bool runtime_cpu_detection)
     : width_(0),
       height_(0),
-      filter_(DenoiserFilter::Create(runtime_cpu_detection)) {}
+      filter_(DenoiserFilter::Create(runtime_cpu_detection, &cpu_type_)),
+      ne_(new NoiseEstimation()) {}
 
-void VideoDenoiser::TrailingReduction(int mb_rows,
-                                      int mb_cols,
-                                      const uint8_t* y_src,
-                                      int stride_y,
-                                      uint8_t* y_dst) {
-  for (int mb_row = 1; mb_row < mb_rows - 1; ++mb_row) {
-    for (int mb_col = 1; mb_col < mb_cols - 1; ++mb_col) {
-      int mb_index = mb_row * mb_cols + mb_col;
-      uint8_t* mb_dst = y_dst + (mb_row << 4) * stride_y + (mb_col << 4);
-      const uint8_t* mb_src = y_src + (mb_row << 4) * stride_y + (mb_col << 4);
-      // If the number of denoised neighbors is less than a threshold,
-      // do NOT denoise for the block. Set different threshold for skin MB.
-      // The change of denoising status will not propagate.
-      if (metrics_[mb_index].is_skin) {
-        // The threshold is high (more strict) for non-skin MB where the
-        // trailing usually happen.
-        if (metrics_[mb_index].denoise &&
-            metrics_[mb_index + 1].denoise + metrics_[mb_index - 1].denoise +
-                    metrics_[mb_index + mb_cols].denoise +
-                    metrics_[mb_index - mb_cols].denoise <=
-                2) {
-          metrics_[mb_index].denoise = 0;
-          filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
-        }
-      } else if (metrics_[mb_index].denoise &&
-                 metrics_[mb_index + 1].denoise +
-                         metrics_[mb_index - 1].denoise +
-                         metrics_[mb_index + mb_cols + 1].denoise +
-                         metrics_[mb_index + mb_cols - 1].denoise +
-                         metrics_[mb_index - mb_cols + 1].denoise +
-                         metrics_[mb_index - mb_cols - 1].denoise +
-                         metrics_[mb_index + mb_cols].denoise +
-                         metrics_[mb_index - mb_cols].denoise <=
-                     7) {
-        filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
-      }
+#if EXPERIMENTAL
+// Classifies the MB position (1: close to the center, 3: close to the border).
+static int PositionCheck(int mb_row, int mb_col, int mb_rows, int mb_cols) {
+  if ((mb_row >= (mb_rows >> 3)) && (mb_row <= (7 * mb_rows >> 3)) &&
+      (mb_col >= (mb_cols >> 3)) && (mb_col <= (7 * mb_cols >> 3)))
+    return 1;  // Row and col both inside [1/8, 7/8] of the MB grid.
+  else if ((mb_row >= (mb_rows >> 4)) && (mb_row <= (15 * mb_rows >> 4)) &&
+           (mb_col >= (mb_cols >> 4)) && (mb_col <= (15 * mb_cols >> 4)))
+    return 2;  // Row and col both inside [1/16, 15/16] of the MB grid.
+  else
+    return 3;  // Everything else, i.e. the outermost MBs.
+}
+
+static void ReduceFalseDetection(const std::unique_ptr<uint8_t[]>& d_status,
+                                 std::unique_ptr<uint8_t[]>* d_status_tmp1,
+                                 std::unique_ptr<uint8_t[]>* d_status_tmp2,
+                                 int noise_level,
+                                 int mb_rows,
+                                 int mb_cols) {
+  // Draft (can be optimized): reduces false moving-object detections.
+  // NOTE(review): if noise_level != 0 and mb_rows <= 8, rows +/- 1 look OOB.
+  int mb_row_min = noise_level ? mb_rows >> 3 : 1;
+  int mb_col_min = noise_level ? mb_cols >> 3 : 1;
+  int mb_row_max = noise_level ? (7 * mb_rows >> 3) : mb_rows - 2;
+  int mb_col_max = noise_level ? (7 * mb_cols >> 3) : mb_cols - 2;
+  memcpy((*d_status_tmp1).get(), d_status.get(), mb_rows * mb_cols);
+  // Top-left sweep: OR each MB with its upper and left neighbors.
+  for (int mb_row = mb_row_min; mb_row <= mb_row_max; ++mb_row) {
+    for (int mb_col = mb_col_min; mb_col <= mb_col_max; ++mb_col) {
+      (*d_status_tmp1)[mb_row * mb_cols + mb_col] |=
+          ((*d_status_tmp1)[(mb_row - 1) * mb_cols + mb_col] |
+           (*d_status_tmp1)[mb_row * mb_cols + mb_col - 1]);
+    }
+  }
+  memcpy((*d_status_tmp2).get(), (*d_status_tmp1).get(), mb_rows * mb_cols);
+  memcpy((*d_status_tmp1).get(), d_status.get(), mb_rows * mb_cols);
+  // Bottom-left sweep: OR with lower/left neighbors, then AND into tmp2.
+  for (int mb_row = mb_row_max; mb_row >= mb_row_min; --mb_row) {
+    for (int mb_col = mb_col_min; mb_col <= mb_col_max; ++mb_col) {
+      (*d_status_tmp1)[mb_row * mb_cols + mb_col] |=
+          ((*d_status_tmp1)[(mb_row + 1) * mb_cols + mb_col] |
+           (*d_status_tmp1)[mb_row * mb_cols + mb_col - 1]);
+      (*d_status_tmp2)[mb_row * mb_cols + mb_col] &=
+          (*d_status_tmp1)[mb_row * mb_cols + mb_col];
+    }
+  }
+  memcpy((*d_status_tmp1).get(), d_status.get(), mb_rows * mb_cols);
+  // Top-right sweep: OR with upper/right neighbors, then AND into tmp2.
+  for (int mb_row = mb_row_min; mb_row <= mb_row_max; ++mb_row) {
+    for (int mb_col = mb_col_max; mb_col >= mb_col_min; --mb_col) {
+      (*d_status_tmp1)[mb_row * mb_cols + mb_col] |=
+          ((*d_status_tmp1)[(mb_row - 1) * mb_cols + mb_col] |
+           (*d_status_tmp1)[mb_row * mb_cols + mb_col + 1]);
+      (*d_status_tmp2)[mb_row * mb_cols + mb_col] &=
+          (*d_status_tmp1)[mb_row * mb_cols + mb_col];
+    }
+  }
+  memcpy((*d_status_tmp1).get(), d_status.get(), mb_rows * mb_cols);
+  // Bottom-right sweep: OR with lower/right neighbors, then AND into tmp2.
+  for (int mb_row = mb_row_max; mb_row >= mb_row_min; --mb_row) {
+    for (int mb_col = mb_col_max; mb_col >= mb_col_min; --mb_col) {
+      (*d_status_tmp1)[mb_row * mb_cols + mb_col] |=
+          ((*d_status_tmp1)[(mb_row + 1) * mb_cols + mb_col] |
+           (*d_status_tmp1)[mb_row * mb_cols + mb_col + 1]);
+      (*d_status_tmp2)[mb_row * mb_cols + mb_col] &=
+          (*d_status_tmp1)[mb_row * mb_cols + mb_col];
     }
   }
 }
 
+static bool TrailingBlock(const std::unique_ptr<uint8_t[]>& d_status,
+                          int mb_row,
+                          int mb_col,
+                          int mb_rows,
+                          int mb_cols) {  // True if a 4-neighbor is flagged.
+  int mb_index = mb_row * mb_cols + mb_col;
+  if (!mb_row || !mb_col || mb_row == mb_rows - 1 || mb_col == mb_cols - 1)
+    return false;  // Border MBs: return before reading any neighbor.
+  return d_status[mb_index + 1] || d_status[mb_index - 1] ||
+         d_status[mb_index + mb_cols] || d_status[mb_index - mb_cols];
+}
+#endif
+
+#if DISPLAY
+void ShowRect(const std::unique_ptr<DenoiserFilter>& filter,
+              const std::unique_ptr<uint8_t[]>& d_status,
+              const std::unique_ptr<uint8_t[]>& d_status_tmp2,
+              const std::unique_ptr<uint8_t[]>& x_density,
+              const std::unique_ptr<uint8_t[]>& y_density,
+              const uint8_t* u_src,
+              const uint8_t* v_src,
+              uint8_t* u_dst,
+              uint8_t* v_dst,
+              int mb_rows,
+              int mb_cols,
+              int stride_u,
+              int stride_v) {  // Debug overlay: tints chroma per MB status.
+  for (int mb_row = 0; mb_row < mb_rows; ++mb_row) {
+    for (int mb_col = 0; mb_col < mb_cols; ++mb_col) {
+      int mb_index = mb_row * mb_cols + mb_col;
+      const uint8_t* mb_src_u =
+          u_src + (mb_row << 3) * stride_u + (mb_col << 3);
+      const uint8_t* mb_src_v =
+          v_src + (mb_row << 3) * stride_v + (mb_col << 3);
+      uint8_t* mb_dst_u = u_dst + (mb_row << 3) * stride_u + (mb_col << 3);
+      uint8_t* mb_dst_v = v_dst + (mb_row << 3) * stride_v + (mb_col << 3);
+      uint8_t y_tmp_255[8 * 8];  // Constant 8x8 chroma patch, filled below.
+      memset(y_tmp_255, 200, 8 * 8);  // Fill value is 200, not 255.
+      // MBs with d_status == 1 take priority over the x/y density test.
+      if (d_status[mb_index] == 1) {
+        // Paint to red: keep source U, force V to the constant patch.
+        filter->CopyMem8x8(mb_src_u, stride_u, mb_dst_u, stride_u);
+        filter->CopyMem8x8(y_tmp_255, 8, mb_dst_v, stride_v);
+#if EXPERIMENTAL
+      } else if (d_status_tmp2[mb_row * mb_cols + mb_col] &&
+                 x_density[mb_col] * y_density[mb_row]) {
+#else
+      } else if (x_density[mb_col] * y_density[mb_row]) {
+#endif
+        // Paint to blue: force U to the constant patch, keep source V.
+        filter->CopyMem8x8(y_tmp_255, 8, mb_dst_u, stride_u);
+        filter->CopyMem8x8(mb_src_v, stride_v, mb_dst_v, stride_v);
+      } else {  // Neither case: pass the source chroma through.
+        filter->CopyMem8x8(mb_src_u, stride_u, mb_dst_u, stride_u);
+        filter->CopyMem8x8(mb_src_v, stride_v, mb_dst_v, stride_v);
+      }
+    }
+  }
+}
+#endif
+
 void VideoDenoiser::DenoiseFrame(const VideoFrame& frame,
-                                 VideoFrame* denoised_frame) {
+                                 VideoFrame* denoised_frame,
+                                 VideoFrame* denoised_frame_prev,
+                                 int noise_level_prev) {
   int stride_y = frame.stride(kYPlane);
   int stride_u = frame.stride(kUPlane);
   int stride_v = frame.stride(kVPlane);
@@ -71,9 +165,13 @@
     denoised_frame->CreateFrame(frame.buffer(kYPlane), frame.buffer(kUPlane),
                                 frame.buffer(kVPlane), width_, height_,
                                 stride_y, stride_u, stride_v, kVideoRotation_0);
+    denoised_frame_prev->CreateFrame(
+        frame.buffer(kYPlane), frame.buffer(kUPlane), frame.buffer(kVPlane),
+        width_, height_, stride_y, stride_u, stride_v, kVideoRotation_0);
     // Setting time parameters to the output frame.
     denoised_frame->set_timestamp(frame.timestamp());
     denoised_frame->set_render_time_ms(frame.render_time_ms());
+    ne_->Init(width_, height_, cpu_type_);
     return;
   }
   // For 16x16 block.
@@ -81,62 +179,128 @@
   int mb_rows = height_ >> 4;
   if (metrics_.get() == nullptr)
     metrics_.reset(new DenoiseMetrics[mb_cols * mb_rows]());
+  if (d_status_.get() == nullptr) {
+    d_status_.reset(new uint8_t[mb_cols * mb_rows]());
+#if EXPERIMENTAL
+    d_status_tmp1_.reset(new uint8_t[mb_cols * mb_rows]());
+    d_status_tmp2_.reset(new uint8_t[mb_cols * mb_rows]());
+#endif
+    x_density_.reset(new uint8_t[mb_cols]());
+    y_density_.reset(new uint8_t[mb_rows]());
+  }
+
   // Denoise on Y plane.
   uint8_t* y_dst = denoised_frame->buffer(kYPlane);
   uint8_t* u_dst = denoised_frame->buffer(kUPlane);
   uint8_t* v_dst = denoised_frame->buffer(kVPlane);
+  uint8_t* y_dst_prev = denoised_frame_prev->buffer(kYPlane);
   const uint8_t* y_src = frame.buffer(kYPlane);
   const uint8_t* u_src = frame.buffer(kUPlane);
   const uint8_t* v_src = frame.buffer(kVPlane);
+  uint8_t noise_level = noise_level_prev == -1 ? 0 : ne_->GetNoiseLevel();
   // Temporary buffer to store denoising result.
   uint8_t y_tmp[16 * 16] = {0};
+  memset(x_density_.get(), 0, mb_cols);
+  memset(y_density_.get(), 0, mb_rows);
+
+  // Loop over blocks to accumulate/extract noise level and update x/y_density
+  // factors for moving object detection.
+  for (int mb_row = 0; mb_row < mb_rows; ++mb_row) {
+    for (int mb_col = 0; mb_col < mb_cols; ++mb_col) {
+      const uint8_t* mb_src = y_src + (mb_row << 4) * stride_y + (mb_col << 4);
+      uint8_t* mb_dst_prev =
+          y_dst_prev + (mb_row << 4) * stride_y + (mb_col << 4);
+      int mb_index = mb_row * mb_cols + mb_col;
+#if EXPERIMENTAL
+      int pos_factor = PositionCheck(mb_row, mb_col, mb_rows, mb_cols);
+      uint32_t thr_var_adp = 16 * 16 * 5 * (noise_level ? pos_factor : 1);
+#else
+      uint32_t thr_var_adp = 16 * 16 * 5;
+#endif
+      int brightness = 0;
+      for (int i = 0; i < 16; ++i) {
+        for (int j = 0; j < 16; ++j) {
+          brightness += mb_src[i * stride_y + j];
+        }
+      }
+
+      // Get the denoised block.
+      filter_->MbDenoise(mb_dst_prev, stride_y, y_tmp, 16, mb_src, stride_y, 0,
+                         1, true);
+      // The variance is based on the denoised blocks in time T and T-1.
+      metrics_[mb_index].var = filter_->Variance16x8(
+          mb_dst_prev, stride_y, y_tmp, 16, &metrics_[mb_index].sad);
+
+      if (metrics_[mb_index].var > thr_var_adp) {
+        ne_->ResetConsecLowVar(mb_index);
+        d_status_[mb_index] = 1;
+#if EXPERIMENTAL
+        if (noise_level == 0 || pos_factor < 3) {
+          x_density_[mb_col] += 1;
+          y_density_[mb_row] += 1;
+        }
+#else
+        x_density_[mb_col] += 1;
+        y_density_[mb_row] += 1;
+#endif
+      } else {
+        uint32_t sse_t = 0;
+        // The variance is based on the src blocks in time T and denoised block
+        // in time T-1.
+        uint32_t noise_var = filter_->Variance16x8(mb_dst_prev, stride_y,
+                                                   mb_src, stride_y, &sse_t);
+        ne_->GetNoise(mb_index, noise_var, brightness);
+        d_status_[mb_index] = 0;
+      }
+      // Track denoised frame.
+      filter_->CopyMem16x16(y_tmp, 16, mb_dst_prev, stride_y);
+    }
+  }
+
+#if EXPERIMENTAL
+  ReduceFalseDetection(d_status_, &d_status_tmp1_, &d_status_tmp2_, noise_level,
+                       mb_rows, mb_cols);
+#endif
+
+  // Denoise each MB based on the results of moving objects detection.
   for (int mb_row = 0; mb_row < mb_rows; ++mb_row) {
     for (int mb_col = 0; mb_col < mb_cols; ++mb_col) {
       const uint8_t* mb_src = y_src + (mb_row << 4) * stride_y + (mb_col << 4);
       uint8_t* mb_dst = y_dst + (mb_row << 4) * stride_y + (mb_col << 4);
-      int mb_index = mb_row * mb_cols + mb_col;
-      // Denoise each MB at the very start and save the result to a temporary
-      // buffer.
-      if (filter_->MbDenoise(mb_dst, stride_y, y_tmp, 16, mb_src, stride_y, 0,
-                             1) == FILTER_BLOCK) {
-        uint32_t thr_var = 0;
-        // Save var and sad to the buffer.
-        metrics_[mb_index].var = filter_->Variance16x8(
-            mb_dst, stride_y, y_tmp, 16, &metrics_[mb_index].sad);
-        // Get skin map.
-        metrics_[mb_index].is_skin = MbHasSkinColor(
-            y_src, u_src, v_src, stride_y, stride_u, stride_v, mb_row, mb_col);
-        // Variance threshold for skin/non-skin MB is different.
-        // Skin MB use a small threshold to reduce blockiness.
-        thr_var = metrics_[mb_index].is_skin ? 128 : 12 * 128;
-        if (metrics_[mb_index].var > thr_var) {
-          metrics_[mb_index].denoise = 0;
-          // Use the source MB.
-          filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
-        } else {
-          metrics_[mb_index].denoise = 1;
-          // Use the denoised MB.
-          filter_->CopyMem16x16(y_tmp, 16, mb_dst, stride_y);
-        }
-      } else {
-        metrics_[mb_index].denoise = 0;
-        filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
-      }
-      // Copy source U/V plane.
       const uint8_t* mb_src_u =
           u_src + (mb_row << 3) * stride_u + (mb_col << 3);
       const uint8_t* mb_src_v =
           v_src + (mb_row << 3) * stride_v + (mb_col << 3);
       uint8_t* mb_dst_u = u_dst + (mb_row << 3) * stride_u + (mb_col << 3);
       uint8_t* mb_dst_v = v_dst + (mb_row << 3) * stride_v + (mb_col << 3);
+#if EXPERIMENTAL
+      if ((!d_status_tmp2_[mb_row * mb_cols + mb_col] ||
+           x_density_[mb_col] * y_density_[mb_row] == 0) &&
+          !TrailingBlock(d_status_, mb_row, mb_col, mb_rows, mb_cols)) {
+#else
+      if (x_density_[mb_col] * y_density_[mb_row] == 0) {
+#endif
+        if (filter_->MbDenoise(mb_dst, stride_y, y_tmp, 16, mb_src, stride_y, 0,
+                               noise_level, false) == FILTER_BLOCK) {
+          filter_->CopyMem16x16(y_tmp, 16, mb_dst, stride_y);
+        } else {
+          // Copy y source.
+          filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
+        }
+      } else {
+        // Copy y source.
+        filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
+      }
       filter_->CopyMem8x8(mb_src_u, stride_u, mb_dst_u, stride_u);
       filter_->CopyMem8x8(mb_src_v, stride_v, mb_dst_v, stride_v);
     }
   }
-  // Second round.
-  // This is to reduce the trailing artifact and blockiness by referring
-  // neighbors' denoising status.
-  TrailingReduction(mb_rows, mb_cols, y_src, stride_y, y_dst);
+
+#if DISPLAY  // Rectangle diagnostics
+  // Show rectangular region
+  ShowRect(filter_, d_status_, d_status_tmp2_, x_density_, y_density_, u_src,
+           v_src, u_dst, v_dst, mb_rows, mb_cols, stride_u, stride_v);
+#endif
 
   // Setting time parameters to the output frame.
   denoised_frame->set_timestamp(frame.timestamp());