From 94f28437744b33604cfc43a122298f19da5ce1e4 Mon Sep 17 00:00:00 2001
From: Seungha Yang <seungha.yang@navercorp.com>
Date: Mon, 2 Sep 2019 14:13:26 +0900
Subject: [PATCH] nvenc: Refactoring internal buffer pool structure

To support rc-lookahead and B-frame encoding, nvenc needs one more
staging queue, because NvEncEncodePicture can return
NV_ENC_ERR_NEED_MORE_INPUT, which was not handled so far.
As documented in the NVENC programming guide, pending buffers must wait
for further input until NvEncEncodePicture returns success.

The new encoding flow is:
- Submit a raw picture buffer to the encoder with NvEncEncodePicture
- Queue the submitted input/output buffer pair to pending_queue
  - If NvEncEncodePicture returned success, move all pairs in
    pending_queue to the final stage
  - Otherwise, wait for more raw input pictures

Another change is dropping the use of NV_ENC_LOCK_INPUT_BUFFER, so nvenc
now always uses CUDA memory as its input buffer. As a result, the
handling of OpenGL and system memory input is unified.
---
 sys/nvcodec/gstnvbaseenc.c | 867 +++++++++++++++++--------------------
 sys/nvcodec/gstnvbaseenc.h |  25 +-
 2 files changed, 417 insertions(+), 475 deletions(-)

diff --git a/sys/nvcodec/gstnvbaseenc.c b/sys/nvcodec/gstnvbaseenc.c
index 992204486f..a528fe5c8b 100644
--- a/sys/nvcodec/gstnvbaseenc.c
+++ b/sys/nvcodec/gstnvbaseenc.c
@@ -187,28 +187,33 @@ enum
  * some period of time. */
 G_LOCK_DEFINE_STATIC (initialization_lock);
 
-#if HAVE_NVCODEC_GST_GL
 typedef struct
 {
-  GstGLMemory *gl_mem[GST_VIDEO_MAX_PLANES];
-  CUgraphicsResource cuda_texture;
-  CUdeviceptr cuda_plane_pointers[GST_VIDEO_MAX_PLANES];
-  gpointer cuda_pointer;
+  /* Allocated CUDA device memory and registered to NVENC to be used as input
+   * buffer regardless of the input memory type (OpenGL or System memory) */
+  CUdeviceptr cuda_pointer;
+
+  /* The stride of allocated CUDA device memory (CuMemAllocPitch).
+   * This might be different from the stride of GstVideoInfo */
   gsize cuda_stride;
-  gsize cuda_num_bytes;
+
+  /* Registered NVENC resource (cuda_pointer is used for this) */
   NV_ENC_REGISTER_RESOURCE nv_resource;
+
+  /* Mapped resource of nv_resource */
   NV_ENC_MAP_INPUT_RESOURCE nv_mapped_resource;
 
   /* whether nv_mapped_resource was mapped via NvEncMapInputResource()
    * and therefore should unmap via NvEncUnmapInputResource or not */
   gboolean mapped;
-} GstNvEncGLResource;
-#endif
+} GstNvEncInputResource;
 
+/* A pair of GstNvEncInputResource and NV_ENC_OUTPUT_PTR.
+ * The numbers of input and output resources are always identical */
 typedef struct
 {
-  gpointer in_buf;
-  gpointer out_buf;
+  GstNvEncInputResource *in_buf;
+  NV_ENC_OUTPUT_PTR out_buf;
 } GstNvEncFrameState;
 
 static gboolean gst_nv_base_enc_open (GstVideoEncoder * enc);
@@ -453,7 +458,8 @@ gst_nv_base_enc_sink_query (GstVideoEncoder * enc, GstQuery * query)
         gboolean ret;
 
         ret = gst_gl_handle_context_query ((GstElement *) nvenc, query,
-            nvenc->display, NULL, nvenc->other_context);
+            (GstGLDisplay *) nvenc->display, NULL,
+            (GstGLContext *) nvenc->other_context);
         if (nvenc->display) {
           gst_gl_display_filter_gl_api (GST_GL_DISPLAY (nvenc->display),
               SUPPORTED_GL_APIS);
@@ -477,9 +483,10 @@ gst_nv_base_enc_start (GstVideoEncoder * enc)
 {
   GstNvBaseEnc *nvenc = GST_NV_BASE_ENC (enc);
 
-  nvenc->bitstream_pool = g_async_queue_new ();
+  nvenc->available_queue = g_async_queue_new ();
+  nvenc->pending_queue = g_async_queue_new ();
   nvenc->bitstream_queue = g_async_queue_new ();
-  nvenc->in_bufs_pool = g_async_queue_new ();
+  nvenc->items = g_array_new (FALSE, TRUE, sizeof (GstNvEncFrameState));
 
   nvenc->last_flow = GST_FLOW_OK;
   memset (&nvenc->init_params, 0, sizeof (NV_ENC_INITIALIZE_PARAMS));
@@ -513,18 +520,18 @@ gst_nv_base_enc_stop (GstVideoEncoder * enc)
     nvenc->input_state = NULL;
   }
 
-  if (nvenc->bitstream_pool) {
-    g_async_queue_unref (nvenc->bitstream_pool);
-    nvenc->bitstream_pool = NULL;
+  if (nvenc->available_queue) {
+    g_async_queue_unref (nvenc->available_queue);
+    nvenc->available_queue = NULL;
+  }
+  if (nvenc->pending_queue) {
+    g_async_queue_unref (nvenc->pending_queue);
+    nvenc->pending_queue = NULL;
   }
   if (nvenc->bitstream_queue) {
     g_async_queue_unref (nvenc->bitstream_queue);
     nvenc->bitstream_queue = NULL;
   }
-  if (nvenc->in_bufs_pool) {
-    g_async_queue_unref (nvenc->in_bufs_pool);
-    nvenc->in_bufs_pool = NULL;
-  }
   if (nvenc->display) {
     gst_object_unref (nvenc->display);
     nvenc->display = NULL;
@@ -534,6 +541,11 @@ gst_nv_base_enc_stop (GstVideoEncoder * enc)
     nvenc->other_context = NULL;
   }
 
+  if (nvenc->items) {
+    g_array_free (nvenc->items, TRUE);
+    nvenc->items = NULL;
+  }
+
   return TRUE;
 }
 
@@ -758,12 +770,6 @@ gst_nv_base_enc_close (GstVideoEncoder * enc)
     nvenc->input_state = NULL;
   }
 
-  if (nvenc->bitstream_pool != NULL) {
-    g_assert (g_async_queue_length (nvenc->bitstream_pool) == 0);
-    g_async_queue_unref (nvenc->bitstream_pool);
-    nvenc->bitstream_pool = NULL;
-  }
-
   return ret;
 }
 
@@ -833,6 +839,7 @@ gst_nv_base_enc_bitstream_thread (gpointer user_data)
 {
   GstVideoEncoder *enc = user_data;
   GstNvBaseEnc *nvenc = user_data;
+  GstFlowReturn flow = GST_FLOW_OK;
 
   /* overview of operation:
    * 1. retreive the next buffer submitted to the bitstream pool
@@ -847,19 +854,23 @@ gst_nv_base_enc_bitstream_thread (gpointer user_data)
    */
   do {
     GstBuffer *buffer = NULL;
+    GstNvEncFrameState *state_in_queue = NULL;
     GstNvEncFrameState *state = NULL;
     GstVideoCodecFrame *frame = NULL;
     NVENCSTATUS nv_ret;
-    GstFlowReturn flow = GST_FLOW_OK;
     NV_ENC_LOCK_BITSTREAM lock_bs = { 0, };
     NV_ENC_OUTPUT_PTR out_buf;
+    GstNvEncInputResource *resource;
 
     GST_LOG_OBJECT (enc, "wait for bitstream buffer..");
 
-    out_buf = g_async_queue_pop (nvenc->bitstream_queue);
-    if ((gpointer) out_buf == SHUTDOWN_COOKIE)
+    state_in_queue = g_async_queue_pop (nvenc->bitstream_queue);
+    if ((gpointer) state_in_queue == SHUTDOWN_COOKIE)
       goto exit_thread;
 
+    out_buf = state_in_queue->out_buf;
+    resource = state_in_queue->in_buf;
+
     GST_LOG_OBJECT (nvenc, "waiting for output buffer %p to be ready", out_buf);
 
     lock_bs.version = gst_nvenc_get_lock_bitstream_version ();
@@ -869,13 +880,20 @@ gst_nv_base_enc_bitstream_thread (gpointer user_data)
     /* FIXME: this would need to be updated for other slice modes */
     lock_bs.sliceOffsets = NULL;
 
+    if (!gst_cuda_context_push (nvenc->cuda_ctx)) {
+      GST_ELEMENT_ERROR (nvenc, LIBRARY, ENCODE, (NULL),
+          ("Failed to push current context"));
+      goto error_shutdown;
+    }
+
     nv_ret = NvEncLockBitstream (nvenc->encoder, &lock_bs);
     if (nv_ret != NV_ENC_SUCCESS) {
-      /* FIXME: what to do here? */
+      gst_cuda_context_pop (NULL);
+
       GST_ELEMENT_ERROR (nvenc, STREAM, ENCODE, (NULL),
           ("Failed to lock bitstream buffer %p, ret %d",
               lock_bs.outputBitstream, nv_ret));
-      goto exit_thread;
+      goto error_shutdown;
     }
 
     frame = _find_frame_with_output_buffer (nvenc, out_buf);
@@ -896,58 +914,63 @@ gst_nv_base_enc_bitstream_thread (gpointer user_data)
     /* TODO: check pts/dts is handled properly if there are B-frames */
 
     nv_ret = NvEncUnlockBitstream (nvenc->encoder, state->out_buf);
+
     if (nv_ret != NV_ENC_SUCCESS) {
-      /* FIXME: what to do here? */
+      gst_cuda_context_pop (NULL);
+
       GST_ELEMENT_ERROR (nvenc, STREAM, ENCODE, (NULL),
           ("Failed to unlock bitstream buffer %p, ret %d",
               lock_bs.outputBitstream, nv_ret));
       gst_buffer_unref (buffer);
       gst_video_encoder_finish_frame (enc, frame);
 
-      goto exit_thread;
+      goto error_shutdown;
     }
 
-    GST_LOG_OBJECT (nvenc, "returning bitstream buffer %p to pool",
-        state->out_buf);
-    g_async_queue_push (nvenc->bitstream_pool, state->out_buf);
-
     frame->output_buffer = buffer;
 
-#if HAVE_NVCODEC_GST_GL
-    if (nvenc->gl_input) {
-      GstNvEncGLResource *in_gl_resource = state->in_buf;
+    nv_ret =
+        NvEncUnmapInputResource (nvenc->encoder,
+        resource->nv_mapped_resource.mappedResource);
+    resource->mapped = FALSE;
 
-      nv_ret =
-          NvEncUnmapInputResource (nvenc->encoder,
-          in_gl_resource->nv_mapped_resource.mappedResource);
-      in_gl_resource->mapped = FALSE;
-
-      if (nv_ret != NV_ENC_SUCCESS) {
-        GST_ERROR_OBJECT (nvenc, "Failed to unmap input resource %p, ret %d",
-            in_gl_resource, nv_ret);
-      }
-
-      memset (&in_gl_resource->nv_mapped_resource, 0,
-          sizeof (in_gl_resource->nv_mapped_resource));
+    if (nv_ret != NV_ENC_SUCCESS) {
+      GST_ERROR_OBJECT (nvenc, "Failed to unmap input resource %p, ret %d",
+          resource, nv_ret);
     }
-#endif
-    g_async_queue_push (nvenc->in_bufs_pool, state->in_buf);
+
+    gst_cuda_context_pop (NULL);
+
+    memset (&resource->nv_mapped_resource, 0,
+        sizeof (resource->nv_mapped_resource));
+
+    g_async_queue_push (nvenc->available_queue, state_in_queue);
 
     flow = gst_video_encoder_finish_frame (enc, frame);
 
     if (flow != GST_FLOW_OK) {
       GST_INFO_OBJECT (enc, "got flow %s", gst_flow_get_name (flow));
       g_atomic_int_set (&nvenc->last_flow, flow);
-      g_async_queue_push (nvenc->in_bufs_pool, SHUTDOWN_COOKIE);
+      g_async_queue_push (nvenc->available_queue, SHUTDOWN_COOKIE);
       goto exit_thread;
     }
   }
   while (TRUE);
 
-exit_thread:
-  GST_INFO_OBJECT (nvenc, "exiting thread");
+error_shutdown:
+  {
+    g_atomic_int_set (&nvenc->last_flow, GST_FLOW_ERROR);
+    g_async_queue_push (nvenc->available_queue, SHUTDOWN_COOKIE);
 
-  return NULL;
+    goto exit_thread;
+  }
+
+exit_thread:
+  {
+    GST_INFO_OBJECT (nvenc, "exiting thread");
+
+    return NULL;
+  }
 }
 
 static gboolean
@@ -974,20 +997,27 @@ gst_nv_base_enc_start_bitstream_thread (GstNvBaseEnc * nvenc)
 static gboolean
 gst_nv_base_enc_stop_bitstream_thread (GstNvBaseEnc * nvenc, gboolean force)
 {
-  gpointer out_buf;
+  GstNvEncFrameState *state;
 
   if (nvenc->bitstream_thread == NULL)
     return TRUE;
 
+  /* Always send an EOS packet to flush the GPU. Otherwise, random crashes
+   * happen during NvEncDestroyEncoder, especially when rc-lookahead or
+   * B-frames are enabled */
+  gst_nv_base_enc_drain_encoder (nvenc);
+
   if (force) {
+    g_async_queue_lock (nvenc->available_queue);
+    g_async_queue_lock (nvenc->pending_queue);
     g_async_queue_lock (nvenc->bitstream_queue);
-    g_async_queue_lock (nvenc->bitstream_pool);
-    while ((out_buf = g_async_queue_try_pop_unlocked (nvenc->bitstream_queue))) {
-      GST_INFO_OBJECT (nvenc, "stole bitstream buffer %p from queue", out_buf);
-      g_async_queue_push_unlocked (nvenc->bitstream_pool, out_buf);
+    while ((state = g_async_queue_try_pop_unlocked (nvenc->bitstream_queue))) {
+      GST_INFO_OBJECT (nvenc, "stole bitstream buffer %p from queue", state);
+      g_async_queue_push_unlocked (nvenc->available_queue, state);
     }
     g_async_queue_push_unlocked (nvenc->bitstream_queue, SHUTDOWN_COOKIE);
-    g_async_queue_unlock (nvenc->bitstream_pool);
+    g_async_queue_unlock (nvenc->available_queue);
+    g_async_queue_unlock (nvenc->pending_queue);
     g_async_queue_unlock (nvenc->bitstream_queue);
   } else {
     /* wait for encoder to drain the remaining buffers */
@@ -1016,21 +1046,21 @@ gst_nv_base_enc_reset_queues (GstNvBaseEnc * nvenc, gboolean refill)
 
   GST_INFO_OBJECT (nvenc, "clearing queues");
 
+  while ((ptr = g_async_queue_try_pop (nvenc->available_queue))) {
+    /* do nothing */
+  }
+  while ((ptr = g_async_queue_try_pop (nvenc->pending_queue))) {
+    /* do nothing */
+  }
   while ((ptr = g_async_queue_try_pop (nvenc->bitstream_queue))) {
     /* do nothing */
   }
-  while ((ptr = g_async_queue_try_pop (nvenc->bitstream_pool))) {
-    /* do nothing */
-  }
-  while ((ptr = g_async_queue_try_pop (nvenc->in_bufs_pool))) {
-    /* do nothing */
-  }
 
   if (refill) {
     GST_INFO_OBJECT (nvenc, "refilling buffer pools");
     for (i = 0; i < nvenc->n_bufs; ++i) {
-      g_async_queue_push (nvenc->bitstream_pool, nvenc->input_bufs[i]);
-      g_async_queue_push (nvenc->in_bufs_pool, nvenc->output_bufs[i]);
+      g_async_queue_push (nvenc->available_queue,
+          &g_array_index (nvenc->items, GstNvEncFrameState, i));
     }
   }
 }
@@ -1047,56 +1077,41 @@ gst_nv_base_enc_free_buffers (GstNvBaseEnc * nvenc)
 
   gst_nv_base_enc_reset_queues (nvenc, FALSE);
 
+  gst_cuda_context_push (nvenc->cuda_ctx);
   for (i = 0; i < nvenc->n_bufs; ++i) {
-    NV_ENC_OUTPUT_PTR out_buf = nvenc->output_bufs[i];
+    NV_ENC_OUTPUT_PTR out_buf =
+        g_array_index (nvenc->items, GstNvEncFrameState, i).out_buf;
+    GstNvEncInputResource *in_buf =
+        g_array_index (nvenc->items, GstNvEncFrameState, i).in_buf;
 
-#if HAVE_NVCODEC_GST_GL
-    if (nvenc->gl_input) {
-      GstNvEncGLResource *in_gl_resource = nvenc->input_bufs[i];
-
-      gst_cuda_context_push (nvenc->cuda_ctx);
-
-      if (in_gl_resource->mapped) {
-        GST_LOG_OBJECT (nvenc, "Unmap resource %p", in_gl_resource);
-
-        nv_ret =
-            NvEncUnmapInputResource (nvenc->encoder,
-            in_gl_resource->nv_mapped_resource.mappedResource);
-
-        if (nv_ret != NV_ENC_SUCCESS) {
-          GST_ERROR_OBJECT (nvenc, "Failed to unmap input resource %p, ret %d",
-              in_gl_resource, nv_ret);
-        }
-      }
+    if (in_buf->mapped) {
+      GST_LOG_OBJECT (nvenc, "Unmap resource %p", in_buf);
 
       nv_ret =
-          NvEncUnregisterResource (nvenc->encoder,
-          in_gl_resource->nv_resource.registeredResource);
-      if (nv_ret != NV_ENC_SUCCESS)
-        GST_ERROR_OBJECT (nvenc, "Failed to unregister resource %p, ret %d",
-            in_gl_resource, nv_ret);
+          NvEncUnmapInputResource (nvenc->encoder,
+          in_buf->nv_mapped_resource.mappedResource);
 
-      cuda_ret = CuMemFree ((CUdeviceptr) in_gl_resource->cuda_pointer);
-      if (!gst_cuda_result (cuda_ret)) {
-        GST_ERROR_OBJECT (nvenc, "Failed to free CUDA device memory, ret %d",
-            cuda_ret);
-      }
-
-      g_free (in_gl_resource);
-      gst_cuda_context_pop (NULL);
-    } else
-#endif
-    {
-      NV_ENC_INPUT_PTR in_buf = (NV_ENC_INPUT_PTR) nvenc->input_bufs[i];
-
-      GST_DEBUG_OBJECT (nvenc, "Destroying input buffer %p", in_buf);
-      nv_ret = NvEncDestroyInputBuffer (nvenc->encoder, in_buf);
       if (nv_ret != NV_ENC_SUCCESS) {
-        GST_ERROR_OBJECT (nvenc, "Failed to destroy input buffer %p, ret %d",
+        GST_ERROR_OBJECT (nvenc, "Failed to unmap input resource %p, ret %d",
             in_buf, nv_ret);
       }
     }
 
+    nv_ret =
+        NvEncUnregisterResource (nvenc->encoder,
+        in_buf->nv_resource.registeredResource);
+    if (nv_ret != NV_ENC_SUCCESS)
+      GST_ERROR_OBJECT (nvenc, "Failed to unregister resource %p, ret %d",
+          in_buf, nv_ret);
+
+    cuda_ret = CuMemFree (in_buf->cuda_pointer);
+    if (!gst_cuda_result (cuda_ret)) {
+      GST_ERROR_OBJECT (nvenc, "Failed to free CUDA device memory, ret %d",
+          cuda_ret);
+    }
+
+    g_free (in_buf);
+
     GST_DEBUG_OBJECT (nvenc, "Destroying output bitstream buffer %p", out_buf);
     nv_ret = NvEncDestroyBitstreamBuffer (nvenc->encoder, out_buf);
     if (nv_ret != NV_ENC_SUCCESS) {
@@ -1104,12 +1119,8 @@ gst_nv_base_enc_free_buffers (GstNvBaseEnc * nvenc)
           out_buf, nv_ret);
     }
   }
-
-  nvenc->n_bufs = 0;
-  g_free (nvenc->output_bufs);
-  nvenc->output_bufs = NULL;
-  g_free (nvenc->input_bufs);
-  nvenc->input_bufs = NULL;
+  gst_cuda_context_pop (NULL);
+  g_array_set_size (nvenc->items, 0);
 }
 
 static inline guint
@@ -1454,99 +1465,58 @@ gst_nv_base_enc_set_format (GstVideoEncoder * enc, GstVideoCodecState * state)
     nvenc->n_bufs = (num_macroblocks >= 8160) ? 32 : 48;
 
     /* input buffers */
-    nvenc->input_bufs = g_new0 (gpointer, nvenc->n_bufs);
+    g_array_set_size (nvenc->items, nvenc->n_bufs);
 
 #if HAVE_NVCODEC_GST_GL
     features = gst_caps_get_features (state->caps, 0);
     if (gst_caps_features_contains (features,
             GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) {
-      guint pixel_depth = 0;
       nvenc->gl_input = TRUE;
-
-      for (i = 0; i < GST_VIDEO_INFO_N_COMPONENTS (info); i++) {
-        pixel_depth += GST_VIDEO_INFO_COMP_DEPTH (info, i);
-      }
-
-      gst_cuda_context_push (nvenc->cuda_ctx);
-      for (i = 0; i < nvenc->n_bufs; ++i) {
-        GstNvEncGLResource *in_gl_resource = g_new0 (GstNvEncGLResource, 1);
-        CUresult cu_ret;
-
-        memset (&in_gl_resource->nv_resource, 0,
-            sizeof (in_gl_resource->nv_resource));
-        memset (&in_gl_resource->nv_mapped_resource, 0,
-            sizeof (in_gl_resource->nv_mapped_resource));
-
-        /* scratch buffer for non-contigious planer into a contigious buffer */
-        cu_ret =
-            CuMemAllocPitch ((CUdeviceptr *) & in_gl_resource->cuda_pointer,
-            &in_gl_resource->cuda_stride, _get_plane_width (info, 0),
-            _get_frame_data_height (info), 16);
-        if (!gst_cuda_result (CUDA_SUCCESS)) {
-          GST_ERROR_OBJECT (nvenc, "failed to alocate cuda scratch buffer "
-              "ret %d", cu_ret);
-          g_assert_not_reached ();
-        }
-
-        in_gl_resource->nv_resource.version =
-            gst_nvenc_get_registure_resource_version ();
-        in_gl_resource->nv_resource.resourceType =
-            NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
-        in_gl_resource->nv_resource.width = input_width;
-        in_gl_resource->nv_resource.height = input_height;
-        in_gl_resource->nv_resource.pitch = in_gl_resource->cuda_stride;
-        in_gl_resource->nv_resource.bufferFormat =
-            gst_nvenc_get_nv_buffer_format (GST_VIDEO_INFO_FORMAT (info));
-        in_gl_resource->nv_resource.resourceToRegister =
-            in_gl_resource->cuda_pointer;
-
-        nv_ret =
-            NvEncRegisterResource (nvenc->encoder,
-            &in_gl_resource->nv_resource);
-        if (nv_ret != NV_ENC_SUCCESS)
-          GST_ERROR_OBJECT (nvenc, "Failed to register resource %p, ret %d",
-              in_gl_resource, nv_ret);
-
-        nvenc->input_bufs[i] = in_gl_resource;
-        g_async_queue_push (nvenc->in_bufs_pool, nvenc->input_bufs[i]);
-      }
-
-      gst_cuda_context_pop (NULL);
-    } else
-#endif
-    {
-      for (i = 0; i < nvenc->n_bufs; ++i) {
-        NV_ENC_CREATE_INPUT_BUFFER cin_buf = { 0, };
-
-        cin_buf.version = gst_nvenc_get_create_input_buffer_version ();
-
-        cin_buf.width = GST_ROUND_UP_32 (input_width);
-        cin_buf.height = GST_ROUND_UP_32 (input_height);
-
-        cin_buf.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
-        cin_buf.bufferFmt =
-            gst_nvenc_get_nv_buffer_format (GST_VIDEO_INFO_FORMAT (info));
-
-        nv_ret = NvEncCreateInputBuffer (nvenc->encoder, &cin_buf);
-
-        if (nv_ret != NV_ENC_SUCCESS) {
-          GST_WARNING_OBJECT (enc, "Failed to allocate input buffer: %d",
-              nv_ret);
-          /* FIXME: clean up */
-          return FALSE;
-        }
-
-        nvenc->input_bufs[i] = cin_buf.inputBuffer;
-
-        GST_INFO_OBJECT (nvenc, "allocated  input buffer %2d: %p", i,
-            nvenc->input_bufs[i]);
-
-        g_async_queue_push (nvenc->in_bufs_pool, nvenc->input_bufs[i]);
-      }
     }
+#endif
+
+    gst_cuda_context_push (nvenc->cuda_ctx);
+    for (i = 0; i < nvenc->n_bufs; ++i) {
+      GstNvEncInputResource *resource = g_new0 (GstNvEncInputResource, 1);
+      CUresult cu_ret;
+
+      memset (&resource->nv_resource, 0, sizeof (resource->nv_resource));
+      memset (&resource->nv_mapped_resource, 0,
+          sizeof (resource->nv_mapped_resource));
+
+      /* scratch buffer to pack non-contiguous planes into a contiguous buffer */
+      cu_ret =
+          CuMemAllocPitch (&resource->cuda_pointer,
+          &resource->cuda_stride, _get_plane_width (info, 0),
+          _get_frame_data_height (info), 16);
+      if (!gst_cuda_result (cu_ret)) {
+        GST_ERROR_OBJECT (nvenc, "failed to alocate cuda scratch buffer "
+            "ret %d", cu_ret);
+        g_assert_not_reached ();
+      }
+
+      resource->nv_resource.version =
+          gst_nvenc_get_registure_resource_version ();
+      resource->nv_resource.resourceType =
+          NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
+      resource->nv_resource.width = input_width;
+      resource->nv_resource.height = input_height;
+      resource->nv_resource.pitch = resource->cuda_stride;
+      resource->nv_resource.bufferFormat =
+          gst_nvenc_get_nv_buffer_format (GST_VIDEO_INFO_FORMAT (info));
+      resource->nv_resource.resourceToRegister =
+          (gpointer) resource->cuda_pointer;
+
+      nv_ret = NvEncRegisterResource (nvenc->encoder, &resource->nv_resource);
+      if (nv_ret != NV_ENC_SUCCESS)
+        GST_ERROR_OBJECT (nvenc, "Failed to register resource %p, ret %d",
+            resource, nv_ret);
+
+      g_array_index (nvenc->items, GstNvEncFrameState, i).in_buf = resource;
+    }
+    gst_cuda_context_pop (NULL);
 
     /* output buffers */
-    nvenc->output_bufs = g_new0 (NV_ENC_OUTPUT_PTR, nvenc->n_bufs);
     for (i = 0; i < nvenc->n_bufs; ++i) {
       NV_ENC_CREATE_BITSTREAM_BUFFER cout_buf = { 0, };
 
@@ -1567,12 +1537,14 @@ gst_nv_base_enc_set_format (GstVideoEncoder * enc, GstVideoCodecState * state)
         return FALSE;
       }
 
-      nvenc->output_bufs[i] = cout_buf.bitstreamBuffer;
-
       GST_INFO_OBJECT (nvenc, "allocated output buffer %2d: %p", i,
-          nvenc->output_bufs[i]);
+          cout_buf.bitstreamBuffer);
 
-      g_async_queue_push (nvenc->bitstream_pool, nvenc->output_bufs[i]);
+      g_array_index (nvenc->items, GstNvEncFrameState, i).out_buf =
+          cout_buf.bitstreamBuffer;
+
+      g_async_queue_push (nvenc->available_queue, &g_array_index (nvenc->items,
+              GstNvEncFrameState, i));
     }
 
 #if 0
@@ -1608,7 +1580,6 @@ gst_nv_base_enc_set_format (GstVideoEncoder * enc, GstVideoCodecState * state)
   return TRUE;
 }
 
-#if HAVE_NVCODEC_GST_GL
 static guint
 _get_cuda_device_stride (GstVideoInfo * info, guint plane, gsize cuda_stride)
 {
@@ -1634,6 +1605,7 @@ _get_cuda_device_stride (GstVideoInfo * info, guint plane, gsize cuda_stride)
   }
 }
 
+#if HAVE_NVCODEC_GST_GL
 typedef struct _GstNvEncRegisterResourceData
 {
   GstMemory *mem;
@@ -1726,7 +1698,7 @@ typedef struct _GstNvEncGLMapData
   GstNvBaseEnc *nvenc;
   GstBuffer *buffer;
   GstVideoInfo *info;
-  GstNvEncGLResource *in_gl_resource;
+  GstNvEncInputResource *resource;
 
   gboolean ret;
 } GstNvEncGLMapData;
@@ -1734,8 +1706,9 @@ typedef struct _GstNvEncGLMapData
 static void
 _map_gl_input_buffer (GstGLContext * context, GstNvEncGLMapData * data)
 {
+  GstNvBaseEnc *nvenc = data->nvenc;
   CUresult cuda_ret;
-  guint8 *data_pointer;
+  CUdeviceptr data_pointer;
   guint i;
   CUDA_MEMCPY2D param;
   GstCudaGraphicsResource **resources;
@@ -1750,24 +1723,25 @@ _map_gl_input_buffer (GstGLContext * context, GstNvEncGLMapData * data)
     GstMemory *mem;
 
     mem = gst_buffer_peek_memory (data->buffer, i);
-    resources[i] = ensure_cuda_graphics_resource (mem, data->nvenc);
+    resources[i] = ensure_cuda_graphics_resource (mem, nvenc);
     if (!resources[i]) {
-      GST_ERROR_OBJECT (data->nvenc, "could not register %dth memory", i);
+      GST_ERROR_OBJECT (nvenc, "could not register %dth memory", i);
       return;
     }
   }
 
-  gst_cuda_context_push (data->nvenc->cuda_ctx);
-  data_pointer = data->in_gl_resource->cuda_pointer;
+  gst_cuda_context_push (nvenc->cuda_ctx);
+  data_pointer = data->resource->cuda_pointer;
   for (i = 0; i < GST_VIDEO_INFO_N_PLANES (data->info); i++) {
     GstGLBuffer *gl_buf_obj;
     GstGLMemoryPBO *gl_mem;
     guint src_stride, dest_stride;
     CUgraphicsResource cuda_resource;
+    gsize cuda_num_bytes;
+    CUdeviceptr cuda_plane_pointer;
 
     gl_mem = (GstGLMemoryPBO *) gst_buffer_peek_memory (data->buffer, i);
     g_return_if_fail (gst_is_gl_memory_pbo ((GstMemory *) gl_mem));
-    data->in_gl_resource->gl_mem[i] = GST_GL_MEMORY_CAST (gl_mem);
 
     gl_buf_obj = (GstGLBuffer *) gl_mem->pbo;
     g_return_if_fail (gl_buf_obj != NULL);
@@ -1776,76 +1750,118 @@ _map_gl_input_buffer (GstGLContext * context, GstNvEncGLMapData * data)
     gst_gl_memory_pbo_upload_transfer (gl_mem);
     gst_gl_memory_pbo_download_transfer (gl_mem);
 
-    GST_LOG_OBJECT (data->nvenc, "attempting to copy texture %u into cuda",
+    GST_LOG_OBJECT (nvenc, "attempting to copy texture %u into cuda",
         gl_mem->mem.tex_id);
 
     cuda_resource =
-        gst_cuda_graphics_resource_map (resources[i], data->nvenc->cuda_stream,
+        gst_cuda_graphics_resource_map (resources[i], nvenc->cuda_stream,
         CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
 
     if (!cuda_resource) {
-      GST_ERROR_OBJECT (data->nvenc, "failed to map GL texture %u into cuda",
+      GST_ERROR_OBJECT (nvenc, "failed to map GL texture %u into cuda",
           gl_mem->mem.tex_id);
       g_assert_not_reached ();
     }
 
     cuda_ret =
-        CuGraphicsResourceGetMappedPointer (&data->in_gl_resource->
-        cuda_plane_pointers[i], &data->in_gl_resource->cuda_num_bytes,
-        cuda_resource);
+        CuGraphicsResourceGetMappedPointer (&cuda_plane_pointer,
+        &cuda_num_bytes, cuda_resource);
+
     if (!gst_cuda_result (cuda_ret)) {
-      GST_ERROR_OBJECT (data->nvenc, "failed to get mapped pointer of map GL "
+      GST_ERROR_OBJECT (nvenc, "failed to get mapped pointer of map GL "
           "texture %u in cuda ret :%d", gl_mem->mem.tex_id, cuda_ret);
       g_assert_not_reached ();
     }
 
     src_stride = GST_VIDEO_INFO_PLANE_STRIDE (data->info, i);
-    dest_stride =
-        _get_cuda_device_stride (data->info, i,
-        data->in_gl_resource->cuda_stride);
+    dest_stride = _get_cuda_device_stride (&nvenc->input_info,
+        i, data->resource->cuda_stride);
 
     /* copy into scratch buffer */
     param.srcXInBytes = 0;
     param.srcY = 0;
     param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
-    param.srcDevice = data->in_gl_resource->cuda_plane_pointers[i];
+    param.srcDevice = cuda_plane_pointer;
     param.srcPitch = src_stride;
 
     param.dstXInBytes = 0;
     param.dstY = 0;
     param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-    param.dstDevice = (CUdeviceptr) data_pointer;
+    param.dstDevice = data_pointer;
     param.dstPitch = dest_stride;
     param.WidthInBytes = _get_plane_width (data->info, i);
     param.Height = _get_plane_height (data->info, i);
 
-    cuda_ret = CuMemcpy2DAsync (&param, data->nvenc->cuda_stream);
+    cuda_ret = CuMemcpy2DAsync (&param, nvenc->cuda_stream);
     if (!gst_cuda_result (cuda_ret)) {
       GST_ERROR_OBJECT (data->nvenc, "failed to copy GL texture %u into cuda "
           "ret :%d", gl_mem->mem.tex_id, cuda_ret);
       g_assert_not_reached ();
     }
 
-    gst_cuda_graphics_resource_unmap (resources[i], data->nvenc->cuda_stream);
+    gst_cuda_graphics_resource_unmap (resources[i], nvenc->cuda_stream);
 
-    data_pointer = data_pointer +
-        dest_stride * _get_plane_height (&data->nvenc->input_info, i);
+    data_pointer += dest_stride * _get_plane_height (&nvenc->input_info, i);
   }
-  gst_cuda_result (CuStreamSynchronize (data->nvenc->cuda_stream));
+  gst_cuda_result (CuStreamSynchronize (nvenc->cuda_stream));
   gst_cuda_context_pop (NULL);
 
   data->ret = TRUE;
 }
 #endif
 
-static GstFlowReturn
-_acquire_input_buffer (GstNvBaseEnc * nvenc, gpointer * input)
+static gboolean
+gst_nv_base_enc_upload_frame (GstNvBaseEnc * nvenc, GstVideoFrame * frame,
+    GstNvEncInputResource * resource)
 {
-  g_assert (input);
+  gint i;
+  CUdeviceptr dst = resource->cuda_pointer;
+  GstVideoInfo *info = &frame->info;
+  CUresult cuda_ret;
 
+  if (!gst_cuda_context_push (nvenc->cuda_ctx)) {
+    GST_ERROR_OBJECT (nvenc, "cannot push context");
+    return FALSE;
+  }
+
+  for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) {
+    CUDA_MEMCPY2D param = { 0, };
+    guint dest_stride = _get_cuda_device_stride (&nvenc->input_info, i,
+        resource->cuda_stride);
+
+    param.srcMemoryType = CU_MEMORYTYPE_HOST;
+    param.srcHost = GST_VIDEO_FRAME_PLANE_DATA (frame, i);
+    param.srcPitch = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i);
+
+    param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    param.dstDevice = dst;
+    param.dstPitch = dest_stride;
+    param.WidthInBytes = _get_plane_width (info, i);
+    param.Height = _get_plane_height (info, i);
+
+    cuda_ret = CuMemcpy2DAsync (&param, nvenc->cuda_stream);
+    if (!gst_cuda_result (cuda_ret)) {
+      GST_ERROR_OBJECT (nvenc, "cannot copy %dth plane, ret %d", i, cuda_ret);
+      gst_cuda_context_pop (NULL);
+
+      return FALSE;
+    }
+
+    dst += dest_stride * _get_plane_height (&nvenc->input_info, i);
+  }
+
+  gst_cuda_result (CuStreamSynchronize (nvenc->cuda_stream));
+  gst_cuda_context_pop (NULL);
+
+  return TRUE;
+}
+
+static GstFlowReturn
+_acquire_input_buffer (GstNvBaseEnc * nvenc, GstNvEncFrameState ** input)
+{
   GST_LOG_OBJECT (nvenc, "acquiring input buffer..");
   GST_VIDEO_ENCODER_STREAM_UNLOCK (nvenc);
-  *input = g_async_queue_pop (nvenc->in_bufs_pool);
+  *input = g_async_queue_pop (nvenc->available_queue);
   GST_VIDEO_ENCODER_STREAM_LOCK (nvenc);
 
   if (*input == SHUTDOWN_COOKIE)
@@ -1856,12 +1872,16 @@ _acquire_input_buffer (GstNvBaseEnc * nvenc, gpointer * input)
 
 static GstFlowReturn
 _submit_input_buffer (GstNvBaseEnc * nvenc, GstVideoCodecFrame * frame,
-    GstVideoFrame * vframe, void *inputBuffer, void *inputBufferPtr,
-    NV_ENC_BUFFER_FORMAT bufferFormat, void *outputBufferPtr)
+    GstVideoFrame * vframe, GstNvEncFrameState * state, void *inputBufferPtr,
+    NV_ENC_BUFFER_FORMAT bufferFormat)
 {
   GstNvBaseEncClass *nvenc_class = GST_NV_BASE_ENC_GET_CLASS (nvenc);
   NV_ENC_PIC_PARAMS pic_params = { 0, };
   NVENCSTATUS nv_ret;
+  gpointer inputBuffer, outputBufferPtr;
+
+  inputBuffer = state->in_buf;
+  outputBufferPtr = state->out_buf;
 
   GST_LOG_OBJECT (nvenc, "%u: input buffer %p, output buffer %p, "
       "pts %" GST_TIME_FORMAT, frame->system_frame_number, inputBuffer,
@@ -1899,24 +1919,62 @@ _submit_input_buffer (GstNvBaseEnc * nvenc, GstVideoCodecFrame * frame,
     return GST_FLOW_ERROR;
   }
 
+  if (!gst_cuda_context_push (nvenc->cuda_ctx)) {
+    GST_ELEMENT_ERROR (nvenc, LIBRARY, ENCODE, (NULL),
+        ("Failed to push current context"));
+    return GST_FLOW_ERROR;
+  }
+
   nv_ret = NvEncEncodePicture (nvenc->encoder, &pic_params);
+
+  gst_cuda_context_pop (NULL);
+
   if (nv_ret == NV_ENC_SUCCESS) {
     GST_LOG_OBJECT (nvenc, "Encoded picture");
   } else if (nv_ret == NV_ENC_ERR_NEED_MORE_INPUT) {
-    /* FIXME: we should probably queue pending output buffers here and only
-     * submit them to the async queue once we got sucess back */
     GST_DEBUG_OBJECT (nvenc, "Encoded picture (encoder needs more input)");
   } else {
     GST_ERROR_OBJECT (nvenc, "Failed to encode picture: %d", nv_ret);
-    GST_DEBUG_OBJECT (nvenc, "re-enqueueing input buffer %p", inputBuffer);
-    g_async_queue_push (nvenc->in_bufs_pool, inputBuffer);
-    GST_DEBUG_OBJECT (nvenc, "re-enqueueing output buffer %p", outputBufferPtr);
-    g_async_queue_push (nvenc->bitstream_pool, outputBufferPtr);
+    /* caller re-queues state to available_queue on any non-OK return */
 
     return GST_FLOW_ERROR;
   }
 
-  g_async_queue_push (nvenc->bitstream_queue, outputBufferPtr);
+  /* GstNvEncFrameState shouldn't be freed by DestroyNotify */
+  gst_video_codec_frame_set_user_data (frame, state, NULL);
+  g_async_queue_push (nvenc->pending_queue, state);
+
+  if (nv_ret == NV_ENC_SUCCESS) {
+    GstNvEncFrameState *pending_state;
+    gint len, i, end;
+
+    /* HACK: NvEncEncodePicture() returning NV_ENC_SUCCESS means that
+     * we can pop the encoded bitstream from the GPU
+     * (via NvEncLockBitstream, copy to memory, then NvEncUnlockBitstream).
+     * But if we try to pop every buffer from the GPU while rc-lookahead
+     * is enabled, NvEncLockBitstream randomly returns
+     * NV_ENC_ERR_INVALID_PARAM (seemingly dependent on how fast the
+     * encoding thread dequeued the encoded picture).
+     * So always keep rc-lookahead pictures queued in "pending_queue",
+     * so that the GPU is still able to reference the lookahead pictures.
+     *
+     * This behavior is not documented by Nvidia. The guess here is that
+     * the lookahead pictures are still used for rate-control by the Nvidia
+     * driver, and dequeuing a lookahead picture from the GPU seems to be
+     * causing the problem.
+     */
+    end = nvenc->rc_lookahead;
+
+    g_async_queue_lock (nvenc->pending_queue);
+
+    len = g_async_queue_length_unlocked (nvenc->pending_queue);
+    for (i = len; i > end; i--) {
+      pending_state = g_async_queue_pop_unlocked (nvenc->pending_queue);
+      g_async_queue_push (nvenc->bitstream_queue, pending_state);
+    }
+
+    g_async_queue_unlock (nvenc->pending_queue);
+  }
 
   return GST_FLOW_OK;
 }
@@ -1924,21 +1982,22 @@ _submit_input_buffer (GstNvBaseEnc * nvenc, GstVideoCodecFrame * frame,
 static GstFlowReturn
 gst_nv_base_enc_handle_frame (GstVideoEncoder * enc, GstVideoCodecFrame * frame)
 {
-  gpointer input_buffer = NULL;
   GstNvBaseEnc *nvenc = GST_NV_BASE_ENC (enc);
-  NV_ENC_OUTPUT_PTR out_buf;
   NVENCSTATUS nv_ret;
   GstVideoFrame vframe;
   GstVideoInfo *info = &nvenc->input_state->info;
   GstFlowReturn flow = GST_FLOW_OK;
   GstMapFlags in_map_flags = GST_MAP_READ;
   GstNvEncFrameState *state = NULL;
+  GstNvEncInputResource *resource = NULL;
 
   g_assert (nvenc->encoder != NULL);
 
   if (g_atomic_int_compare_and_exchange (&nvenc->reconfig, TRUE, FALSE)) {
-    if (!gst_nv_base_enc_set_format (enc, nvenc->input_state))
-      return GST_FLOW_ERROR;
+    if (!gst_nv_base_enc_set_format (enc, nvenc->input_state)) {
+      flow = GST_FLOW_NOT_NEGOTIATED;
+      goto drop;
+    }
 
     /* reconfigured encode session should start from keyframe */
     GST_VIDEO_CODEC_FRAME_SET_FORCE_KEYFRAME (frame);
@@ -1948,244 +2007,106 @@ gst_nv_base_enc_handle_frame (GstVideoEncoder * enc, GstVideoCodecFrame * frame)
     in_map_flags |= GST_MAP_GL;
 #endif
 
-  if (!gst_video_frame_map (&vframe, info, frame->input_buffer, in_map_flags))
-    return GST_FLOW_ERROR;
+  if (!gst_video_frame_map (&vframe, info, frame->input_buffer, in_map_flags)) {
+    goto drop;
+  }
 
   /* make sure our thread that waits for output to be ready is started */
   if (nvenc->bitstream_thread == NULL) {
-    if (!gst_nv_base_enc_start_bitstream_thread (nvenc))
-      goto error;
+    if (!gst_nv_base_enc_start_bitstream_thread (nvenc)) {
+      flow = GST_FLOW_ERROR;    /* unmap_and_drop unmaps vframe itself */
+      goto unmap_and_drop;
+    }
   }
 
-  flow = _acquire_input_buffer (nvenc, &input_buffer);
-  if (flow != GST_FLOW_OK)
-    goto out;
-  else if (input_buffer == SHUTDOWN_COOKIE)
-    goto out;
-  if (input_buffer == NULL)
-    goto error;
+  flow = _acquire_input_buffer (nvenc, &state);
+  if (flow != GST_FLOW_OK || state == SHUTDOWN_COOKIE || !state)
+    goto unmap_and_drop;
 
-  state = gst_video_codec_frame_get_user_data (frame);
-  if (!state)
-    state = g_new0 (GstNvEncFrameState, 1);
+  resource = state->in_buf;
 
 #if HAVE_NVCODEC_GST_GL
   if (nvenc->gl_input) {
-    GstNvEncGLResource *in_gl_resource = input_buffer;
+    GstGLMemory *gl_mem;
     GstNvEncGLMapData data;
 
-    GST_LOG_OBJECT (enc, "got input buffer %p", in_gl_resource);
-
-    in_gl_resource->gl_mem[0] =
-        (GstGLMemory *) gst_buffer_peek_memory (frame->input_buffer, 0);
-    g_assert (gst_is_gl_memory ((GstMemory *) in_gl_resource->gl_mem[0]));
+    gl_mem = (GstGLMemory *) gst_buffer_peek_memory (frame->input_buffer, 0);
+    g_assert (gst_is_gl_memory ((GstMemory *) gl_mem));
 
     data.nvenc = nvenc;
     data.buffer = frame->input_buffer;
     data.info = &vframe.info;
-    data.in_gl_resource = in_gl_resource;
+    data.resource = resource;
 
-    gst_gl_context_thread_add (in_gl_resource->gl_mem[0]->mem.context,
+    gst_gl_context_thread_add (gl_mem->mem.context,
         (GstGLContextThreadFunc) _map_gl_input_buffer, &data);
-
     if (!data.ret) {
-      GST_ERROR_OBJECT (nvenc, "Could not map input buffer");
-      goto error;
+      flow = GST_FLOW_ERROR;
+      goto unmap_and_drop;
     }
-
-    in_gl_resource->nv_mapped_resource.version =
-        gst_nvenc_get_map_input_resource_version ();
-    in_gl_resource->nv_mapped_resource.registeredResource =
-        in_gl_resource->nv_resource.registeredResource;
-
-    nv_ret =
-        NvEncMapInputResource (nvenc->encoder,
-        &in_gl_resource->nv_mapped_resource);
-    if (nv_ret != NV_ENC_SUCCESS) {
-      GST_ERROR_OBJECT (nvenc, "Failed to map input resource %p, ret %d",
-          in_gl_resource, nv_ret);
-      goto error;
-    }
-
-    in_gl_resource->mapped = TRUE;
-
-    out_buf = g_async_queue_try_pop (nvenc->bitstream_pool);
-    if (out_buf == NULL) {
-      GST_DEBUG_OBJECT (nvenc, "wait for output buf to become available again");
-      out_buf = g_async_queue_pop (nvenc->bitstream_pool);
-    }
-
-    state->in_buf = in_gl_resource;
-    state->out_buf = out_buf;
-
-    gst_video_codec_frame_set_user_data (frame, state, (GDestroyNotify) g_free);
-
-    flow =
-        _submit_input_buffer (nvenc, frame, &vframe, in_gl_resource,
-        in_gl_resource->nv_mapped_resource.mappedResource,
-        in_gl_resource->nv_mapped_resource.mappedBufferFmt, out_buf);
-
-    /* encoder will keep frame in list internally, we'll look it up again later
-     * in the thread where we get the output buffers and finish it there */
-    gst_video_codec_frame_unref (frame);
-    frame = NULL;
-  }
+  } else
 #endif
-
-  if (!nvenc->gl_input) {
-    NV_ENC_LOCK_INPUT_BUFFER in_buf_lock = { 0, };
-    NV_ENC_INPUT_PTR in_buf = input_buffer;
-    guint8 *src, *dest;
-    guint src_stride, dest_stride;
-    guint height, width;
-    guint y;
-
-    GST_LOG_OBJECT (enc, "got input buffer %p", in_buf);
-
-    in_buf_lock.version = gst_nvenc_get_lock_input_buffer_version ();
-    in_buf_lock.inputBuffer = in_buf;
-
-    nv_ret = NvEncLockInputBuffer (nvenc->encoder, &in_buf_lock);
-    if (nv_ret != NV_ENC_SUCCESS) {
-      GST_ERROR_OBJECT (nvenc, "Failed to lock input buffer: %d", nv_ret);
-      /* FIXME: post proper error message */
-      goto error;
-    }
-    GST_LOG_OBJECT (nvenc, "Locked input buffer %p", in_buf);
-
-    width = GST_VIDEO_FRAME_COMP_WIDTH (&vframe, 0) *
-        GST_VIDEO_FRAME_COMP_PSTRIDE (&vframe, 0);
-    height = GST_VIDEO_FRAME_HEIGHT (&vframe);
-
-    /* copy Y plane */
-    src = GST_VIDEO_FRAME_PLANE_DATA (&vframe, 0);
-    src_stride = GST_VIDEO_FRAME_PLANE_STRIDE (&vframe, 0);
-    dest = in_buf_lock.bufferDataPtr;
-    dest_stride = in_buf_lock.pitch;
-    for (y = 0; y < height; ++y) {
-      memcpy (dest, src, width);
-      dest += dest_stride;
-      src += src_stride;
-    }
-
-    if (GST_VIDEO_FRAME_FORMAT (&vframe) == GST_VIDEO_FORMAT_NV12 ||
-        GST_VIDEO_FRAME_FORMAT (&vframe) == GST_VIDEO_FORMAT_P010_10LE ||
-        GST_VIDEO_FRAME_FORMAT (&vframe) == GST_VIDEO_FORMAT_P010_10BE) {
-      /* copy UV plane */
-      src = GST_VIDEO_FRAME_PLANE_DATA (&vframe, 1);
-      src_stride = GST_VIDEO_FRAME_PLANE_STRIDE (&vframe, 1);
-      dest =
-          (guint8 *) in_buf_lock.bufferDataPtr +
-          GST_ROUND_UP_32 (height) * in_buf_lock.pitch;
-      dest_stride = in_buf_lock.pitch;
-      for (y = 0; y < GST_ROUND_UP_2 (height) / 2; ++y) {
-        memcpy (dest, src, width);
-        dest += dest_stride;
-        src += src_stride;
-      }
-    } else if (GST_VIDEO_FRAME_FORMAT (&vframe) == GST_VIDEO_FORMAT_I420 ||
-        GST_VIDEO_FRAME_FORMAT (&vframe) == GST_VIDEO_FORMAT_YV12) {
-      guint8 *dest_u, *dest_v;
-
-      dest_u = (guint8 *) in_buf_lock.bufferDataPtr +
-          GST_ROUND_UP_32 (height) * in_buf_lock.pitch;
-      dest_v = dest_u + ((GST_ROUND_UP_32 (height) / 2) *
-          (in_buf_lock.pitch / 2));
-      dest_stride = in_buf_lock.pitch / 2;
-
-      /* copy U plane */
-      src = GST_VIDEO_FRAME_PLANE_DATA (&vframe, 1);
-      src_stride = GST_VIDEO_FRAME_PLANE_STRIDE (&vframe, 1);
-      dest = dest_u;
-      for (y = 0; y < GST_ROUND_UP_2 (height) / 2; ++y) {
-        memcpy (dest, src, width / 2);
-        dest += dest_stride;
-        src += src_stride;
-      }
-
-      /* copy V plane */
-      src = GST_VIDEO_FRAME_PLANE_DATA (&vframe, 2);
-      src_stride = GST_VIDEO_FRAME_PLANE_STRIDE (&vframe, 2);
-      dest = dest_v;
-      for (y = 0; y < GST_ROUND_UP_2 (height) / 2; ++y) {
-        memcpy (dest, src, width / 2);
-        dest += dest_stride;
-        src += src_stride;
-      }
-    } else if (GST_VIDEO_FRAME_FORMAT (&vframe) == GST_VIDEO_FORMAT_Y444 ||
-        GST_VIDEO_FRAME_FORMAT (&vframe) == GST_VIDEO_FORMAT_Y444_16LE ||
-        GST_VIDEO_FRAME_FORMAT (&vframe) == GST_VIDEO_FORMAT_Y444_16BE) {
-      src = GST_VIDEO_FRAME_PLANE_DATA (&vframe, 1);
-      src_stride = GST_VIDEO_FRAME_PLANE_STRIDE (&vframe, 1);
-      dest = (guint8 *) in_buf_lock.bufferDataPtr +
-          GST_ROUND_UP_32 (height) * in_buf_lock.pitch;
-      dest_stride = in_buf_lock.pitch;
-
-      for (y = 0; y < height; ++y) {
-        memcpy (dest, src, width);
-        dest += dest_stride;
-        src += src_stride;
-      }
-
-      src = GST_VIDEO_FRAME_PLANE_DATA (&vframe, 2);
-      src_stride = GST_VIDEO_FRAME_PLANE_STRIDE (&vframe, 2);
-      dest = (guint8 *) in_buf_lock.bufferDataPtr +
-          2 * GST_ROUND_UP_32 (height) * in_buf_lock.pitch;
-
-      for (y = 0; y < height; ++y) {
-        memcpy (dest, src, width);
-        dest += dest_stride;
-        src += src_stride;
-      }
-    } else if (GST_VIDEO_INFO_IS_RGB (info)) {
-      /* nothing to do */
-    } else {
-      // FIXME: this only works for NV12 and I420
-      g_assert_not_reached ();
-    }
-
-    nv_ret = NvEncUnlockInputBuffer (nvenc->encoder, in_buf);
-    if (nv_ret != NV_ENC_SUCCESS) {
-      GST_ERROR_OBJECT (nvenc, "Failed to unlock input buffer: %d", nv_ret);
-      goto error;
-    }
-
-    out_buf = g_async_queue_try_pop (nvenc->bitstream_pool);
-    if (out_buf == NULL) {
-      GST_DEBUG_OBJECT (nvenc, "wait for output buf to become available again");
-      out_buf = g_async_queue_pop (nvenc->bitstream_pool);
-    }
-
-    state->in_buf = in_buf;
-    state->out_buf = out_buf;
-    gst_video_codec_frame_set_user_data (frame, state, (GDestroyNotify) g_free);
-
-    flow =
-        _submit_input_buffer (nvenc, frame, &vframe, in_buf, in_buf,
-        gst_nvenc_get_nv_buffer_format (GST_VIDEO_INFO_FORMAT (info)), out_buf);
-
-    /* encoder will keep frame in list internally, we'll look it up again later
-     * in the thread where we get the output buffers and finish it there */
-    gst_video_codec_frame_unref (frame);
-    frame = NULL;
+  if (!gst_nv_base_enc_upload_frame (nvenc, &vframe, resource)) {
+    flow = GST_FLOW_ERROR;
+    goto unmap_and_drop;
   }
 
-  if (flow != GST_FLOW_OK)
-    goto out;
+  resource->nv_mapped_resource.version =
+      gst_nvenc_get_map_input_resource_version ();
+  resource->nv_mapped_resource.registeredResource =
+      resource->nv_resource.registeredResource;
+
+  if (!gst_cuda_context_push (nvenc->cuda_ctx)) {
+    GST_ELEMENT_ERROR (nvenc, LIBRARY, ENCODE, (NULL),
+        ("Failed to push current context"));
+    flow = GST_FLOW_ERROR;
+    goto unmap_and_drop;
+  }
+
+  nv_ret =
+      NvEncMapInputResource (nvenc->encoder, &resource->nv_mapped_resource);
+  gst_cuda_context_pop (NULL);
+
+  if (nv_ret != NV_ENC_SUCCESS) {
+    GST_ERROR_OBJECT (nvenc, "Failed to map input resource %p, ret %d",
+        resource, nv_ret);
+    flow = GST_FLOW_ERROR;
+    goto unmap_and_drop;
+  }
+
+  resource->mapped = TRUE;
+
+  flow =
+      _submit_input_buffer (nvenc, frame, &vframe, state,
+      resource->nv_mapped_resource.mappedResource,
+      resource->nv_mapped_resource.mappedBufferFmt);
+
+  if (flow != GST_FLOW_OK) {
+    GST_DEBUG_OBJECT (nvenc, "return state to pool");
+    g_async_queue_push (nvenc->available_queue, state);
+    goto unmap_and_drop;
+  }
 
   flow = g_atomic_int_get (&nvenc->last_flow);
 
-out:
-
   gst_video_frame_unmap (&vframe);
+  /* encoder will keep frame in list internally, we'll look it up again later
+   * in the thread where we get the output buffers and finish it there */
+  gst_video_codec_frame_unref (frame);
 
   return flow;
 
-error:
-  flow = GST_FLOW_ERROR;
-  g_free (state);
-  g_free (input_buffer);
-  goto out;
+/* ERRORS */
+unmap_and_drop:
+  {
+    gst_video_frame_unmap (&vframe);
+    goto drop;
+  }
+drop:
+  {
+    gst_video_encoder_finish_frame (enc, frame);
+    return flow;
+  }
 }
 
 static gboolean
@@ -2193,6 +2114,7 @@ gst_nv_base_enc_drain_encoder (GstNvBaseEnc * nvenc)
 {
   NV_ENC_PIC_PARAMS pic_params = { 0, };
   NVENCSTATUS nv_ret;
+  gboolean ret = TRUE;
 
   GST_INFO_OBJECT (nvenc, "draining encoder");
 
@@ -2204,13 +2126,31 @@ gst_nv_base_enc_drain_encoder (GstNvBaseEnc * nvenc)
   pic_params.version = gst_nvenc_get_pic_params_version ();
   pic_params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
 
-  nv_ret = NvEncEncodePicture (nvenc->encoder, &pic_params);
-  if (nv_ret != NV_ENC_SUCCESS) {
-    GST_LOG_OBJECT (nvenc, "Failed to drain encoder, ret %d", nv_ret);
-    return FALSE;
+  if (!gst_cuda_context_push (nvenc->cuda_ctx)) {
+    GST_ERROR_OBJECT (nvenc, "Could not push context");
+    return FALSE;               /* gboolean: GST_FLOW_ERROR would be truthy */
   }
 
-  return TRUE;
+  nv_ret = NvEncEncodePicture (nvenc->encoder, &pic_params);
+
+  if (nv_ret != NV_ENC_SUCCESS) {
+    GST_LOG_OBJECT (nvenc, "Failed to drain encoder, ret %d", nv_ret);
+
+    ret = FALSE;
+  } else {
+    GstNvEncFrameState *pending_state;
+
+    g_async_queue_lock (nvenc->pending_queue);
+    while ((pending_state =
+            g_async_queue_try_pop_unlocked (nvenc->pending_queue))) {
+      g_async_queue_push (nvenc->bitstream_queue, pending_state);
+    }
+    g_async_queue_unlock (nvenc->pending_queue);
+  }
+
+  gst_cuda_context_pop (NULL);
+
+  return ret;
 }
 
 static GstFlowReturn
@@ -2218,9 +2158,6 @@ gst_nv_base_enc_finish (GstVideoEncoder * enc)
 {
   GstNvBaseEnc *nvenc = GST_NV_BASE_ENC (enc);
 
-  if (!gst_nv_base_enc_drain_encoder (nvenc))
-    return GST_FLOW_ERROR;
-
   gst_nv_base_enc_stop_bitstream_thread (nvenc, FALSE);
 
   return GST_FLOW_OK;
diff --git a/sys/nvcodec/gstnvbaseenc.h b/sys/nvcodec/gstnvbaseenc.h
index 4bd749f59a..07b5d1af1d 100644
--- a/sys/nvcodec/gstnvbaseenc.h
+++ b/sys/nvcodec/gstnvbaseenc.h
@@ -104,25 +104,30 @@ typedef struct {
   volatile gint       reconfig;                   /* ATOMIC */
   gboolean            gl_input;
 
-  /* allocated buffers */
-  gpointer          *input_bufs;   /* array of n_allocs input buffers  */
-  NV_ENC_OUTPUT_PTR *output_bufs;  /* array of n_allocs output buffers */
+  /* array of allocated input/output buffers (GstNvEncFrameState);
+   * this array holds the ownership of each GstNvEncFrameState. */
+  GArray            *items;
   guint              n_bufs;
 
-  /* input and output buffers currently available */
-  GAsyncQueue    *in_bufs_pool;
-  GAsyncQueue    *bitstream_pool;
+  /* (GstNvEncFrameState) available empty items which could be submitted
+   * to encoder */
+  GAsyncQueue       *available_queue;
 
-  /* output bufs in use (input bufs in use are tracked via the codec frames) */
-  GAsyncQueue    *bitstream_queue;
+  /* (GstNvEncFrameState) submitted to encoder but not ready to finish
+   * (due to bframe or lookahead operation) */
+  GAsyncQueue       *pending_queue;
+
+  /* (GstNvEncFrameState) submitted to encoder and ready to finish.
+   * finished items will go back to available item queue */
+  GAsyncQueue       *bitstream_queue;
 
   /* we spawn a thread that does the (blocking) waits for output buffers
    * to become available, so we can continue to feed data to the encoder
    * while we wait */
   GThread        *bitstream_thread;
 
-  void           *display;            /* GstGLDisplay */
-  void           *other_context;      /* GstGLContext */
+  GstObject      *display;            /* GstGLDisplay */
+  GstObject      *other_context;      /* GstGLContext */
 
   GstVideoInfo        input_info;     /* buffer configuration for buffers sent to NVENC */