diff --git a/sys/nvcodec/gstnvbaseenc.c b/sys/nvcodec/gstnvbaseenc.c
index e1a73d9bdd..15bd2653b3 100644
--- a/sys/nvcodec/gstnvbaseenc.c
+++ b/sys/nvcodec/gstnvbaseenc.c
@@ -292,6 +292,7 @@ gst_nv_base_enc_open (GstVideoEncoder * enc)
   GstNvBaseEnc *nvenc = GST_NV_BASE_ENC (enc);
   GstNvBaseEncClass *klass = GST_NV_BASE_ENC_GET_CLASS (enc);
   GValue *formats = NULL;
+  CUresult cuda_ret;
 
   if (!gst_cuda_ensure_element_context (GST_ELEMENT_CAST (enc),
           klass->cuda_device_id, &nvenc->cuda_ctx)) {
@@ -299,6 +300,16 @@ gst_nv_base_enc_open (GstVideoEncoder * enc)
     return FALSE;
   }
 
+  if (gst_cuda_context_push (nvenc->cuda_ctx)) {
+    cuda_ret = CuStreamCreate (&nvenc->cuda_stream, CU_STREAM_NON_BLOCKING);
+    if (!gst_cuda_result (cuda_ret)) {
+      GST_WARNING_OBJECT (nvenc,
+          "Could not create cuda stream, will use default stream");
+      nvenc->cuda_stream = NULL;
+    }
+    gst_cuda_context_pop (NULL);
+  }
+
   {
     NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS params = { 0, };
     NVENCSTATUS nv_ret;
@@ -640,14 +651,24 @@ static gboolean
 gst_nv_base_enc_close (GstVideoEncoder * enc)
 {
   GstNvBaseEnc *nvenc = GST_NV_BASE_ENC (enc);
+  gboolean ret = TRUE;
 
   if (nvenc->encoder) {
     if (NvEncDestroyEncoder (nvenc->encoder) != NV_ENC_SUCCESS)
-      return FALSE;
+      ret = FALSE;
+
     nvenc->encoder = NULL;
   }
 
+  if (nvenc->cuda_ctx && nvenc->cuda_stream) {
+    if (gst_cuda_context_push (nvenc->cuda_ctx)) {
+      gst_cuda_result (CuStreamDestroy (nvenc->cuda_stream));
+      gst_cuda_context_pop (NULL);
+    }
+  }
+
   gst_clear_object (&nvenc->cuda_ctx);
+  nvenc->cuda_stream = NULL;
 
   GST_OBJECT_LOCK (nvenc);
   if (nvenc->input_formats)
@@ -667,7 +688,7 @@ gst_nv_base_enc_close (GstVideoEncoder * enc)
     nvenc->bitstream_pool = NULL;
   }
 
-  return TRUE;
+  return ret;
 }
 
 static void
@@ -1537,7 +1558,8 @@ _map_gl_input_buffer (GstGLContext * context, struct map_gl_input *data)
     }
 
     cuda_ret =
-        CuGraphicsMapResources (1, &data->in_gl_resource->cuda_texture, 0);
+        CuGraphicsMapResources (1, &data->in_gl_resource->cuda_texture,
+        data->nvenc->cuda_stream);
     if (!gst_cuda_result (cuda_ret)) {
       GST_ERROR_OBJECT (data->nvenc, "failed to map GL texture %u into cuda "
           "ret :%d", gl_mem->mem.tex_id, cuda_ret);
@@ -1574,7 +1596,7 @@ _map_gl_input_buffer (GstGLContext * context, struct map_gl_input *data)
     param.WidthInBytes = _get_plane_width (data->info, i);
     param.Height = _get_plane_height (data->info, i);
 
-    cuda_ret = CuMemcpy2D (&param);
+    cuda_ret = CuMemcpy2DAsync (&param, data->nvenc->cuda_stream);
     if (!gst_cuda_result (cuda_ret)) {
       GST_ERROR_OBJECT (data->nvenc, "failed to copy GL texture %u into cuda "
           "ret :%d", gl_mem->mem.tex_id, cuda_ret);
@@ -1582,7 +1604,8 @@ _map_gl_input_buffer (GstGLContext * context, struct map_gl_input *data)
     }
 
     cuda_ret =
-        CuGraphicsUnmapResources (1, &data->in_gl_resource->cuda_texture, 0);
+        CuGraphicsUnmapResources (1, &data->in_gl_resource->cuda_texture,
+        data->nvenc->cuda_stream);
     if (!gst_cuda_result (cuda_ret)) {
       GST_ERROR_OBJECT (data->nvenc, "failed to unmap GL texture %u from cuda "
           "ret :%d", gl_mem->mem.tex_id, cuda_ret);
@@ -1600,6 +1623,7 @@ _map_gl_input_buffer (GstGLContext * context, struct map_gl_input *data)
     data_pointer = data_pointer +
         dest_stride * _get_plane_height (&data->nvenc->input_info, i);
   }
+  gst_cuda_result (CuStreamSynchronize (data->nvenc->cuda_stream));
   gst_cuda_context_pop (NULL);
 }
 #endif
diff --git a/sys/nvcodec/gstnvbaseenc.h b/sys/nvcodec/gstnvbaseenc.h
index 0cb3d87bf7..aa022871d4 100644
--- a/sys/nvcodec/gstnvbaseenc.h
+++ b/sys/nvcodec/gstnvbaseenc.h
@@ -72,6 +72,7 @@ typedef struct {
   gint            gop_size;
 
   GstCudaContext * cuda_ctx;
+  CUstream         cuda_stream;
   void          * encoder;
 
   /* the supported input formats */