Fork the Lifeline plugin to raise Lazarus

2025-04-14 06:54:08 +00:00 · 2024-07-12 13:46:52 -04:00 · 2024-07-12 13:46:52 -04:00 · 83286a1d9f
commit 83286a1d9f
parent 0ea63d824e
1 changed files with 189 additions and 0 deletions
--- a/lib/pleroma/oban/plugins/lazarus.ex
+++ b/lib/pleroma/oban/plugins/lazarus.ex
@ -0,0 +1,189 @@
+# Oban is originally Apache licensed which is where this came from
+# It needs to go into a separate repo and be its own project
+# and continue using the Apache license
+
+defmodule Pleroma.Oban.Plugins.Lazarus do
+  @moduledoc """
+  Naively transition jobs stuck `executing` back to `available`.
+
+  The `Lazarus` plugin periodically rescues orphaned jobs, i.e. jobs that are stuck in the
+  `executing` state because the node was shut down before the job could finish. Rescuing is
+  purely based on time, rather than any heuristic about the job's expected execution time or
+  whether the node is still alive.
+
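+  If an executing job has exhausted all attempts and max_attempts > 1, the Lazarus plugin
+  will mark it `discarded` rather than `available`. Jobs with `max_attempts: 1` that have used
+  their single attempt are instead resurrected: their attempt counter is reset to 0 and they
+  become `available` again.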
+
+  ## Using the Plugin
+
+  Rescue orphaned jobs that are still `executing` after the default of 60 minutes:
+
+      config :my_app, Oban,
+        plugins: [Pleroma.Oban.Plugins.Lazarus],
+        ...
+
+  Override the default period to rescue orphans after a more aggressive period of 5 minutes:
+
+      config :my_app, Oban,
+        plugins: [{Pleroma.Oban.Plugins.Lazarus, rescue_after: :timer.minutes(5)}],
+        ...
+
+  ## Options
+
+  * `:interval` — the number of milliseconds between rescue attempts. The default is `60_000ms`.
+
+  * `:rescue_after` — the maximum amount of time, in milliseconds, that a job may execute before
+  being rescued. 60 minutes by default, and rescuing is performed once a minute.
+
+  ## Instrumenting with Telemetry
+
+  The `Pleroma.Oban.Plugins.Lazarus` plugin adds the following metadata to the
+  `[:oban, :plugin, :stop]` event:
+
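+  * `:rescued_count` — the number of jobs transitioned back to `available`
+
+  * `:rescued_jobs` — a list of jobs transitioned back to `available`
+
+  * `:discarded_count` — the number of jobs transitioned to `discarded`
+
+  * `:discarded_jobs` — a list of jobs transitioned to `discarded`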
+
+  _Note: jobs only include `id`, `queue`, `state` fields._
+  """
+
+  @behaviour Oban.Plugin
+
+  use GenServer
+
+  import Ecto.Query, only: [select: 3, where: 3]
+
+  alias Oban.{Job, Peer, Plugin, Repo, Validation}
+  alias __MODULE__, as: State
+
+  @typedoc """
+  Options accepted by the plugin.
+
+  Both `:interval` and `:rescue_after` are expressed in milliseconds and must be positive
+  integers, which is what `validate/1` enforces. `:infinity` is not a valid interval because
+  the value is handed directly to `Process.send_after/3`.
+  """
+  @type option ::
+          Plugin.option()
+          | {:interval, pos_integer()}
+          | {:rescue_after, pos_integer()}
+
+  # Plugin state. `:conf` and `:timer` have no default (they start as `nil`); `:timer` holds
+  # the reference of the currently scheduled `:rescue` message. The two durations are in
+  # milliseconds.
+  defstruct [
+    :conf,
+    :timer,
+    interval: :timer.minutes(1),
+    rescue_after: :timer.minutes(60)
+  ]
+
+  @doc false
+  @spec child_spec(Keyword.t()) :: Supervisor.child_spec()
+  def child_spec(opts) do
+    # Defer to the overridable child spec this module inherits.
+    super(opts)
+  end
+
+  @impl Plugin
+  @spec start_link([option()]) :: GenServer.on_start()
+  def start_link(opts) do
+    # `:name` is only used to register the process; every remaining option must be a field
+    # of the state struct, otherwise `struct!/2` raises.
+    {server_name, state_opts} = Keyword.pop(opts, :name)
+    initial_state = struct!(State, state_opts)
+
+    GenServer.start_link(__MODULE__, initial_state, name: server_name)
+  end
+
+  @impl Plugin
+  def validate(opts) do
+    # Accepted options and the kind of value each one must hold.
+    schema = [
+      conf: :any,
+      name: :any,
+      interval: :pos_integer,
+      rescue_after: :pos_integer
+    ]
+
+    Validation.validate_schema(opts, schema)
+  end
+
+  @impl GenServer
+  def init(state) do
+    # Announce the plugin before the first rescue is scheduled.
+    init_meta = %{conf: state.conf, plugin: __MODULE__}
+    :telemetry.execute([:oban, :plugin, :init], %{}, init_meta)
+
+    scheduled_state = schedule_rescue(state)
+    {:ok, scheduled_state}
+  end
+
+  @impl GenServer
+  def terminate(_reason, %State{timer: timer}) when is_reference(timer) do
+    # A rescue is still pending; cancel it so no `:rescue` message is sent after shutdown.
+    Process.cancel_timer(timer)
+
+    :ok
+  end
+
+  # No timer was ever scheduled, so there is nothing to clean up.
+  def terminate(_reason, %State{}), do: :ok
+
+  @impl GenServer
+  def handle_info(:rescue, %State{} = state) do
+    base_meta = %{conf: state.conf, plugin: __MODULE__}
+
+    # Wrap the whole rescue pass in a telemetry span; the metadata returned from the
+    # function is attached to the span's stop event.
+    :telemetry.span([:oban, :plugin], base_meta, fn ->
+      state
+      |> check_leadership_and_rescue_jobs()
+      |> span_result(base_meta)
+    end)
+
+    # Always queue the next pass, whatever this one produced.
+    {:noreply, schedule_rescue(state)}
+  end
+
+  # Converts the outcome of a rescue pass into the `{result, metadata}` pair expected by
+  # `:telemetry.span/3`. Anything other than `{:ok, map}` is reported under `:error`.
+  defp span_result({:ok, extra}, meta) when is_map(extra), do: {:ok, Map.merge(meta, extra)}
+  defp span_result(error, meta), do: {:error, Map.put(meta, :error, error)}
+
+  # Scheduling
+
+  # Sends `:rescue` to this process after `state.interval` milliseconds and stores the
+  # resulting timer reference in the state.
+  defp schedule_rescue(state) do
+    %{state | timer: Process.send_after(self(), :rescue, state.interval)}
+  end
+
+  # Rescuing
+
+  # Runs a rescue pass inside a transaction, but only when this node is the leader.
+  # Non-leaders report success with no extra metadata.
+  defp check_leadership_and_rescue_jobs(state) do
+    if Peer.leader?(state.conf) do
+      Repo.transaction(state.conf, fn -> rescue_orphaned_jobs(state) end)
+    else
+      {:ok, %{}}
+    end
+  end
+
+  # Transitions every job that has been `executing` for longer than `rescue_after` and
+  # returns the counts and job maps for both outcomes.
+  defp rescue_orphaned_jobs(state) do
+    cutoff = DateTime.add(DateTime.utc_now(), -state.rescue_after, :millisecond)
+    orphans = where(Job, [j], j.state == "executing" and j.attempted_at < ^cutoff)
+
+    # Order matters: jobs moved to `available` first no longer match the `executing` filter,
+    # so the discard step only sees what is left over.
+    {rescued_count, rescued_jobs} = transition_available(orphans, state)
+    {discarded_count, discarded_jobs} = transition_discarded(orphans, state)
+
+    %{
+      rescued_count: rescued_count,
+      rescued_jobs: rescued_jobs,
+      discarded_count: discarded_count,
+      discarded_jobs: discarded_jobs
+    }
+  end
+
+  # Puts orphaned jobs back into `available`, in two passes:
+  #
+  #   * jobs with `max_attempts: 1` that already used their single attempt are raised from
+  #     the dead with the attempt counter reset to 0, so they are tried again until they
+  #     give us a clear error
+  #   * jobs that still have attempts left are simply made `available` again
+  #
+  # Jobs that have tried multiple times and exhausted their attempts are left alone here so
+  # they can be discarded. Returns `{count, jobs}` covering both passes.
+  defp transition_available(base, state) do
+    single_attempt =
+      base
+      |> where([j], j.max_attempts == 1 and j.attempt == j.max_attempts)
+      |> select([j], map(j, [:id, :queue, :state]))
+
+    attempts_remaining =
+      base
+      |> where([j], j.attempt < j.max_attempts)
+      |> select([j], map(j, [:id, :queue, :state]))
+
+    {resurrected_count, resurrected_jobs} =
+      Repo.update_all(state.conf, single_attempt, set: [state: "available", attempt: 0])
+
+    {retried_count, retried_jobs} =
+      Repo.update_all(state.conf, attempts_remaining, set: [state: "available"])
+
+    {resurrected_count + retried_count, resurrected_jobs ++ retried_jobs}
+  end
+
+  # Marks orphaned jobs that have no attempts left as `discarded`, stamping `discarded_at`
+  # with the current UTC time. Returns the `{count, jobs}` result of the update.
+  defp transition_discarded(base, state) do
+    exhausted =
+      base
+      |> where([j], j.attempt >= j.max_attempts)
+      |> select([j], map(j, [:id, :queue, :state]))
+
+    changes = [state: "discarded", discarded_at: DateTime.utc_now()]
+
+    Repo.update_all(state.conf, exhausted, set: changes)
+  end
+end