Use BinaryHeap for more efficient retry selection

2024-09-20 18:39:59 +00:00 · 2023-07-20 18:52:41 +09:30 · 2023-07-20 18:52:41 +09:30 · b2e45f8287
commit b2e45f8287
parent ea2f6b4f69
4 changed files with 124 additions and 71 deletions
--- a/src/activity_queue/mod.rs
+++ b/src/activity_queue/mod.rs
@ -2,7 +2,7 @@
 //!
 #![doc = include_str!("../../docs/09_sending_activities.md")]

-use self::{request::sign_and_send, retry_queue::RetryQueue};
+use self::{queue::ActivityQueue, request::sign_and_send};
 use crate::{
    config::Data,
    traits::{ActivityHandler, Actor},
@ -22,16 +22,15 @@ use std::{
 use tracing::{debug, info, warn};
 use url::Url;

+pub(crate) mod queue;
 pub(crate) mod request;
-pub(crate) mod retry_queue;
 pub(super) mod retry_worker;
 pub(super) mod util;

 /// Send a new activity to the given inboxes
 ///
 /// - `activity`: The activity to be sent, gets converted to json
-/// - `private_key`: Private key belonging to the actor who sends the activity, for signing HTTP
-///                  signature. Generated with [crate::http_signatures::generate_actor_keypair].
+/// - `actor`: The actor doing the sending
 /// - `inboxes`: List of remote actor inboxes that should receive the activity. Ignores local actor
 ///              inboxes. Should be built by calling [crate::traits::Actor::shared_inbox_or_inbox]
 ///              for each target actor.
@ -96,6 +95,17 @@ pub struct RawActivity {
    private_key: PKey<Private>,
 }

+impl PartialEq for RawActivity {
+    fn eq(&self, other: &Self) -> bool {
+        self.actor_id == other.actor_id
+            && self.activity_id == other.activity_id
+            && self.activity == other.activity
+            && self.inbox == other.inbox
+    }
+}
+
+impl Eq for RawActivity {}
+
 impl RawActivity {
    /// Sends a raw activity directly, rather than using the background queue.
    /// This will sign and send the request using the configured [`client`](crate::config::FederationConfigBuilder::client) in the federation config
@ -187,8 +197,8 @@ pub(crate) fn create_activity_queue(
    disable_retry: bool,
    request_timeout: Duration,
    http_signature_compat: bool,
-) -> RetryQueue {
-    RetryQueue::new(
+) -> ActivityQueue {
+    ActivityQueue::new(
        client,
        worker_count,
        retry_count,
@ -264,7 +274,7 @@ mod tests {
            .init();
        */

-        let activity_queue = RetryQueue::new(
+        let activity_queue = ActivityQueue::new(
            reqwest::Client::default().into(),
            num_workers,
            num_workers,
--- a/src/activity_queue/retry_queue.rs
+++ b/src/activity_queue/retry_queue.rs
@ -15,7 +15,7 @@ use tokio::{sync::mpsc::UnboundedSender, task::JoinHandle};

 /// A simple activity queue which spawns tokio workers to send out requests
 /// Uses an unbounded mpsc queue for communication (i.e, all messages are in memory)
-pub(crate) struct RetryQueue {
+pub(crate) struct ActivityQueue {
    // Stats shared between the queue and workers
    stats: Arc<Stats>,
    sender: UnboundedSender<RetryRawActivity>,
@ -48,7 +48,7 @@ impl Debug for Stats {
    }
 }

-impl RetryQueue {
+impl ActivityQueue {
    pub fn new(
        client: ClientWithMiddleware,
        worker_count: usize,
--- a/src/activity_queue/retry_worker.rs
+++ b/src/activity_queue/retry_worker.rs
@ -1,23 +1,18 @@
-use super::{request::sign_and_send, retry_queue::Stats, util::RetryStrategy, RawActivity};
+use super::{queue::Stats, request::sign_and_send, util::RetryStrategy, RawActivity};
 use futures_core::Future;
 use futures_util::FutureExt;
 use reqwest_middleware::ClientWithMiddleware;
 use std::{
+    collections::{BTreeMap, BinaryHeap},
    sync::{atomic::Ordering, Arc},
    time::{Duration, Instant},
 };
 use tokio::{
-    sync::mpsc::{
-        error::TryRecvError,
-        unbounded_channel,
-        UnboundedReceiver,
-        UnboundedSender,
-        WeakUnboundedSender,
-    },
+    sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender, WeakUnboundedSender},
    task::{JoinHandle, JoinSet},
    time::MissedTickBehavior,
 };
-use tracing::error;
+use tracing::{error, info};

 /// A tokio spawned worker which is responsible for submitting requests to federated servers
 /// This will retry up to one time with the same signature, and if it fails, will move it to the retry queue.
@ -37,7 +32,7 @@ pub(super) struct RetryWorker {
 }

 /// A message that has tried to be sent but has not been able to be sent
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub(super) struct RetryRawActivity {
    /// The message that is sent
    pub message: RawActivity,
@ -47,6 +42,20 @@ pub(super) struct RetryRawActivity {
    pub count: usize,
 }

+// We reverse the order here as we want the "highest" to be the earliest, not latest
+// So that we can retry the oldest sent first
+impl Ord for RetryRawActivity {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.last_sent.cmp(&other.last_sent).reverse()
+    }
+}
+
+impl PartialOrd for RetryRawActivity {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
 impl RetryWorker {
    /// Spawns a background task for managing the queue of retryables
    pub fn spawn(
@ -60,7 +69,7 @@ impl RetryWorker {
    ) -> (UnboundedSender<RetryRawActivity>, JoinHandle<()>) {
        // The main sender channel, gets called immediately when something is queued
        let (sender, receiver) = unbounded_channel::<RetryRawActivity>();
-        // The batch sender channel, waits up to an hour before checking if anything needs to be sent
+        // The batch sender channel, checks every hour if anything needs to be sent
        let (batch_sender, batch_receiver) = unbounded_channel::<RetryRawActivity>();
        // The retry sender channel, is called by the batch
        let (retry_sender, retry_receiver) = unbounded_channel::<RetryRawActivity>();
@ -69,13 +78,11 @@ impl RetryWorker {
            client,
            timeout,
            stats,
-            batch_sender: batch_sender.clone().downgrade(),
+            batch_sender: batch_sender.downgrade(),
            backoff,
            http_signature_compat,
        });

-        let loop_batch_sender = batch_sender.clone().downgrade();
-
        let retry_task = tokio::spawn(async move {
            // This is the main worker queue, tasks sent here are sent immediately
            let main_worker = worker.clone();
@ -90,12 +97,7 @@ impl RetryWorker {
            if let Some(retry_count) = retry_count {
                // This task checks every hour anything that needs to be sent, based upon the last sent time
                // If any tasks need to be sent, they are then sent to the retry queue
-                let batch_loop = retry_loop(
-                    backoff.pow(2),
-                    batch_receiver,
-                    loop_batch_sender,
-                    retry_sender,
-                );
+                let batch_loop = retry_loop(backoff.pow(2), batch_receiver, retry_sender);

                let retry_queue = receiver_queue(retry_count, retry_receiver, move |message| {
                    let worker = worker.clone();
@ -179,60 +181,101 @@ impl RetryWorker {
    }
 }

+/// Ordered list of raw activities based upon retry count
+///
+/// Uses separate binary heaps per count to keep things in order
+///
+/// When flushed it will go through each queue and check to see if there are any retries ready to be sent
+///
+/// If enought time has elapsed it'll send them with the sender, otherwise they'll stay in the queue
+struct RetryQueue {
+    /// Queue per retry count for ordering
+    queues: BTreeMap<usize, BinaryHeap<RetryRawActivity>>,
+    sender: UnboundedSender<RetryRawActivity>,
+    sleep_interval: usize,
+}
+
+impl RetryQueue {
+    /// Push a raw activity onto the queue
+    fn push(&mut self, retry: RetryRawActivity) {
+        let queue = self.queues.entry(retry.count).or_default();
+        queue.push(retry);
+    }
+
+    /// Flush out & send any retries that need to be retried
+    fn flush(&mut self) {
+        let mut count = 0;
+        let mut total = 0;
+
+        // We check each queue separately
+        for (retry_count, queue) in self.queues.iter_mut() {
+            // We check the duration based on the retry count using an exponential backoff, i.e, 60s, 60m, 60h
+            let sleep_duration =
+                Duration::from_secs(self.sleep_interval.pow(*retry_count as u32) as u64);
+
+            total += queue.len();
+
+            'queue: loop {
+                match queue.pop() {
+                    Some(retry) => {
+                        // If the elapsed time is long enough we send it
+                        if retry.last_sent.elapsed() > sleep_duration {
+                            if let Err(err) = self.sender.send(retry) {
+                                error!("Error sending retry: {err}");
+                            }
+                            count += 1;
+                        // If it's too young, then we exit the loop
+                        // No more entries after this will be old enough in the binary heap
+                        } else {
+                            queue.push(retry);
+                            break 'queue;
+                        }
+                    }
+                    None => break 'queue,
+                }
+            }
+        }
+
+        if total > 0 {
+            info!("Scheduled {count}/{total} activities for retry");
+        }
+    }
+}
+
 /// This is a retry loop that will simply send tasks in batches
 /// It will check an incoming queue, and schedule any tasks that need to be sent
 /// The current sleep interval here is 1 hour
 async fn retry_loop(
    sleep_interval: usize,
    mut batch_receiver: UnboundedReceiver<RetryRawActivity>,
-    batch_sender: WeakUnboundedSender<RetryRawActivity>,
    retry_sender: UnboundedSender<RetryRawActivity>,
 ) {
    let mut interval = tokio::time::interval(Duration::from_secs((sleep_interval) as u64));
    interval.set_missed_tick_behavior(MissedTickBehavior::Delay);

+    let mut inner = RetryQueue {
+        queues: Default::default(),
+        sender: retry_sender,
+        sleep_interval,
+    };
+
    loop {
-        interval.tick().await;
-
-        // We requeue any messages to be checked next time if they haven't slept long enough yet
-        let mut requeue_messages = Vec::new();
-
-        // Grab all the activities that are in the queue
-        loop {
-            // try_recv will not await anything
-            match batch_receiver.try_recv() {
-                Ok(message) => {
-                    let sleep_duration = Duration::from_secs(
-                        sleep_interval.pow(message.count as u32) as u64,
-                        // Take off 1 second for tests to pass
-                    ) - Duration::from_secs(1);
-
-                    // If the time between now and sending this message is greater than our sleep duration
-                    if message.last_sent.elapsed() > sleep_duration {
-                        if let Err(err) = retry_sender.send(message) {
-                            error!("Couldn't wake up task for sending: {err}");
-                        }
-                    } else {
-                        // If we haven't slept long enough, then we just add it to the end of the queue
-                        requeue_messages.push(message);
+        tokio::select! {
+            message = batch_receiver.recv() => {
+                match message {
+                    // We have a new message, add it to our queue
+                    Some(retry) => {
+                        inner.push(retry);
+                    },
+                    // The receiver has dropped, so flush out everything and then exit the loop
+                    None => {
+                        inner.flush();
+                        break;
                    }
                }
-                Err(TryRecvError::Empty) => {
-                    // no more to be had, break and wait for the next interval
-                    break;
-                }
-                Err(TryRecvError::Disconnected) => {
-                    return;
-                }
            }
-        }
-
-        // If there are any messages that need to be retried later on
-        if let Some(ref sender) = batch_sender.upgrade() {
-            for message in requeue_messages {
-                if let Err(err) = sender.send(message) {
-                    error!("Couldn't wake up task for sending: {err}");
-                }
+            _ = interval.tick() => {
+                inner.flush();
            }
        }
    }
--- a/src/config.rs
+++ b/src/config.rs
@ -16,7 +16,7 @@
 //! ```

 use crate::{
-    activity_queue::{create_activity_queue, retry_queue::RetryQueue},
+    activity_queue::{create_activity_queue, queue::ActivityQueue},
    error::Error,
    protocol::verification::verify_domains_match,
    traits::{ActivityHandler, Actor},
@ -98,7 +98,7 @@ pub struct FederationConfig<T: Clone> {
    /// Queue for sending outgoing activities. Only optional to make builder work, its always
    /// present once constructed.
    #[builder(setter(skip))]
-    pub(crate) activity_queue: Option<Arc<RetryQueue>>,
+    pub(crate) activity_queue: Option<Arc<ActivityQueue>>,
 }

 impl<T: Clone> FederationConfig<T> {
@ -199,7 +199,7 @@ impl<T: Clone> FederationConfig<T> {
            .take()
            .context("ActivityQueue never constructed, build() not called?")?;
        // Todo: use Arc::into_inner but is only part of rust 1.70.
-        let stats = Arc::<RetryQueue>::try_unwrap(q)
+        let stats = Arc::<ActivityQueue>::try_unwrap(q)
            .map_err(|_| {
                anyhow::anyhow!(
                    "Could not cleanly shut down: activityqueue arc was still in use elsewhere "