# takahe/stator/runner.py

import asyncio
import datetime
import os
import signal
import time
import traceback
import uuid
2023-05-03 16:39:00 +00:00
from collections.abc import Callable
2023-02-04 18:46:38 +00:00
from asgiref.sync import async_to_sync, sync_to_async
2022-12-20 08:02:35 +00:00
from django.conf import settings
from django.utils import timezone
from core import exceptions, sentry
from core.models import Config
from stator.models import StatorModel, Stats
class LoopingTask:
    """
    Keeps a coroutine function running in the background while guaranteeing
    that no more than one copy of it is in flight at any moment.
    """

    def __init__(self, callable: Callable):
        # Zero-argument callable returning a coroutine; invoked on each launch
        self.callable = callable
        # The most recently launched task, or None if nothing has run yet
        self.task: asyncio.Task | None = None

    def run(self) -> bool:
        """
        Launches the coroutine as a new asyncio task unless the previous one
        is still running.

        Returns True if a new task was started, False if the earlier task
        has not finished yet.
        """
        still_running = self.task is not None and not self.task.done()
        if still_running:
            return False
        # Forget any finished task before replacing it with a fresh one
        self.task = None
        self.task = asyncio.create_task(self.callable())
        return True
class StatorRunner:
    """
    Runs tasks on models that are looking for state changes.
    Designed to run either indefinitely, or just for a few seconds.
    """

    def __init__(
        self,
        models: list[type[StatorModel]],
        concurrency: int = getattr(settings, "STATOR_CONCURRENCY", 50),
        concurrency_per_model: int = getattr(
            settings, "STATOR_CONCURRENCY_PER_MODEL", 15
        ),
        liveness_file: str | None = None,
        schedule_interval: int = 30,
        lock_expiry: int = 300,
        run_for: int = 0,
    ):
        """
        Stores the runner configuration and installs the SIGALRM watchdog
        handler.

        models: the model classes whose transitions this runner drives.
        concurrency: upper bound on the number of entries in self.tasks.
        concurrency_per_model: most instances requested from a single model
            in one fetch pass.
        liveness_file: optional path that receives the current Unix
            timestamp every time a scheduling pass is triggered.
        schedule_interval: seconds between scheduling passes; the watchdog
            alarm is armed for twice this value.
        lock_expiry: seconds from "now" used as the lock expiry handed to
            atransition_get_with_lock.
        run_for: if non-zero, the main loop stops once this many seconds
            have elapsed since run() started.
        """
        self.models = models
        self.runner_id = uuid.uuid4().hex
        self.concurrency = concurrency
        self.concurrency_per_model = concurrency_per_model
        self.liveness_file = liveness_file
        self.schedule_interval = schedule_interval
        self.lock_expiry = lock_expiry
        self.run_for = run_for
        self.minimum_loop_delay = 0.5
        self.maximum_loop_delay = 5
        # Per-run bookkeeping. run() resets these, but they are created here
        # too so that entrypoints which bypass run() (run_single_cycle,
        # fetch_and_process_tasks) work on a freshly constructed runner
        # instead of raising AttributeError.
        self.handled: dict[str, int] = {}
        self.tasks: list[asyncio.Task] = []
        # Set up SIGALRM handler
        signal.signal(signal.SIGALRM, self.alarm_handler)

    async def run(self):
        """
        Main loop: periodically triggers scheduling, keeps fetching and
        launching transition tasks, and backs off the loop delay while idle.

        Returns the dict of handled counts (model label -> number) once the
        loop has ended and all outstanding tasks have completed.
        """
        sentry.set_takahe_app("stator")
        self.handled = {}
        self.started = time.monotonic()
        # Backdate the last clean so the first iteration schedules immediately
        self.last_clean = time.monotonic() - self.schedule_interval
        self.tasks = []
        self.loop_delay = self.minimum_loop_delay
        self.schedule_task = LoopingTask(self.run_scheduling)
        self.fetch_task = LoopingTask(self.fetch_and_process_tasks)
        self.config_task = LoopingTask(self.load_config)
        # For the first time period, launch tasks
        print("Running main task loop")
        try:
            with sentry.configure_scope() as scope:
                while True:
                    # Do we need to do cleaning?
                    if (time.monotonic() - self.last_clean) >= self.schedule_interval:
                        # Set up the watchdog timer (each time we do this the
                        # previous one is cancelled)
                        signal.alarm(self.schedule_interval * 2)
                        # Refresh the config
                        self.config_task.run()
                        if self.schedule_task.run():
                            print("Running cleaning and scheduling")
                        else:
                            print("Previous scheduling still running...!")
                        # Write liveness file if configured
                        if self.liveness_file:
                            with open(self.liveness_file, "w") as fh:
                                fh.write(str(int(time.time())))
                        self.last_clean = time.monotonic()

                    # Clear the cleaning breadcrumbs/extra for the main part of the loop
                    sentry.scope_clear(scope)

                    self.remove_completed_tasks()

                    # Fetching is kind of blocking, so we need to do this
                    # as a separate coroutine
                    self.fetch_task.run()

                    # Are we in limited run mode?
                    if (
                        self.run_for
                        and (time.monotonic() - self.started) > self.run_for
                    ):
                        break

                    # Prevent busylooping, but also back off delay if we have
                    # no tasks
                    if self.tasks:
                        self.loop_delay = self.minimum_loop_delay
                    else:
                        self.loop_delay = min(
                            self.loop_delay * 1.5,
                            self.maximum_loop_delay,
                        )
                    await asyncio.sleep(self.loop_delay)

                    # Clear the Sentry breadcrumbs and extra for next loop
                    sentry.scope_clear(scope)
        except KeyboardInterrupt:
            pass

        # Wait for tasks to finish
        print("Waiting for tasks to complete")
        while True:
            self.remove_completed_tasks()
            if not self.tasks:
                break
            # Prevent busylooping
            await asyncio.sleep(0.5)

        # Disarm the watchdog now that we are shutting down cleanly; otherwise
        # a pending alarm would still fire after we return and kill the
        # process via alarm_handler.
        signal.alarm(0)
        print("Complete")
        return self.handled

    def alarm_handler(self, signum, frame):
        """
        Called when SIGALRM fires, which means we missed a schedule loop.
        Just exit as we're likely deadlocked.
        """
        print("Watchdog timeout exceeded")
        # os._exit terminates immediately with status 2
        os._exit(2)

    async def load_config(self):
        """
        Refreshes config from the DB
        """
        Config.system = await Config.aload_system()

    async def run_scheduling(self):
        """
        Do any transition cleanup tasks

        Prints the handled counts gathered so far, then for every model
        submits stats and runs its lock-cleaning, schedule-due and
        delete-due transitions, in that order.
        """
        if self.handled:
            print("Tasks processed since last flush:")
            for label, number in self.handled.items():
                print(f"  {label}: {number}")
        else:
            print("No tasks handled since last flush.")
        with sentry.start_transaction(op="task", name="stator.run_scheduling"):
            for model in self.models:
                await self.submit_stats(model)
                await model.atransition_clean_locks()
                await model.atransition_schedule_due()
                await model.atransition_delete_due()

    async def submit_stats(self, model):
        """
        Pop some statistics into the database

        Moves this model's handled count (if any) out of self.handled and
        into its Stats row, records the current ready count, then saves.
        """
        stats_instance = await Stats.aget_for_model(model)
        if stats_instance.model_label in self.handled:
            stats_instance.add_handled(self.handled[stats_instance.model_label])
            # The count has been flushed, so start again from zero
            del self.handled[stats_instance.model_label]
        stats_instance.set_queued(await model.atransition_ready_count())
        stats_instance.trim_data()
        await sync_to_async(stats_instance.save)()

    async def fetch_and_process_tasks(self):
        """
        Fetches lockable instances from each model, up to the remaining
        concurrency budget, and launches a run_transition task for each.
        """
        # Calculate space left for tasks
        space_remaining = self.concurrency - len(self.tasks)
        # Fetch new tasks
        for model in self.models:
            if space_remaining > 0:
                for instance in await model.atransition_get_with_lock(
                    number=min(space_remaining, self.concurrency_per_model),
                    lock_expiry=(
                        timezone.now() + datetime.timedelta(seconds=self.lock_expiry)
                    ),
                ):
                    self.tasks.append(
                        asyncio.create_task(self.run_transition(instance))
                    )
                    self.handled[model._meta.label_lower] = (
                        self.handled.get(model._meta.label_lower, 0) + 1
                    )
                    space_remaining -= 1

    async def run_transition(self, instance: StatorModel):
        """
        Wrapper for atransition_attempt with fallback error handling
        """
        # The doubled braces are an f-string escape, so the name contains the
        # literal text "{id}" rather than the instance pk.
        # NOTE(review): presumably deliberate so that transaction names do not
        # vary per instance - confirm before changing.
        task_name = f"stator.run_transition:{instance._meta.label_lower}#{{id}} from {instance.state}"
        with sentry.start_transaction(op="task", name=task_name):
            sentry.set_context(
                "instance",
                {
                    "model": instance._meta.label_lower,
                    "pk": instance.pk,
                    "state": instance.state,
                    "state_age": instance.state_age,
                },
            )
            try:
                print(
                    f"Attempting transition on {instance._meta.label_lower}#{instance.pk} from state {instance.state}"
                )
                await instance.atransition_attempt()
            # NOTE(review): BaseException also covers asyncio.CancelledError
            # and KeyboardInterrupt, so those are captured and swallowed here
            # as well - confirm this is intended.
            except BaseException as e:
                await exceptions.acapture_exception(e)
                traceback.print_exc()

    def remove_completed_tasks(self):
        """
        Removes all completed asyncio.Tasks from our local in-progress list
        """
        self.tasks = [t for t in self.tasks if not t.done()]

    async def run_single_cycle(self):
        """
        Testing entrypoint to advance things just one cycle, and allow errors
        to propagate out.

        The fetch step is limited to one second; every task in self.tasks is
        then awaited. Exceptions raised inside a transition are captured by
        run_transition and so do not propagate from here.
        """
        await asyncio.wait_for(self.fetch_and_process_tasks(), timeout=1)
        for task in self.tasks:
            await task

    # Synchronous wrapper around run_single_cycle for non-async callers
    run_single_cycle_sync = async_to_sync(run_single_cycle)