From c8501b1768752f8ebd03182232171c9380b75cdf Mon Sep 17 00:00:00 2001 From: lonelyhentxi Date: Fri, 27 Jun 2025 02:18:23 +0800 Subject: [PATCH 1/3] fix: remove inconsistent cleanup function --- .../migrations/m20250629_065628_add_cron.rs | 42 ++----------------- apps/recorder/src/models/cron/core.rs | 2 - apps/recorder/src/models/cron/mod.rs | 18 +------- apps/recorder/src/task/service.rs | 3 -- 4 files changed, 6 insertions(+), 59 deletions(-) diff --git a/apps/recorder/src/migrations/m20250629_065628_add_cron.rs b/apps/recorder/src/migrations/m20250629_065628_add_cron.rs index f725077..7dee4f8 100644 --- a/apps/recorder/src/migrations/m20250629_065628_add_cron.rs +++ b/apps/recorder/src/migrations/m20250629_065628_add_cron.rs @@ -1,4 +1,5 @@ use async_trait::async_trait; +use sea_orm::ActiveEnum; use sea_orm_migration::{prelude::*, schema::*}; use crate::{ @@ -6,7 +7,6 @@ use crate::{ Cron, CustomSchemaManagerExt, GeneralIds, Subscribers, Subscriptions, table_auto_z, }, models::cron::{ - CHECK_AND_CLEANUP_EXPIRED_CRON_LOCKS_FUNCTION_NAME, CHECK_AND_TRIGGER_DUE_CRONS_FUNCTION_NAME, CRON_DUE_EVENT, CronSource, CronSourceEnum, CronStatus, CronStatusEnum, NOTIFY_DUE_CRON_WHEN_MUTATING_FUNCTION_NAME, NOTIFY_DUE_CRON_WHEN_MUTATING_TRIGGER_NAME, @@ -143,14 +143,14 @@ impl MigrationTrait for Migration { locked_at = &Cron::LockedAt.to_string(), timeout_ms = &Cron::TimeoutMs.to_string(), status = &Cron::Status.to_string(), - pending = &CronStatus::Pending.to_string(), + pending = &CronStatus::Pending.to_value(), attempts = &Cron::Attempts.to_string(), max_attempts = &Cron::MaxAttempts.to_string(), )) .await?; db.execute_unprepared(&format!( - r#"CREATE TRIGGER {NOTIFY_DUE_CRON_WHEN_MUTATING_TRIGGER_NAME} + r#"CREATE OR REPLACE TRIGGER {NOTIFY_DUE_CRON_WHEN_MUTATING_TRIGGER_NAME} AFTER INSERT OR UPDATE ON {table} FOR EACH ROW EXECUTE FUNCTION {NOTIFY_DUE_CRON_WHEN_MUTATING_FUNCTION_NAME}();"#, @@ -158,35 +158,6 @@ impl MigrationTrait for Migration { )) .await?; - db.execute_unprepared(&format!( - r#"CREATE OR REPLACE FUNCTION {CHECK_AND_CLEANUP_EXPIRED_CRON_LOCKS_FUNCTION_NAME}() RETURNS INTEGER AS $$ - DECLARE - affected_count INTEGER; - BEGIN - UPDATE {table} - SET - {locked_by} = NULL, - {locked_at} = NULL, - {status} = '{pending}' - WHERE - {locked_by} IS NOT NULL - AND {timeout_ms} IS NOT NULL - AND {locked_at} + {timeout_ms} * INTERVAL '1 millisecond' <= CURRENT_TIMESTAMP - AND {status} = '{running}'; - GET DIAGNOSTICS affected_count = ROW_COUNT; - RETURN affected_count; - END; - $$ LANGUAGE plpgsql;"#, - table = &Cron::Table.to_string(), - locked_by = &Cron::LockedBy.to_string(), - locked_at = &Cron::LockedAt.to_string(), - status = &Cron::Status.to_string(), - running = &CronStatus::Running.to_string(), - pending = &CronStatus::Pending.to_string(), - timeout_ms = &Cron::TimeoutMs.to_string(), - )) - .await?; - db.execute_unprepared(&format!( r#"CREATE OR REPLACE FUNCTION {CHECK_AND_TRIGGER_DUE_CRONS_FUNCTION_NAME}() RETURNS INTEGER AS $$ DECLARE @@ -220,7 +191,7 @@ impl MigrationTrait for Migration { next_run = &Cron::NextRun.to_string(), enabled = &Cron::Enabled.to_string(), status = &Cron::Status.to_string(), - pending = &CronStatus::Pending.to_string(), + pending = &CronStatus::Pending.to_value(), locked_at = &Cron::LockedAt.to_string(), timeout_ms = &Cron::TimeoutMs.to_string(), priority = &Cron::Priority.to_string(), @@ -246,11 +217,6 @@ impl MigrationTrait for Migration { )) .await?; - db.execute_unprepared(&format!( - r#"DROP FUNCTION IF EXISTS {CHECK_AND_CLEANUP_EXPIRED_CRON_LOCKS_FUNCTION_NAME}();"#, - )) - .await?; - db.execute_unprepared(&format!( r#"DROP FUNCTION IF EXISTS {CHECK_AND_TRIGGER_DUE_CRONS_FUNCTION_NAME}();"#, )) diff --git a/apps/recorder/src/models/cron/core.rs b/apps/recorder/src/models/cron/core.rs index 2324876..cf21096 100644 --- a/apps/recorder/src/models/cron/core.rs +++ b/apps/recorder/src/models/cron/core.rs @@ -1,7 +1,5 @@ pub const CRON_DUE_EVENT: &str = "cron_due"; -pub const CHECK_AND_CLEANUP_EXPIRED_CRON_LOCKS_FUNCTION_NAME: &str = - "check_and_cleanup_expired_cron_locks"; pub const CHECK_AND_TRIGGER_DUE_CRONS_FUNCTION_NAME: &str = "check_and_trigger_due_crons"; pub const NOTIFY_DUE_CRON_WHEN_MUTATING_FUNCTION_NAME: &str = "notify_due_cron_when_mutating"; diff --git a/apps/recorder/src/models/cron/mod.rs b/apps/recorder/src/models/cron/mod.rs index dc9b3a5..98a4667 100644 --- a/apps/recorder/src/models/cron/mod.rs +++ b/apps/recorder/src/models/cron/mod.rs @@ -2,9 +2,8 @@ mod core; mod registry; pub use core::{ - CHECK_AND_CLEANUP_EXPIRED_CRON_LOCKS_FUNCTION_NAME, CHECK_AND_TRIGGER_DUE_CRONS_FUNCTION_NAME, - CRON_DUE_EVENT, NOTIFY_DUE_CRON_WHEN_MUTATING_FUNCTION_NAME, - NOTIFY_DUE_CRON_WHEN_MUTATING_TRIGGER_NAME, + CHECK_AND_TRIGGER_DUE_CRONS_FUNCTION_NAME, CRON_DUE_EVENT, + NOTIFY_DUE_CRON_WHEN_MUTATING_FUNCTION_NAME, NOTIFY_DUE_CRON_WHEN_MUTATING_TRIGGER_NAME, }; use async_trait::async_trait; @@ -270,19 +269,6 @@ impl Model { Ok(()) } - pub async fn cleanup_expired_locks(ctx: &dyn AppContextTrait) -> RecorderResult { - let db = ctx.db(); - - let result = db - .execute(Statement::from_string( - db.get_database_backend(), - format!("SELECT {CHECK_AND_CLEANUP_EXPIRED_CRON_LOCKS_FUNCTION_NAME}()"), - )) - .await?; - - Ok(result.rows_affected() as i32) - } - pub async fn check_and_trigger_due_crons(ctx: &dyn AppContextTrait) -> RecorderResult<()> { let db = ctx.db(); diff --git a/apps/recorder/src/task/service.rs b/apps/recorder/src/task/service.rs index da68bbc..5e0b02c 100644 --- a/apps/recorder/src/task/service.rs +++ b/apps/recorder/src/task/service.rs @@ -185,9 +185,6 @@ impl TaskService { tokio::task::spawn(async move { loop { interval.tick().await; - if let Err(e) = cron::Model::cleanup_expired_locks(ctx.as_ref()).await { - tracing::error!("Error cleaning up expired locks: {e}"); - } if let Err(e) = cron::Model::check_and_trigger_due_crons(ctx.as_ref()).await { tracing::error!("Error checking and triggering due crons: {e}"); From c858cc7d44c48fef749d5dc338491b4d237b4960 Mon Sep 17 00:00:00 2001 From: lonelyhentxi Date: Sat, 28 Jun 2025 03:38:53 +0800 Subject: [PATCH 2/3] fix: fix cron timeout clean --- apps/recorder/src/models/cron/mod.rs | 71 +++++++++++++++++++++++++--- apps/recorder/src/task/config.rs | 7 +++ apps/recorder/src/task/service.rs | 33 +++++++++++-- 3 files changed, 101 insertions(+), 10 deletions(-) diff --git a/apps/recorder/src/models/cron/mod.rs b/apps/recorder/src/models/cron/mod.rs index 98a4667..2ac57c1 100644 --- a/apps/recorder/src/models/cron/mod.rs +++ b/apps/recorder/src/models/cron/mod.rs @@ -10,8 +10,11 @@ use async_trait::async_trait; use chrono::{DateTime, Utc}; use croner::Cron; use sea_orm::{ - ActiveValue::Set, DeriveActiveEnum, DeriveDisplay, DeriveEntityModel, EnumIter, QuerySelect, - Statement, TransactionTrait, entity::prelude::*, sea_query::LockType, + ActiveValue::Set, + Condition, DeriveActiveEnum, DeriveDisplay, DeriveEntityModel, EnumIter, QuerySelect, + Statement, TransactionTrait, + entity::prelude::*, + sea_query::{ExprTrait, LockBehavior, LockType}, sqlx::postgres::PgNotification, }; use serde::{Deserialize, Serialize}; @@ -126,6 +129,7 @@ impl Model { ctx: &dyn AppContextTrait, notification: PgNotification, worker_id: &str, + retry_duration: chrono::Duration, ) -> RecorderResult<()> { let payload: Self = serde_json::from_str(notification.payload())?; let cron_id = payload.id; @@ -140,7 +144,8 @@ impl Model { } Err(e) => { tracing::error!("Error executing cron {cron_id}: {e}"); - cron.mark_cron_failed(ctx, &e.to_string()).await?; + cron.mark_cron_failed(ctx, &e.to_string(), retry_duration) + .await?; } }, None => { @@ -162,7 +167,7 @@ impl Model { let txn = db.begin().await?; let cron = Entity::find_by_id(cron_id) - .lock(LockType::Update) + .lock_with_behavior(LockType::Update, LockBehavior::SkipLocked) .one(&txn) .await?; @@ -235,7 +240,12 @@ impl Model { Ok(()) } - async fn mark_cron_failed(&self, ctx: &dyn AppContextTrait, error: &str) -> RecorderResult<()> { + async fn mark_cron_failed( + &self, + ctx: &dyn AppContextTrait, + error: &str, + retry_duration: chrono::Duration, + ) -> RecorderResult<()> { let db = ctx.db(); let should_retry = self.attempts < self.max_attempts; @@ -247,7 +257,7 @@ impl Model { }; let next_run = if should_retry { - Some(Utc::now() + chrono::Duration::seconds(5)) + Some(Utc::now() + retry_duration) } else { Some(self.calculate_next_run(&self.cron_expr)?) }; @@ -281,6 +291,55 @@ impl Model { Ok(()) } + pub async fn check_and_cleanup_expired_cron_locks( + ctx: &dyn AppContextTrait, + retry_duration: chrono::Duration, + ) -> RecorderResult<()> { + let db = ctx.db(); + + let condition = Condition::all() + .add(Column::Status.eq(CronStatus::Running)) + .add(Column::LastRun.is_not_null()) + .add(Column::TimeoutMs.is_not_null()) + .add( + Expr::col(Column::LastRun) + .add(Expr::col(Column::TimeoutMs).mul(Expr::cust("INTERVAL '1 millisecond'"))) + .lte(Expr::current_timestamp()), + ); + + let cron_ids = Entity::find() + .select_only() + .column(Column::Id) + .filter(condition.clone()) + .lock_with_behavior(LockType::Update, LockBehavior::SkipLocked) + .into_tuple::() + .all(db) + .await?; + + for cron_id in cron_ids { + let txn = db.begin().await?; + let locked_cron = Entity::find_by_id(cron_id) + .filter(condition.clone()) + .lock_with_behavior(LockType::Update, LockBehavior::SkipLocked) + .one(&txn) + .await?; + + if let Some(locked_cron) = locked_cron { + locked_cron + .mark_cron_failed( + ctx, + format!("Cron timeout of {}ms", locked_cron.timeout_ms).as_str(), + retry_duration, + ) + .await?; + txn.commit().await?; + } else { + txn.rollback().await?; + } + } + Ok(()) + } + fn calculate_next_run(&self, cron_expr: &str) -> RecorderResult> { let cron_expr = Cron::new(cron_expr).parse()?; diff --git a/apps/recorder/src/task/config.rs b/apps/recorder/src/task/config.rs index 7d8379e..f01d423 100644 --- a/apps/recorder/src/task/config.rs +++ b/apps/recorder/src/task/config.rs @@ -12,6 +12,8 @@ pub struct TaskConfig { pub subscriber_task_timeout: Duration, #[serde(default = "default_system_task_timeout")] pub system_task_timeout: Duration, + #[serde(default = "default_cron_retry_duration")] + pub cron_retry_duration: Duration, } impl Default for TaskConfig { @@ -21,6 +23,7 @@ impl Default for TaskConfig { system_task_concurrency: default_system_task_workers(), subscriber_task_timeout: default_subscriber_task_timeout(), system_task_timeout: default_system_task_timeout(), + cron_retry_duration: default_cron_retry_duration(), } } } @@ -48,3 +51,7 @@ pub fn default_subscriber_task_timeout() -> Duration { pub fn default_system_task_timeout() -> Duration { Duration::from_secs(3600) } + +pub fn default_cron_retry_duration() -> Duration { + Duration::from_secs(5) +} diff --git a/apps/recorder/src/task/service.rs b/apps/recorder/src/task/service.rs index 5e0b02c..f2492cd 100644 --- a/apps/recorder/src/task/service.rs +++ b/apps/recorder/src/task/service.rs @@ -170,9 +170,14 @@ impl TaskService { let listener = self.setup_cron_due_listening().await?; let ctx = self.ctx.clone(); let cron_worker_id = self.cron_worker_id.clone(); + let retry_duration = chrono::Duration::milliseconds( + self.config.cron_retry_duration.as_millis() as i64, + ); tokio::task::spawn(async move { - if let Err(e) = Self::listen_cron_due(listener, ctx, &cron_worker_id).await { + if let Err(e) = + Self::listen_cron_due(listener, ctx, &cron_worker_id, retry_duration).await + { tracing::error!("Error listening to cron due: {e}"); } }); @@ -180,11 +185,24 @@ impl TaskService { Ok::<_, RecorderError>(()) }, async { - let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(60)); let ctx = self.ctx.clone(); + let retry_duration = chrono::Duration::milliseconds( + self.config.cron_retry_duration.as_millis() as i64, + ); tokio::task::spawn(async move { + let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(60)); loop { interval.tick().await; + if let Err(e) = cron::Model::check_and_cleanup_expired_cron_locks( + ctx.as_ref(), + retry_duration, + ) + .await + { + tracing::error!( + "Error checking and cleaning up expired cron locks: {e}" + ); + } if let Err(e) = cron::Model::check_and_trigger_due_crons(ctx.as_ref()).await { tracing::error!("Error checking and triggering due crons: {e}"); @@ -257,12 +275,19 @@ impl TaskService { mut listener: PgListener, ctx: Arc, worker_id: &str, + retry_duration: chrono::Duration, ) -> RecorderResult<()> { listener.listen(CRON_DUE_EVENT).await?; + loop { let notification = listener.recv().await?; - if let Err(e) = - cron::Model::handle_cron_notification(ctx.as_ref(), notification, worker_id).await + if let Err(e) = cron::Model::handle_cron_notification( + ctx.as_ref(), + notification, + worker_id, + retry_duration, + ) + .await { tracing::error!("Error handling cron notification: {e}"); } From f83371bbf9a3bf451583278ca6155180ff828ba5 Mon Sep 17 00:00:00 2001 From: lonelyhentxi Date: Sat, 28 Jun 2025 04:10:18 +0800 Subject: [PATCH 3/3] fix: fix task lifetime --- apps/recorder/src/task/config.rs | 17 +++--- apps/recorder/src/task/service.rs | 8 +-- .../routes/_app/subscriptions/detail.$id.tsx | 52 +++++++++---------- 3 files changed, 39 insertions(+), 38 deletions(-) diff --git a/apps/recorder/src/task/config.rs b/apps/recorder/src/task/config.rs index f01d423..3b2c18d 100644 --- a/apps/recorder/src/task/config.rs +++ b/apps/recorder/src/task/config.rs @@ -8,10 +8,10 @@ pub struct TaskConfig { pub subscriber_task_concurrency: u32, #[serde(default = "default_system_task_workers")] pub system_task_concurrency: u32, - #[serde(default = "default_subscriber_task_timeout")] - pub subscriber_task_timeout: Duration, - #[serde(default = "default_system_task_timeout")] - pub system_task_timeout: Duration, + #[serde(default = "default_subscriber_task_reenqueue_orphaned_after")] + pub subscriber_task_reenqueue_orphaned_after: Duration, + #[serde(default = "default_system_task_reenqueue_orphaned_after")] + pub system_task_reenqueue_orphaned_after: Duration, #[serde(default = "default_cron_retry_duration")] pub cron_retry_duration: Duration, } @@ -21,8 +21,9 @@ impl Default for TaskConfig { Self { subscriber_task_concurrency: default_subscriber_task_workers(), system_task_concurrency: default_system_task_workers(), - subscriber_task_timeout: default_subscriber_task_timeout(), - system_task_timeout: default_system_task_timeout(), + subscriber_task_reenqueue_orphaned_after: + default_subscriber_task_reenqueue_orphaned_after(), + system_task_reenqueue_orphaned_after: default_system_task_reenqueue_orphaned_after(), cron_retry_duration: default_cron_retry_duration(), } } @@ -44,11 +45,11 @@ pub fn default_system_task_workers() -> u32 { } } -pub fn default_subscriber_task_timeout() -> Duration { +pub fn default_subscriber_task_reenqueue_orphaned_after() -> Duration { Duration::from_secs(3600) } -pub fn default_system_task_timeout() -> Duration { +pub fn default_system_task_reenqueue_orphaned_after() -> Duration { Duration::from_secs(3600) } diff --git a/apps/recorder/src/task/service.rs b/apps/recorder/src/task/service.rs index f2492cd..1401865 100644 --- a/apps/recorder/src/task/service.rs +++ b/apps/recorder/src/task/service.rs @@ -41,10 +41,10 @@ impl TaskService { }; let pool = ctx.db().get_postgres_connection_pool().clone(); - let subscriber_task_storage_config = - Config::new(SUBSCRIBER_TASK_APALIS_NAME).set_keep_alive(config.subscriber_task_timeout); - let system_task_storage_config = - Config::new(SYSTEM_TASK_APALIS_NAME).set_keep_alive(config.system_task_timeout); + let subscriber_task_storage_config = Config::new(SUBSCRIBER_TASK_APALIS_NAME) + .set_reenqueue_orphaned_after(config.subscriber_task_reenqueue_orphaned_after); + let system_task_storage_config = Config::new(SYSTEM_TASK_APALIS_NAME) + .set_reenqueue_orphaned_after(config.system_task_reenqueue_orphaned_after); let subscriber_task_storage = ApalisPostgresStorage::new_with_config(pool.clone(), subscriber_task_storage_config); let system_task_storage = diff --git a/apps/webui/src/presentation/routes/_app/subscriptions/detail.$id.tsx b/apps/webui/src/presentation/routes/_app/subscriptions/detail.$id.tsx index b3c289e..5160462 100644 --- a/apps/webui/src/presentation/routes/_app/subscriptions/detail.$id.tsx +++ b/apps/webui/src/presentation/routes/_app/subscriptions/detail.$id.tsx @@ -212,20 +212,6 @@ function SubscriptionDetailRouteComponent() { View subscription detail -
- -
@@ -439,18 +425,32 @@ function SubscriptionDetailRouteComponent() {
- - - - - - +
+ + + + + + + +
{subscription.subscriberTask?.nodes &&