From 4cca86e50f1ce661ca03ed4fc51fbddbe97232bf Mon Sep 17 00:00:00 2001 From: Vikhyath Mondreti Date: Tue, 2 Jun 2026 13:46:11 -0700 Subject: [PATCH 1/2] fix(schedules): count usage lim error schedule as failed run --- .../app/api/schedules/execute/route.test.ts | 9 ++ apps/sim/app/api/schedules/execute/route.ts | 22 +---- apps/sim/background/schedule-execution.ts | 91 ++++++++++--------- apps/sim/lib/core/config/env.ts | 1 + .../workflows/schedules/execution-limits.ts | 11 +++ 5 files changed, 70 insertions(+), 64 deletions(-) diff --git a/apps/sim/app/api/schedules/execute/route.test.ts b/apps/sim/app/api/schedules/execute/route.test.ts index 2fe434e87b2..2d0fa95ff55 100644 --- a/apps/sim/app/api/schedules/execute/route.test.ts +++ b/apps/sim/app/api/schedules/execute/route.test.ts @@ -50,6 +50,15 @@ vi.mock('@/background/schedule-execution', () => ({ executeScheduleJob: mockExecuteScheduleJob, executeJobInline: mockExecuteJobInline, releaseScheduleLock: mockReleaseScheduleLock, + buildScheduleFailureUpdate: (now: Date, nextRunAt: Date | null) => ({ + updatedAt: now, + lastQueuedAt: null, + nextRunAt, + failedCount: { type: 'sql' }, + lastFailedAt: now, + status: { type: 'sql' }, + infraRetryCount: 0, + }), })) vi.mock('@/lib/core/config/feature-flags', () => mockFeatureFlags) diff --git a/apps/sim/app/api/schedules/execute/route.ts b/apps/sim/app/api/schedules/execute/route.ts index 6dedd8d7494..414ae81047b 100644 --- a/apps/sim/app/api/schedules/execute/route.ts +++ b/apps/sim/app/api/schedules/execute/route.ts @@ -27,12 +27,12 @@ import { SCHEDULE_WORKFLOW_ENQUEUE_LIMIT, } from '@/lib/workflows/schedules/execution-limits' import { + buildScheduleFailureUpdate, executeJobInline, executeScheduleJob, releaseScheduleLock, type ScheduleExecutionPayload, } from '@/background/schedule-execution' -import { MAX_CONSECUTIVE_FAILURES } from '@/triggers/constants' export const dynamic = 'force-dynamic' export const maxDuration = 3600 @@ -321,15 +321,7 @@ async function markClaimedScheduleFailed( const now = new Date() await db .update(workflowSchedule) - .set({ - updatedAt: now, - lastQueuedAt: null, - lastFailedAt: now, - nextRunAt: getScheduleNextRunAt(schedule, now), - failedCount: sql`COALESCE(${workflowSchedule.failedCount}, 0) + 1`, - status: sql`CASE WHEN COALESCE(${workflowSchedule.failedCount}, 0) + 1 >= ${MAX_CONSECUTIVE_FAILURES} THEN 'disabled' ELSE 'active' END`, - infraRetryCount: 0, - }) + .set(buildScheduleFailureUpdate(now, getScheduleNextRunAt(schedule, now))) .where( and( eq(workflowSchedule.id, schedule.id), @@ -482,15 +474,7 @@ async function recoverStaleDatabaseScheduleJobs(now: Date): Promise { await tx .update(workflowSchedule) - .set({ - updatedAt: now, - lastQueuedAt: null, - lastFailedAt: now, - nextRunAt: getScheduleNextRunAt(payload, now), - failedCount: sql`COALESCE(${workflowSchedule.failedCount}, 0) + 1`, - status: sql`CASE WHEN COALESCE(${workflowSchedule.failedCount}, 0) + 1 >= ${MAX_CONSECUTIVE_FAILURES} THEN 'disabled' ELSE 'active' END`, - infraRetryCount: 0, - }) + .set(buildScheduleFailureUpdate(now, getScheduleNextRunAt(payload, now))) .where( and( eq(workflowSchedule.id, payload.scheduleId), diff --git a/apps/sim/background/schedule-execution.ts b/apps/sim/background/schedule-execution.ts index 4c240e5f1e7..53b43245919 100644 --- a/apps/sim/background/schedule-execution.ts +++ b/apps/sim/background/schedule-execution.ts @@ -39,6 +39,7 @@ import { SCHEDULE_INFRA_RETRY_BASE_MS, SCHEDULE_INFRA_RETRY_MAX_ATTEMPTS, SCHEDULE_INFRA_RETRY_MAX_MS, + SCHEDULE_USAGE_LIMIT_BACKOFF_MS, } from '@/lib/workflows/schedules/execution-limits' import { type BlockState, @@ -76,6 +77,29 @@ function resetScheduleInfraRetryCount(): Pick backoffRunAt.getTime() + ? cronNextRunAt + : backoffRunAt + logger.warn(`[${requestId}] Usage limit exceeded, backing off scheduled run`, { + scheduleId: payload.scheduleId, + nextRunAt: nextRunAt.toISOString(), + }) await updateClaimedSchedule( - { - updatedAt: now, - lastQueuedAt: null, - nextRunAt, - ...resetScheduleInfraRetryCount(), - }, + buildScheduleFailureUpdate(now, nextRunAt), `Error updating schedule ${payload.scheduleId} after usage limit check` ) return @@ -809,15 +834,7 @@ export async function executeScheduleJob(payload: ScheduleExecutionPayload) { const nextRunAt = await determineNextRunAfterError(payload, now, requestId) await updateClaimedSchedule( - { - updatedAt: now, - lastQueuedAt: null, - nextRunAt, - failedCount: incrementScheduleFailedCount(), - lastFailedAt: now, - status: scheduleStatusAfterFailedCountIncrement(), - ...resetScheduleInfraRetryCount(), - }, + buildScheduleFailureUpdate(now, nextRunAt), `Error updating schedule ${payload.scheduleId} after preprocessing failure` ) return @@ -914,15 +931,7 @@ export async function executeScheduleJob(payload: ScheduleExecutionPayload) { const nextRunAt = calculateNextRunTime(payload, executionResult.blocks) await updateClaimedSchedule( - { - updatedAt: now, - lastQueuedAt: null, - nextRunAt, - failedCount: incrementScheduleFailedCount(), - lastFailedAt: now, - status: scheduleStatusAfterFailedCountIncrement(), - ...resetScheduleInfraRetryCount(), - }, + buildScheduleFailureUpdate(now, nextRunAt), `Error updating schedule ${payload.scheduleId} after failure` ) } catch (error: unknown) { @@ -934,15 +943,7 @@ export async function executeScheduleJob(payload: ScheduleExecutionPayload) { const nextRunAt = await determineNextRunAfterError(payload, now, requestId) await updateClaimedSchedule( - { - updatedAt: now, - lastQueuedAt: null, - nextRunAt, - failedCount: incrementScheduleFailedCount(), - lastFailedAt: now, - status: scheduleStatusAfterFailedCountIncrement(), - ...resetScheduleInfraRetryCount(), - }, + buildScheduleFailureUpdate(now, nextRunAt), `Error updating schedule ${payload.scheduleId} after execution error` ) } diff --git a/apps/sim/lib/core/config/env.ts b/apps/sim/lib/core/config/env.ts index 223eb519524..7d0c64369b6 100644 --- a/apps/sim/lib/core/config/env.ts +++ b/apps/sim/lib/core/config/env.ts @@ -204,6 +204,7 @@ export const env = createEnv({ SCHEDULE_INFRA_RETRY_BASE_MS: z.string().optional().default('60000'), SCHEDULE_INFRA_RETRY_MAX_MS: z.string().optional().default('300000'), SCHEDULE_INFRA_RETRY_MAX_ATTEMPTS: z.string().optional().default('10'), + SCHEDULE_USAGE_LIMIT_BACKOFF_MS: z.string().optional().default('3600000'), // Min gap between retries while over usage limit (default 1h) // Cloud Storage - AWS S3 AWS_REGION: z.string().optional(), // AWS region for S3 buckets diff --git a/apps/sim/lib/workflows/schedules/execution-limits.ts b/apps/sim/lib/workflows/schedules/execution-limits.ts index a5bb9c5bdc0..39a91ccb137 100644 --- a/apps/sim/lib/workflows/schedules/execution-limits.ts +++ b/apps/sim/lib/workflows/schedules/execution-limits.ts @@ -40,3 +40,14 @@ export const SCHEDULE_INFRA_RETRY_MAX_ATTEMPTS = envNumber( integer: true, } ) + +/** + * Minimum delay before a schedule retries after hitting a usage limit (402). + * Usage limits only clear on billing-period rollover or upgrade, so over-limit + * schedules back off to (at most) this cadence instead of re-running every tick. + */ +export const SCHEDULE_USAGE_LIMIT_BACKOFF_MS = envNumber( + env.SCHEDULE_USAGE_LIMIT_BACKOFF_MS, + 60 * 60_000, + { min: 1, integer: true } +) From 0d5e8136e1b10b2a1fc86df2aae8c8cf7eff28bb Mon Sep 17 00:00:00 2001 From: Vikhyath Mondreti Date: Tue, 2 Jun 2026 13:51:22 -0700 Subject: [PATCH 2/2] remove backoff logic --- apps/sim/background/schedule-execution.ts | 19 +++++++------------ apps/sim/lib/core/config/env.ts | 1 - .../workflows/schedules/execution-limits.ts | 11 ----------- 3 files changed, 7 insertions(+), 24 deletions(-) diff --git a/apps/sim/background/schedule-execution.ts b/apps/sim/background/schedule-execution.ts index 53b43245919..b90886d7f71 100644 --- a/apps/sim/background/schedule-execution.ts +++ b/apps/sim/background/schedule-execution.ts @@ -39,7 +39,6 @@ import { SCHEDULE_INFRA_RETRY_BASE_MS, SCHEDULE_INFRA_RETRY_MAX_ATTEMPTS, SCHEDULE_INFRA_RETRY_MAX_MS, - SCHEDULE_USAGE_LIMIT_BACKOFF_MS, } from '@/lib/workflows/schedules/execution-limits' import { type BlockState, @@ -794,20 +793,16 @@ export async function executeScheduleJob(payload: ScheduleExecutionPayload) { case 402: { /** - * Usage limits are a billing state, not a broken workflow, and only clear - * on billing-period rollover or upgrade. Back off to at most the usage-limit - * cadence (never faster than the schedule's own cadence) so an over-limit - * schedule stops re-running every tick, and count each hit toward the shared - * auto-disable threshold so an abandoned over-limit schedule eventually stops. + * Usage limits are a billing state, not a broken workflow, but they only + * clear on billing-period rollover or upgrade. Keep retrying at the normal + * cadence, but count each hit toward the shared auto-disable threshold so an + * abandoned over-limit schedule eventually stops instead of running forever. * A successful run resets failedCount, so transient overages self-heal. */ - const cronNextRunAt = await calculateNextRunFromDeployment(payload, requestId) - const backoffRunAt = new Date(now.getTime() + SCHEDULE_USAGE_LIMIT_BACKOFF_MS) const nextRunAt = - cronNextRunAt && cronNextRunAt.getTime() > backoffRunAt.getTime() - ? cronNextRunAt - : backoffRunAt - logger.warn(`[${requestId}] Usage limit exceeded, backing off scheduled run`, { + (await calculateNextRunFromDeployment(payload, requestId)) ?? + new Date(now.getTime() + 60 * 60 * 1000) + logger.warn(`[${requestId}] Usage limit exceeded, counting as failed run`, { scheduleId: payload.scheduleId, nextRunAt: nextRunAt.toISOString(), }) diff --git a/apps/sim/lib/core/config/env.ts b/apps/sim/lib/core/config/env.ts index 7d0c64369b6..223eb519524 100644 --- a/apps/sim/lib/core/config/env.ts +++ b/apps/sim/lib/core/config/env.ts @@ -204,7 +204,6 @@ export const env = createEnv({ SCHEDULE_INFRA_RETRY_BASE_MS: z.string().optional().default('60000'), SCHEDULE_INFRA_RETRY_MAX_MS: z.string().optional().default('300000'), SCHEDULE_INFRA_RETRY_MAX_ATTEMPTS: z.string().optional().default('10'), - SCHEDULE_USAGE_LIMIT_BACKOFF_MS: z.string().optional().default('3600000'), // Min gap between retries while over usage limit (default 1h) // Cloud Storage - AWS S3 AWS_REGION: z.string().optional(), // AWS region for S3 buckets diff --git a/apps/sim/lib/workflows/schedules/execution-limits.ts b/apps/sim/lib/workflows/schedules/execution-limits.ts index 39a91ccb137..a5bb9c5bdc0 100644 --- a/apps/sim/lib/workflows/schedules/execution-limits.ts +++ b/apps/sim/lib/workflows/schedules/execution-limits.ts @@ -40,14 +40,3 @@ export const SCHEDULE_INFRA_RETRY_MAX_ATTEMPTS = envNumber( integer: true, } ) - -/** - * Minimum delay before a schedule retries after hitting a usage limit (402). - * Usage limits only clear on billing-period rollover or upgrade, so over-limit - * schedules back off to (at most) this cadence instead of re-running every tick. - */ -export const SCHEDULE_USAGE_LIMIT_BACKOFF_MS = envNumber( - env.SCHEDULE_USAGE_LIMIT_BACKOFF_MS, - 60 * 60_000, - { min: 1, integer: true } -)