docs/plans/2025-12-31-event-driven-approval-finding-design.md
Date: 2025-12-31 Status: Design Author: Danny
This design replaces the polling-based approval finding system with an event-driven architecture that is HA-compatible. Approval templates will be determined automatically and instantly when plan checks complete, eliminating the in-memory sync.Map and 1-second polling loop.
The current approval finding system uses an in-memory sync.Map which breaks in High Availability (HA) deployments:
// backend/component/state/state.go
type State struct {
ApprovalFinding sync.Map // map[issue.ID]*store.IssueMessage
}
// backend/runner/approval/runner.go
func (r *Runner) runOnce(ctx context.Context) {
r.stateCfg.ApprovalFinding.Range(func(key, value any) bool {
// Process issues in the map
})
}
HA Incompatibilities:
ApprovalFindingDone flag but no cross-instance coordinationAdditional Problems:
ApprovalFindingError persisted in database requires manual retry UIUse a channel-based architecture similar to the existing RolloutCreationChan pattern. Approval finding is triggered by events, primarily plan check completion.
Approval finding depends on plan check summary report data, so the natural trigger point is plan check completion. If users need to trigger approval finding manually, they can click "Rerun plan checks" in the UI.
┌─────────────────────┐
│ Plan Check Complete │
│ (status = DONE) │
└──────────┬──────────┘
│
▼
┌──────────────────┐
│ Signal channel: │
│ ApprovalCheckChan│
└──────────┬───────┘
│
▼
┌──────────────────────────┐
│ Approval Runner (event │
│ listener) receives signal│
└──────────┬───────────────┘
│
▼
┌──────────────────────────┐
│ Get fresh issue from DB │
│ Find approval template │
│ Update issue payload │
└──────────────────────────┘
Benefits:
HA Consideration:
The channel is in-memory per-instance, which is acceptable because:
approval_finding_done=falseFile: backend/component/state/state.go
type State struct {
// ADD new channel:
// ApprovalCheckChan signals when an issue needs approval template finding.
// Triggered by plan check completion.
ApprovalCheckChan chan int64 // issue UID
// REMOVE old sync.Map:
// ApprovalFinding sync.Map // map[issue.ID]*store.IssueMessage
TaskRunSchedulerInfo sync.Map
RunningTaskRunsCancelFunc sync.Map
RunningPlanChecks sync.Map
RunningPlanCheckRunsCancelFunc sync.Map
PlanCheckTickleChan chan int
TaskRunTickleChan chan int
RolloutCreationChan chan int64
PlanCompletionCheckChan chan int64
}
func New() (*State, error) {
return &State{
ApprovalCheckChan: make(chan int64, 1000), // buffered for bursts
PlanCheckTickleChan: make(chan int, 1000),
TaskRunTickleChan: make(chan int, 1000),
RolloutCreationChan: make(chan int64, 100),
PlanCompletionCheckChan: make(chan int64, 1000),
}, nil
}
File: backend/runner/approval/runner.go
New Run() method (event listener):
func (r *Runner) Run(ctx context.Context, wg *sync.WaitGroup) {
defer wg.Done()
slog.Debug("Approval runner started (event-driven)")
for {
select {
case issueUID := <-r.stateCfg.ApprovalCheckChan:
r.processIssue(ctx, issueUID)
case <-ctx.Done():
return
}
}
}
func (r *Runner) processIssue(ctx context.Context, issueUID int64) {
// Get fresh issue from database
issue, err := r.store.GetIssue(ctx, &store.FindIssueMessage{UID: &issueUID})
if err != nil {
slog.Error("failed to get issue for approval check",
slog.Int64("issue_uid", issueUID), log.BBError(err))
return
}
if issue == nil {
return // Issue deleted, nothing to do
}
approvalSetting, err := r.store.GetWorkspaceApprovalSetting(ctx)
if err != nil {
slog.Error("failed to get workspace approval setting", log.BBError(err))
return
}
// Find approval template - errors are logged, not persisted
_, err = r.findApprovalTemplateForIssue(ctx, issue, approvalSetting)
if err != nil {
slog.Error("failed to find approval template",
slog.Int64("issue_uid", issueUID),
slog.String("issue_title", issue.Title),
log.BBError(err))
// Don't persist error - user can rerun plan check to retry
}
}
Remove deprecated methods:
runOnce() - polling method no longer neededretryFindApprovalTemplate() - startup recovery removed for HA safetyUpdate error handling in findApprovalTemplateForIssue():
// OLD: Persisted error to database
if err != nil {
if updateErr := updateIssueApprovalPayload(ctx, r.store, issue, &storepb.IssuePayloadApproval{
ApprovalFindingDone: true,
ApprovalFindingError: err.Error(),
}, storepb.RiskLevel_RISK_LEVEL_UNSPECIFIED); updateErr != nil {
return false, multierr.Append(errors.Wrap(updateErr, "failed to update issue payload"), err)
}
return false, err
}
// NEW: Just return error (logged by caller)
if err != nil {
// Don't persist error - it will be logged by caller
// User can rerun plan check to retry
return false, err
}
Location: backend/runner/plancheck/scheduler.go
In markPlanCheckRunDone():
func (s *Scheduler) markPlanCheckRunDone(ctx context.Context, planCheckRun *store.PlanCheckRunMessage, results []*storepb.PlanCheckRunResult_Result) {
result := &storepb.PlanCheckRunResult{
Results: results,
}
if err := s.store.UpdatePlanCheckRun(ctx,
store.PlanCheckRunStatusDone,
result,
planCheckRun.UID,
); err != nil {
slog.Error("failed to mark plan check run done", log.BBError(err))
return
}
// Get issue for this plan
issue, err := s.store.GetIssue(ctx, &store.FindIssueMessage{PlanUID: &planCheckRun.PlanUID})
if err != nil {
slog.Error("failed to get issue for approval check after plan check",
slog.Int("plan_id", int(planCheckRun.PlanUID)),
log.BBError(err))
return
}
if issue != nil {
// Trigger approval finding
s.stateCfg.ApprovalCheckChan <- issue.UID
// Also trigger rollout creation (existing behavior)
s.stateCfg.RolloutCreationChan <- planCheckRun.PlanUID
}
}
Location: backend/api/v1/plan_service.go
When user updates plan (e.g., changes SQL), reset approval status but don't trigger immediately. The plan update will trigger a new plan check run, which will trigger approval finding when it completes.
// Reset approval finding status
if _, err := s.store.UpdateIssue(ctx, issue.UID, &store.UpdateIssueMessage{
PayloadUpsert: &storepb.Issue{
Approval: &storepb.IssuePayloadApproval{
ApprovalFindingDone: false,
},
},
}); err != nil {
slog.Error("failed to reset approval finding status after plan update", log.BBError(err))
}
// Note: Don't trigger ApprovalCheckChan here - plan update creates new plan check run,
// which will trigger approval finding on completion
User clicks "Rerun plan check" button → plan check runs → triggers approval finding on completion.
No code changes needed - existing plan check rerun functionality handles this.
Rationale: Startup recovery is NOT HA-compatible:
What happens to crashed work?
approval_finding_done=falseSince crashes are rare and users have a clear fix (rerun plan check), we accept this trade-off for HA safety.
Approval finding errors are treated as transient and not persisted to the database:
Error Sources:
Error Handling:
ApprovalFindingError field in databaseCHECKING approval statusUser Experience:
Delete from proto: proto/store/approval.proto
message IssuePayloadApproval {
ApprovalTemplate approval_template = 1;
repeated Approver approvers = 2;
bool approval_finding_done = 3;
// REMOVE field 4:
// string approval_finding_error = 4;
// Reserve to prevent field number reuse:
reserved 4;
reserved "approval_finding_error";
}
Regenerate: cd proto && buf generate
File: backend/api/v1/issue_service.go
Remove "approval_status" update path:
// DELETE entire case block (lines 1052-1069):
case "approval_status":
if req.Msg.Issue.ApprovalStatus != v1pb.Issue_CHECKING {
return nil, connect.NewError(connect.CodeInvalidArgument, errors.Errorf("can only set approval_status to CHECKING to trigger re-finding approval templates"))
}
payload := issue.Payload
if payload.Approval == nil {
return nil, connect.NewError(connect.CodeInternal, errors.Errorf("issue payload approval is nil"))
}
if !payload.Approval.ApprovalFindingDone {
return nil, connect.NewError(connect.CodeFailedPrecondition, errors.Errorf("approval template finding is not done"))
}
if patch.PayloadUpsert == nil {
patch.PayloadUpsert = &storepb.Issue{}
}
patch.PayloadUpsert.Approval = &storepb.IssuePayloadApproval{
ApprovalFindingDone: false,
}
Remove dead code:
// DELETE lines 1143-1145:
if updateMasks["approval_finding_done"] {
s.stateCfg.ApprovalFinding.Store(issue.UID, issue)
}
Remove error checks (3 places):
In ApprovalApprove(), RejectIssue(), and UpdateIssue():
// DELETE:
if payload.Approval.ApprovalFindingError != "" {
return nil, connect.NewError(connect.CodeFailedPrecondition, errors.Errorf("approval template finding failed: %v", payload.Approval.ApprovalFindingError))
}
File: backend/api/v1/issue_service_converter.go
// DELETE line 117:
issueV1.ApprovalStatusError = approval.GetApprovalFindingError()
// DELETE lines 130-132 in computeApprovalStatus():
if approval.GetApprovalFindingError() != "" {
return v1pb.Issue_ERROR
}
File: backend/utils/utils.go
In CheckApprovalApproved():
// DELETE:
if approval.ApprovalFindingError != "" {
return false, nil
}
File: frontend/src/components/Plan/components/IssueReviewView/Sidebar/ApprovalFlowSection/ApprovalFlowSection.vue
Remove ERROR state and retry button:
<!-- DELETE lines 18-30: -->
<div
v-else-if="issue.approvalStatus === Issue_ApprovalStatus.ERROR"
class="flex items-center gap-x-2"
>
<span class="text-error text-sm">{{ issue.approvalStatusError }}</span>
<NButton
size="tiny"
:loading="retrying"
@click="retryFindingApprovalFlow"
>
{{ $t("common.retry") }}
</NButton>
</div>
Remove retry function:
// DELETE:
const retrying = ref(false);
const retryFindingApprovalFlow = async () => {
retrying.value = true;
try {
await useIssueV1Store().regenerateReviewV1(props.issue.name);
emit("issue-updated");
} finally {
retrying.value = false;
}
};
File: frontend/src/store/modules/v1/issue.ts
Remove regenerateReviewV1:
// DELETE function (lines 80-89):
const regenerateReviewV1 = async (name: string) => {
const request = create(UpdateIssueRequestSchema, {
issue: create(IssueSchema, {
name,
approvalStatus: Issue_ApprovalStatus.CHECKING,
}),
updateMask: { paths: ["approval_status"] },
});
await issueServiceClientConnect.updateIssue(request);
};
// DELETE from return statement:
return {
listIssues,
fetchIssueByName,
// regenerateReviewV1, // DELETE this
};
No database migration needed because:
ApprovalFindingError is stored in JSONB issue.payload columnapproval_finding_error will be ignored (field not in proto)Proto field reservation:
reserved 4;
reserved "approval_finding_error";
This prevents:
approval_finding_done=false will be processed on next plan check rerunApproval runner event processing
processIssue() with valid issueTrigger points
End-to-end flow
Error scenarios
Multiple replicas
Crash recovery
HA Compatibility
Performance
Simplicity
User Experience
If orphaned issues (stuck at CHECKING due to crashes) become a problem:
Option: Add lightweight polling safety net
FOR UPDATE SKIP LOCKED for HA-safe claimImplementation:
func (r *Runner) Run(ctx context.Context, wg *sync.WaitGroup) {
defer wg.Done()
slog.Debug("Approval runner started (event-driven)")
ticker := time.NewTicker(60 * time.Second)
defer ticker.Stop()
for {
select {
case issueUID := <-r.stateCfg.ApprovalCheckChan:
r.processIssue(ctx, issueUID)
case <-ticker.C:
// Safety net: process old stuck issues
r.processStuckIssues(ctx)
case <-ctx.Done():
return
}
}
}
This can be added later if needed, keeping the design simple initially.