Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions agents/base2/base2-free-evals.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import { createBase2 } from './base2'

// Build the free-tier base2 agent with user prompts disabled, then
// override its identity fields for the evals orchestrator variant.
const freeAgent = createBase2('free', { noAskUser: true })

const definition = {
  ...freeAgent,
  // Spread first, then override: id/displayName here win over any
  // values carried on the generated agent.
  id: 'base2-free-evals',
  displayName: 'Buffy the Free Evals Orchestrator',
}

export default definition
2 changes: 2 additions & 0 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 17 additions & 16 deletions cli/src/hooks/helpers/__tests__/send-message.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ const { createBatchedMessageUpdater } = await import(
'../../../utils/message-updater'
)
import { createPaymentRequiredError } from '@codebuff/sdk'
import type { RunState } from '@codebuff/sdk'

const createMockTimerController = (): SendMessageTimerController & {
startCalls: string[]
Expand Down Expand Up @@ -348,7 +349,7 @@ describe('handleRunCompletion', () => {
let hasReceivedPlanResponse = false

const runState = {
sessionState: null,
sessionState: undefined,
output: { type: 'lastMessage' as const, value: [] },
}

Expand All @@ -372,7 +373,7 @@ describe('handleRunCompletion', () => {
expect(chainInProgress).toBe(false)
expect(canProcessQueue).toBe(true)
expect(isProcessingQueueRef.current).toBe(false)
expect(streamStatus).toBe('idle')
expect(streamStatus as StreamStatus).toBe('idle')
})

test('does not process server response when wasAbortedByUser is true', () => {
Expand All @@ -388,7 +389,7 @@ describe('handleRunCompletion', () => {
let hasReceivedPlanResponse = false

const runState = {
sessionState: null,
sessionState: undefined,
output: {
type: 'lastMessage' as const,
value: [{ type: 'text' as const, text: 'Server response that should be ignored' }],
Expand Down Expand Up @@ -431,7 +432,7 @@ describe('handleRunCompletion', () => {
let canProcessQueueCalled = false

const runState = {
sessionState: null,
sessionState: undefined,
output: { type: 'lastMessage' as const, value: [] },
}

Expand Down Expand Up @@ -929,7 +930,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves

// Abort handler fires synchronously: UI is updated, but chain lock stays held
expect(streamRefsA.state.wasAbortedByUser).toBe(true)
expect(streamStatus).toBe('idle') // UI shows idle
expect(streamStatus as StreamStatus).toBe('idle') // UI shows idle
expect(chainInProgress).toBe(true) // But chain lock is still held!

// --- PHASE 3: User types run B — verify it's BLOCKED ---
Expand All @@ -952,8 +953,8 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
// Simulate what happens in useSendMessage after `await client.run(runConfig)`:
// 1. previousRunStateRef.current = runState (state saved)
// 2. handleRunCompletion is called
const runStateFromA = {
sessionState: { conversationId: 'conv-123', history: ['user msg A', 'partial assistant response'] },
const runStateFromA: RunState = {
sessionState: { conversationId: 'conv-123', history: ['user msg A', 'partial assistant response'] } as any,
output: { type: 'lastMessage' as const, value: [{ type: 'text' as const, text: 'partial' }] },
}

Expand Down Expand Up @@ -991,11 +992,11 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
expect(chainInProgress).toBe(false)
expect(canProcessQueue).toBe(true)
expect(isProcessingQueueRef.current).toBe(false)
expect(streamStatus).toBe('idle')
expect(streamStatus as StreamStatus).toBe('idle')

// The crucial state continuity: previousRunState from A is available for B
expect(previousRunState).toBe(runStateFromA)
expect(previousRunState.sessionState).toEqual({
expect(previousRunState.sessionState as any).toEqual({
conversationId: 'conv-123',
history: ['user msg A', 'partial assistant response'],
})
Expand Down Expand Up @@ -1049,7 +1050,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
let chainInProgress = true
const isProcessingQueueRef = { current: false }
const isQueuePausedRef = { current: false }
let previousRunState: { sessionState: unknown; output: unknown } | null = null
let previousRunState: RunState | null = null

const setStreamStatus = (status: StreamStatus) => { streamStatus = status }
const setCanProcessQueue = (can: boolean) => { canProcessQueue = can }
Expand Down Expand Up @@ -1083,14 +1084,14 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
expect(chainInProgress).toBe(true) // Lock held

// client.run() resolves for run A
const runStateA = {
const runStateA: RunState = {
sessionState: {
id: 'session-abc',
messages: [
{ role: 'user', content: 'first message' },
{ role: 'assistant', content: 'partial response before cancel' },
],
},
} as any,
output: { type: 'lastMessage' as const, value: [] },
}
previousRunState = runStateA
Expand Down Expand Up @@ -1146,7 +1147,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
// In the real code, this is: previousRunState: previousRunStateRef.current
// passed to createRunConfig
expect(previousRunState).toBe(runStateA)
expect(previousRunState!.sessionState).toEqual({
expect(previousRunState!.sessionState as any).toEqual({
id: 'session-abc',
messages: [
{ role: 'user', content: 'first message' },
Expand All @@ -1155,7 +1156,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
})

// Simulate run B completing normally
const runStateB = {
const runStateB: RunState = {
sessionState: {
id: 'session-abc',
messages: [
Expand All @@ -1164,7 +1165,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
{ role: 'user', content: 'second message' },
{ role: 'assistant', content: 'full response to second message' },
],
},
} as any,
output: { type: 'lastMessage' as const, value: [{ type: 'text' as const, text: 'full response' }] },
}
previousRunState = runStateB
Expand All @@ -1186,7 +1187,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
})

// Final state: both runs' messages are preserved in session history
expect(previousRunState!.sessionState).toEqual({
expect(previousRunState!.sessionState as any).toEqual({
id: 'session-abc',
messages: [
{ role: 'user', content: 'first message' },
Expand Down
180 changes: 0 additions & 180 deletions docs/patterns/handle-steps-generators.md

This file was deleted.

2 changes: 2 additions & 0 deletions evalbuff/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
"run": "bun run src/run-evalbuff.ts"
},
"dependencies": {
"@ai-sdk/anthropic": "^2.0.50",
"@codebuff/common": "workspace:*",
"@codebuff/sdk": "workspace:*",
"ai": "^5.0.0",
"zod": "^4.2.1"
}
}
29 changes: 20 additions & 9 deletions evalbuff/src/__tests__/e2e.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,25 @@ mock.module('../test-repo-utils', () => ({
},
}))

mock.module('../cli-runner', () => ({
runCliAgent: async () => ({
diff: 'mock diff content',
durationMs: 1000,
exitCode: 0,
stdout: 'mock stdout',
stderr: '',
}),
mock.module('../runners/codebuff', () => ({
CodebuffRunner: class {
constructor() {}
async run() {
return {
steps: [{ type: 'text', content: 'mock trace' }],
totalCostUsd: 0.01,
diff: 'mock diff content',
}
}
},
}))

mock.module('@codebuff/sdk', () => ({
CodebuffClient: class {
constructor() {}
async run() { return { output: { type: 'success' }, sessionState: null } }
},
loadLocalAgents: async () => ({}),
}))

// Judge returns alternating scores: low (triggers doc edit), then higher (confirms improvement)
Expand Down Expand Up @@ -126,7 +137,7 @@ describe('evalbuff E2E', () => {
await runLearnMode({
mode: 'learn',
repoPath: repoDir,
agentCommand: 'echo',
agentId: 'base2-free-evals',
parallelism: 1,
maxCostUsd: 50,
agentTimeoutMs: 10_000,
Expand Down
Loading
Loading