Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## [Unreleased]

### Added

- **Extended-thinking request invariants** — aimock now validates Anthropic extended-thinking continuations on the tool-use loop. When extended thinking is enabled, a continuation whose prior assistant turn drops the leading `thinking` block (or its `signature`, or a `redacted_thinking` block's `data`) violates these invariants: under strict mode it is rejected with the real Anthropic `400` instead of producing a false-green replay; otherwise the violation warns and replay proceeds. Emitted thinking blocks now carry a non-empty placeholder signature so record→replay round-trips stay green across text, content+tool, and tool-only response shapes.

### Changed

- **Reasoning emission** — replaying a reasoning channel is now gated on the requested model's capability. aimock no longer synthesizes a reasoning channel (chat `reasoning_content` / Responses `reasoning_summary_text` / Anthropic thinking / etc.) for models that would not emit reasoning against the real provider. A new `isReasoningModel` classifier and `resolveReasoningForModel` gate are applied across OpenAI chat + Responses, Anthropic, Ollama, Gemini, Cohere, Bedrock (invoke + Converse), and WebSocket Responses: a non-reasoning model paired with a reasoning fixture has its reasoning suppressed under strict mode, or warns-and-emits otherwise. The `AIMOCK_REASONING_MODELS` and `AIMOCK_NONREASONING_MODELS` env vars override the classifier.
Expand Down
11 changes: 9 additions & 2 deletions src/__tests__/drift/anthropic.drift.ts
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,10 @@ describe("Anthropic Claude extended thinking shapes", () => {
expect(mockBody.content.length).toBe(2);
expect(mockBody.content[0].type).toBe("thinking");
expect(mockBody.content[0].thinking).toBe("I need to consider...");
expect(mockBody.content[0].signature).toBe("");
// Real Anthropic non-streaming returns a non-empty cryptographic signature
// on the assembled thinking block (assembled here from the signature_delta).
expect(typeof mockBody.content[0].signature).toBe("string");
expect(mockBody.content[0].signature.length).toBeGreaterThan(0);
expect(mockBody.content[1].type).toBe("text");
expect(mockBody.content[1].text).toBe("Hello!");

Expand Down Expand Up @@ -261,7 +264,11 @@ describe("Anthropic Claude extended thinking shapes", () => {
(e) => e.type === "content_block_delta" && e.data?.delta?.type === "signature_delta",
);
expect(signatureDeltas.length, "Missing signature_delta event").toBe(1);
expect(signatureDeltas[0].data.delta.signature).toBe("");
// Real Anthropic delivers the non-empty cryptographic signature here (the
// `content_block_start` carried ""); an SDK assembles the block's signature
// from this delta.
expect(typeof signatureDeltas[0].data.delta.signature).toBe("string");
expect(signatureDeltas[0].data.delta.signature.length).toBeGreaterThan(0);

// Verify text block follows thinking block
const textBlockStart = mockEvents.find(
Expand Down
1 change: 1 addition & 0 deletions src/__tests__/helpers/mock-res.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ export function createDefaults(overrides: Partial<HandlerDefaults> = {}): Handle
return {
latency: 0,
chunkSize: 100,
replaySpeed: 1.0,
logger: new Logger("silent"),
...overrides,
};
Expand Down
137 changes: 136 additions & 1 deletion src/__tests__/reasoning-capability-anthropic.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,20 @@ const contentWithToolsReasoningFixture: Fixture = {
},
};

const allFixtures: Fixture[] = [reasoningFixture, plainFixture, contentWithToolsReasoningFixture];
/**
 * Fixture matched by the user message "toolonly".
 *
 * Its response carries reasoning text plus a single `get_weather` tool call
 * and no text content, i.e. the pure-tool-call response shape.
 */
const toolOnlyReasoningFixture: Fixture = {
match: { userMessage: "toolonly" },
response: {
reasoning: REASONING_TEXT,
// Tool-call arguments are supplied as a JSON-encoded string.
toolCalls: [{ name: "get_weather", arguments: '{"city":"NYC"}' }],
},
};

/**
 * Every fixture declared for this test file, gathered into one list.
 * NOTE(review): presumably handed to the handler under test by the `run`
 * helper (not visible here) — confirm whether element order affects matching.
 */
const allFixtures: Fixture[] = [
reasoningFixture,
plainFixture,
contentWithToolsReasoningFixture,
toolOnlyReasoningFixture,
];

/** Mock ServerResponse that captures everything written to the body. */
function createCapturingRes(): { res: http.ServerResponse; getBody: () => string } {
Expand Down Expand Up @@ -493,3 +506,125 @@ describe("Anthropic /v1/messages reasoning gating — content+toolCalls, capable
expect(error).not.toHaveBeenCalled();
});
});

// ─── tool-only branch → gating on the pure-tool-call path ────────────────────
//
// aimock#253 newly synthesized a leading thinking block on the pure-tool-call
// dispatch (no text content) so a replayed tool-only turn under extended
// thinking does not self-trip `missing_thinking_first`. aimock#254's capability
// gate never covered that path (it didn't emit reasoning at #254's time). These
// cases lock in that the gate now applies there too: a non-reasoning model under
// strict SUPPRESSES the thinking block, a capable model EMITS it — while the
// tool_use block always survives. (Red without the `resolveReasoningForModel`
// routing: raw `response.reasoning` would emit the thinking block unconditionally.)

describe("Anthropic /v1/messages reasoning gating — tool-only, strict ON suppresses", () => {
  // Model paired with the reasoning-bearing "toolonly" fixture in this suite;
  // the expectations below require its thinking to be dropped and exactly one
  // error to be logged.
  const MODEL = "claude-3-5-sonnet-20241022";

  /** Builds a warn-level logger and installs spies on its `warn` and `error` methods. */
  const spiedLogger = () => {
    const logger = new Logger("warn");
    const warnSpy = vi.spyOn(logger, "warn");
    const errorSpy = vi.spyOn(logger, "error");
    return { logger, warnSpy, errorSpy };
  };

  it("non-streaming: suppresses thinking, keeps tool_use", async () => {
    const { logger, warnSpy, errorSpy } = spiedLogger();

    // Second `makeDefaults` argument is presumably the strict-mode flag
    // (per the suite title) — helper is defined outside this block.
    const responseBody = await run(
      { model: MODEL, max_tokens: 1024, messages: [{ role: "user", content: "toolonly" }] },
      makeDefaults(logger, true),
    );

    // No thinking block may be present in the response content.
    const blocks = contentBlocks(responseBody);
    const thinkingBlocks = blocks.filter((block) => block.type === "thinking");
    expect(thinkingBlocks).toHaveLength(0);

    // The tool call itself must come through untouched.
    const toolUseBlock = blocks.find((block) => block.type === "tool_use");
    expect(toolUseBlock?.name).toBe("get_weather");
    expect(toolUseBlock?.input).toEqual({ city: "NYC" });

    // Exactly one error is logged and it names the offending model; no warnings.
    expect(errorSpy).toHaveBeenCalledTimes(1);
    const firstErrorCall = errorSpy.mock.calls[0];
    expect(firstErrorCall?.join(" ")).toContain(MODEL);
    expect(warnSpy).not.toHaveBeenCalled();
  });

  it("streaming: emits no thinking deltas, keeps tool_use block", async () => {
    const { logger, warnSpy, errorSpy } = spiedLogger();

    const responseBody = await run(
      {
        model: MODEL,
        max_tokens: 1024,
        stream: true,
        messages: [{ role: "user", content: "toolonly" }],
      },
      makeDefaults(logger, true),
    );

    // The stream must carry no thinking deltas at all.
    const thinkingDeltas = streamThinkingDeltas(responseBody);
    expect(thinkingDeltas).toHaveLength(0);

    // Only the tool_use block is ever started.
    const startedBlocks = streamStartedBlocks(responseBody);
    const sawThinkingStart = startedBlocks.some((block) => block.type === "thinking");
    expect(sawThinkingStart).toBe(false);
    const startedTypes = startedBlocks.map((block) => block.type);
    expect(startedTypes).toEqual(["tool_use"]);
    const startedToolUse = startedBlocks.find((block) => block.type === "tool_use");
    expect(startedToolUse?.name).toBe("get_weather");

    expect(errorSpy).toHaveBeenCalledTimes(1);
    expect(warnSpy).not.toHaveBeenCalled();
  });
});

describe("Anthropic /v1/messages reasoning gating — tool-only, capable model emits", () => {
  // Model paired with the reasoning-bearing "toolonly" fixture in this suite;
  // the expectations below require its thinking to be emitted with nothing logged.
  const MODEL = "claude-opus-4";

  /** Builds a warn-level logger and installs spies on its `warn` and `error` methods. */
  const spiedLogger = () => {
    const logger = new Logger("warn");
    const warnSpy = vi.spyOn(logger, "warn");
    const errorSpy = vi.spyOn(logger, "error");
    return { logger, warnSpy, errorSpy };
  };

  it("non-streaming: emits thinking, keeps tool_use in order", async () => {
    const { logger, warnSpy, errorSpy } = spiedLogger();

    // `makeDefaults` is called without its second argument here
    // (presumably leaving strict mode at its default — helper not visible).
    const responseBody = await run(
      { model: MODEL, max_tokens: 1024, messages: [{ role: "user", content: "toolonly" }] },
      makeDefaults(logger),
    );

    // Exactly one thinking block, carrying the fixture's reasoning text.
    const blocks = contentBlocks(responseBody);
    const thinkingBlocks = blocks.filter((block) => block.type === "thinking");
    expect(thinkingBlocks).toHaveLength(1);
    expect(thinkingBlocks[0].thinking).toBe(REASONING_TEXT);

    // The tool call must come through untouched.
    const toolUseBlock = blocks.find((block) => block.type === "tool_use");
    expect(toolUseBlock?.name).toBe("get_weather");
    expect(toolUseBlock?.input).toEqual({ city: "NYC" });

    // Order: thinking → tool_use.
    const blockTypes = blocks.map((block) => block.type);
    expect(blockTypes).toEqual(["thinking", "tool_use"]);

    // Nothing should be logged at warn or error level.
    expect(warnSpy).not.toHaveBeenCalled();
    expect(errorSpy).not.toHaveBeenCalled();
  });

  it("streaming: emits thinking deltas, keeps tool_use block in order", async () => {
    const { logger, warnSpy, errorSpy } = spiedLogger();

    const responseBody = await run(
      {
        model: MODEL,
        max_tokens: 1024,
        stream: true,
        messages: [{ role: "user", content: "toolonly" }],
      },
      makeDefaults(logger),
    );

    // Concatenated thinking deltas must reproduce the fixture's reasoning text.
    const streamedReasoning = streamThinkingDeltas(responseBody).join("");
    expect(streamedReasoning).toBe(REASONING_TEXT);

    // Blocks start in order: thinking first, then tool_use.
    const startedBlocks = streamStartedBlocks(responseBody);
    const startedTypes = startedBlocks.map((block) => block.type);
    expect(startedTypes).toEqual(["thinking", "tool_use"]);
    const startedToolUse = startedBlocks.find((block) => block.type === "tool_use");
    expect(startedToolUse?.name).toBe("get_weather");

    expect(warnSpy).not.toHaveBeenCalled();
    expect(errorSpy).not.toHaveBeenCalled();
  });
});
Loading
Loading