CopilotKit · jpr5 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## [Unreleased]
 
+### Added
+
+- **Extended-thinking request invariants** — aimock now validates Anthropic extended-thinking continuations on the tool-use loop. When extended thinking is enabled, a continuation whose prior assistant turn drops the leading `thinking` block (or its `signature`, or a `redacted_thinking` block's `data`) is treated as a violation: under strict mode it is rejected with the real Anthropic `400` instead of producing a false-green replay; otherwise aimock logs a warning and replay proceeds. Emitted thinking blocks now carry a non-empty placeholder signature so record→replay round-trips stay green across text, content+tool, and tool-only response shapes.
+
 ### Changed
 
 - **Reasoning emission** — replaying a reasoning channel is now gated on the requested model's capability. aimock no longer synthesizes a reasoning channel (chat `reasoning_content` / Responses `reasoning_summary_text` / Anthropic thinking / etc.) for models that would not emit reasoning against the real provider. A new `isReasoningModel` classifier and `resolveReasoningForModel` gate are applied across OpenAI chat + Responses, Anthropic, Ollama, Gemini, Cohere, Bedrock (invoke + Converse), and WebSocket Responses: a non-reasoning model paired with a reasoning fixture has its reasoning suppressed under strict mode, or warns-and-emits otherwise. The `AIMOCK_REASONING_MODELS` and `AIMOCK_NONREASONING_MODELS` env vars override the classifier.

diff --git a/src/__tests__/drift/anthropic.drift.ts b/src/__tests__/drift/anthropic.drift.ts
@@ -212,7 +212,10 @@ describe("Anthropic Claude extended thinking shapes", () => {
     expect(mockBody.content.length).toBe(2);
     expect(mockBody.content[0].type).toBe("thinking");
     expect(mockBody.content[0].thinking).toBe("I need to consider...");
-    expect(mockBody.content[0].signature).toBe("");
+    // Real Anthropic non-streaming returns a non-empty cryptographic signature
+    // on the assembled thinking block (assembled here from the signature_delta).
+    expect(typeof mockBody.content[0].signature).toBe("string");
+    expect(mockBody.content[0].signature.length).toBeGreaterThan(0);
     expect(mockBody.content[1].type).toBe("text");
     expect(mockBody.content[1].text).toBe("Hello!");
 
@@ -261,7 +264,11 @@ describe("Anthropic Claude extended thinking shapes", () => {
       (e) => e.type === "content_block_delta" && e.data?.delta?.type === "signature_delta",
     );
     expect(signatureDeltas.length, "Missing signature_delta event").toBe(1);
-    expect(signatureDeltas[0].data.delta.signature).toBe("");
+    // Real Anthropic delivers the non-empty cryptographic signature here (the
+    // `content_block_start` carried ""); an SDK assembles the block's signature
+    // from this delta.
+    expect(typeof signatureDeltas[0].data.delta.signature).toBe("string");
+    expect(signatureDeltas[0].data.delta.signature.length).toBeGreaterThan(0);
 
     // Verify text block follows thinking block
     const textBlockStart = mockEvents.find(

diff --git a/src/__tests__/helpers/mock-res.ts b/src/__tests__/helpers/mock-res.ts
@@ -53,6 +53,7 @@ export function createDefaults(overrides: Partial<HandlerDefaults> = {}): Handle
   return {
     latency: 0,
     chunkSize: 100,
+    replaySpeed: 1.0,
     logger: new Logger("silent"),
     ...overrides,
   };

diff --git a/src/__tests__/reasoning-capability-anthropic.test.ts b/src/__tests__/reasoning-capability-anthropic.test.ts
@@ -46,7 +46,20 @@ const contentWithToolsReasoningFixture: Fixture = {
   },
 };
 
-const allFixtures: Fixture[] = [reasoningFixture, plainFixture, contentWithToolsReasoningFixture];
+const toolOnlyReasoningFixture: Fixture = {
+  match: { userMessage: "toolonly" },
+  response: {
+    reasoning: REASONING_TEXT,
+    toolCalls: [{ name: "get_weather", arguments: '{"city":"NYC"}' }],
+  },
+};
+
+const allFixtures: Fixture[] = [
+  reasoningFixture,
+  plainFixture,
+  contentWithToolsReasoningFixture,
+  toolOnlyReasoningFixture,
+];
 
 /** Mock ServerResponse that captures everything written to the body. */
 function createCapturingRes(): { res: http.ServerResponse; getBody: () => string } {
@@ -493,3 +506,125 @@ describe("Anthropic /v1/messages reasoning gating — content+toolCalls, capable
     expect(error).not.toHaveBeenCalled();
   });
 });
+
+// ─── tool-only branch → gating on the pure-tool-call path ────────────────────
+//
+// aimock#253 newly synthesized a leading thinking block on the pure-tool-call
+// dispatch (no text content) so a replayed tool-only turn under extended
+// thinking does not self-trip `missing_thinking_first`. aimock#254's capability
+// gate never covered that path (it didn't emit reasoning at #254's time). These
+// cases lock in that the gate now applies there too: a non-reasoning model under
+// strict SUPPRESSES the thinking block, a capable model EMITS it — while the
+// tool_use block always survives. (The strict cases fail without the
+// `resolveReasoningForModel` routing: raw `response.reasoning` would emit the thinking block unconditionally.)
+
+describe("Anthropic /v1/messages reasoning gating — tool-only, strict ON suppresses", () => {
+  it("non-streaming: suppresses thinking, keeps tool_use", async () => {
+    const logger = new Logger("warn");
+    const warn = vi.spyOn(logger, "warn");
+    const error = vi.spyOn(logger, "error");
+
+    const body = await run(
+      {
+        model: "claude-3-5-sonnet-20241022",
+        max_tokens: 1024,
+        messages: [{ role: "user", content: "toolonly" }],
+      },
+      makeDefaults(logger, true),
+    );
+
+    const blocks = contentBlocks(body);
+    expect(blocks.filter((b) => b.type === "thinking")).toHaveLength(0);
+
+    const toolUse = blocks.find((b) => b.type === "tool_use");
+    expect(toolUse?.name).toBe("get_weather");
+    expect(toolUse?.input).toEqual({ city: "NYC" });
+
+    expect(error).toHaveBeenCalledTimes(1);
+    expect(error.mock.calls[0]?.join(" ")).toContain("claude-3-5-sonnet-20241022");
+    expect(warn).not.toHaveBeenCalled();
+  });
+
+  it("streaming: emits no thinking deltas, keeps tool_use block", async () => {
+    const logger = new Logger("warn");
+    const warn = vi.spyOn(logger, "warn");
+    const error = vi.spyOn(logger, "error");
+
+    const body = await run(
+      {
+        model: "claude-3-5-sonnet-20241022",
+        max_tokens: 1024,
+        stream: true,
+        messages: [{ role: "user", content: "toolonly" }],
+      },
+      makeDefaults(logger, true),
+    );
+
+    expect(streamThinkingDeltas(body)).toHaveLength(0);
+
+    const started = streamStartedBlocks(body);
+    expect(started.some((b) => b.type === "thinking")).toBe(false);
+    expect(started.map((b) => b.type)).toEqual(["tool_use"]);
+    expect(started.find((b) => b.type === "tool_use")?.name).toBe("get_weather");
+
+    expect(error).toHaveBeenCalledTimes(1);
+    expect(warn).not.toHaveBeenCalled();
+  });
+});
+
+describe("Anthropic /v1/messages reasoning gating — tool-only, capable model emits", () => {
+  it("non-streaming: emits thinking, keeps tool_use in order", async () => {
+    const logger = new Logger("warn");
+    const warn = vi.spyOn(logger, "warn");
+    const error = vi.spyOn(logger, "error");
+
+    const body = await run(
+      {
+        model: "claude-opus-4",
+        max_tokens: 1024,
+        messages: [{ role: "user", content: "toolonly" }],
+      },
+      makeDefaults(logger),
+    );
+
+    const blocks = contentBlocks(body);
+    const thinking = blocks.filter((b) => b.type === "thinking");
+    expect(thinking).toHaveLength(1);
+    expect(thinking[0].thinking).toBe(REASONING_TEXT);
+
+    const toolUse = blocks.find((b) => b.type === "tool_use");
+    expect(toolUse?.name).toBe("get_weather");
+    expect(toolUse?.input).toEqual({ city: "NYC" });
+
+    // Order: thinking → tool_use.
+    expect(blocks.map((b) => b.type)).toEqual(["thinking", "tool_use"]);
+
+    expect(warn).not.toHaveBeenCalled();
+    expect(error).not.toHaveBeenCalled();
+  });
+
+  it("streaming: emits thinking deltas, keeps tool_use block in order", async () => {
+    const logger = new Logger("warn");
+    const warn = vi.spyOn(logger, "warn");
+    const error = vi.spyOn(logger, "error");
+
+    const body = await run(
+      {
+        model: "claude-opus-4",
+        max_tokens: 1024,
+        stream: true,
+        messages: [{ role: "user", content: "toolonly" }],
+      },
+      makeDefaults(logger),
+    );
+
+    expect(streamThinkingDeltas(body).join("")).toBe(REASONING_TEXT);
+
+    const started = streamStartedBlocks(body);
+    expect(started.map((b) => b.type)).toEqual(["thinking", "tool_use"]);
+    expect(started.find((b) => b.type === "tool_use")?.name).toBe("get_weather");
+
+    expect(warn).not.toHaveBeenCalled();
+    expect(error).not.toHaveBeenCalled();
+  });
+});