Chat Completions: Extended Thinking

Extended Thinking

Extended thinking exposes the model’s step-by-step reasoning as structured thinking_blocks. Unlike the plain-text reasoning_content field, thinking blocks carry cryptographic signatures — required to continue a reasoning chain across multiple turns.

Provider Support

Provider	Models	Notes
Anthropic	Claude 3.7, Claude 4, Claude 4.5 series	Claude 4.5+ uses adaptive thinking (no fixed budget)
AWS Bedrock	All Claude models on Bedrock	Minimum 1024 budget tokens enforced
Google Gemini / Vertex AI	Gemini 2.5, Gemini 3 series	Thinking on by default for Gemini 2.5 Pro and Flash

Response Fields

Field	Description
`message.reasoning_content`	Plain-text summary of the model’s reasoning
`message.thinking_blocks`	Array of `{type, thinking, signature}` objects

{
  "message": {
    "content": "3^3^3 = 7,625,597,484,987",
    "reasoning_content": "Exponentiation is right-associative: 3^3 = 27, then 3^27...",
    "thinking_blocks": [
      {
        "type": "thinking",
        "thinking": "Exponentiation is right-associative: 3^3 = 27, then 3^27...",
        "signature": "ErUBCkQIARgCIkC..."
      }
    ]
  }
}

Basic Usage

from openai import OpenAI

# Gateway-backed client; substitute a real API key and gateway URL.
client = OpenAI(base_url="{GATEWAY_BASE_URL}", api_key="TFY_API_KEY")

prompt = [{"role": "user", "content": "How to compute 3^3^3?"}]

# reasoning_effort accepts "high", "medium", "low", or "none".
response = client.chat.completions.create(
    messages=prompt,
    model="anthropic-main/claude-opus-4-1-20250805",
    max_tokens=8000,
    reasoning_effort="high",
)

msg = response.choices[0].message

# Final answer
print(msg.content)
# Plain-text reasoning
print(msg.reasoning_content)
# Structured thinking blocks, including their signatures
print(msg.thinking_blocks)

Streaming

Use the helper below to accumulate a stream into content, thinking_blocks, and tool_calls — ready to pass directly into the next turn.

def accumulate_stream(stream):
    """Fold a streamed chat completion into values usable for the next turn.

    Args:
        stream: Iterable of chat-completion chunks. Each chunk exposes
            ``choices``; ``choices[0].delta`` carries ``content``,
            ``tool_calls`` and any provider-specific extras in
            ``model_extra``.

    Returns:
        A ``(content, thinking_blocks, tool_calls)`` tuple:

        * ``content`` - the concatenated answer text (``""`` if none).
        * ``thinking_blocks`` - blocks streamed directly under
          ``thinking_blocks`` if any were seen; otherwise a single
          ``{"type": "thinking", ...}`` block built from the streamed
          ``reasoning_content`` / ``reasoning_signature``; otherwise ``[]``.
        * ``tool_calls`` - one dict per tool-call index, in first-seen
          order, with ``id``, ``name`` and ``arguments`` fragments joined.
    """
    content_parts: list[str] = []
    reasoning_parts: list[str] = []
    reasoning_signature = None
    thinking_blocks_direct = []
    tool_calls_buf: dict[int, dict] = {}

    for chunk in stream:
        # Some chunks carry no choices at all (for example a usage-only
        # chunk); indexing choices[0] on them would raise IndexError.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        if delta is None:
            continue
        extra = delta.model_extra or {}

        if delta.content:
            content_parts.append(delta.content)

        # Anthropic / Bedrock: reasoning text and signature arrive as extras
        if extra.get("reasoning_content"):
            reasoning_parts.append(extra["reasoning_content"])
        if extra.get("reasoning_signature"):
            # Last signature seen wins.
            reasoning_signature = extra["reasoning_signature"]

        # Gemini / Vertex AI: ready-made thinking blocks
        thinking_blocks_direct.extend(extra.get("thinking_blocks") or [])

        # Tool calls — fragments are accumulated per tool-call index
        for tc in (delta.tool_calls or []):
            slot = tool_calls_buf.setdefault(tc.index, {
                "id": "", "type": "function",
                "function": {"name": "", "arguments": ""},
            })
            if tc.id:
                slot["id"] += tc.id
            if tc.function:
                slot["function"]["name"] += tc.function.name or ""
                slot["function"]["arguments"] += tc.function.arguments or ""

    content = "".join(content_parts)
    reasoning_content = "".join(reasoning_parts)

    # Normalize both provider formats into a unified thinking_blocks shape
    if thinking_blocks_direct:
        thinking_blocks = thinking_blocks_direct
    elif reasoning_content or reasoning_signature:
        block = {"type": "thinking", "thinking": reasoning_content}
        # The signature key is omitted entirely when none was streamed.
        if reasoning_signature:
            block["signature"] = reasoning_signature
        thinking_blocks = [block]
    else:
        thinking_blocks = []

    return content, thinking_blocks, list(tool_calls_buf.values())


prompt = [{"role": "user", "content": "How to compute 3^3^3?"}]

# Request a streamed completion with thinking enabled.
stream = client.chat.completions.create(
    stream=True,
    messages=prompt,
    model="anthropic-main/claude-opus-4-1-20250805",
    max_tokens=8000,
    reasoning_effort="high",
)

content, thinking_blocks, tool_calls = accumulate_stream(stream)

# Assistant turn to send back on the next request; tool_calls is added
# only when the stream actually produced any.
assistant_message = {
    "role": "assistant",
    "content": content,
    "thinking_blocks": thinking_blocks,
}
if tool_calls:
    assistant_message["tool_calls"] = tool_calls

Multi-Turn Conversations

Use model_dump(exclude_none=True) on the assistant message — it captures content, tool_calls, and thinking_blocks in one shot, so you don’t need to construct the dict manually.

question = {"role": "user", "content": "What is 3^3^3?"}
follow_up = {
    "role": "user",
    "content": "Now explain why exponentiation is right-associative.",
}

# First turn: ask the question with thinking enabled.
response = client.chat.completions.create(
    messages=[question],
    model="anthropic-main/claude-opus-4-1-20250805",
    max_tokens=8000,
    reasoning_effort="high",
)

# Dump the whole assistant message so thinking_blocks and their signatures
# travel with it unchanged.
first_reply = response.choices[0].message
assistant_message = first_reply.model_dump(exclude_none=True)

# Second turn: replay the conversation, with the dumped message passed back as-is.
conversation = [question, assistant_message, follow_up]
response2 = client.chat.completions.create(
    messages=conversation,
    model="anthropic-main/claude-opus-4-1-20250805",
    max_tokens=8000,
    reasoning_effort="high",
)

Always echo thinking_blocks exactly as returned. Blocks with missing or modified signature fields are rejected by the provider.

Multi-Turn with Tool Calls

When thinking is enabled, Anthropic and Bedrock require the assistant message to include thinking_blocks alongside tool_calls. Use model_dump(exclude_none=True) — it captures both in one step.

import json
from openai import OpenAI

client = OpenAI(api_key="TFY_API_KEY", base_url="{GATEWAY_BASE_URL}")

# One function tool the model may choose to call.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"]
        }
    }
}]

messages = [{"role": "user", "content": "What's the weather in Tokyo?"}]

# Turn 1 — model responds with thinking + (usually) a tool call
response = client.chat.completions.create(
    model="anthropic-main/claude-opus-4-1-20250805",
    messages=messages,
    tools=tools,
    reasoning_effort="high",
    max_tokens=8000
)

msg = response.choices[0].message

# tool_calls is None when the model answers without calling a tool, so
# only run the tool round-trip when there is something to execute.
if msg.tool_calls:
    # Append assistant message — model_dump captures tool_calls + thinking_blocks together
    messages.append(msg.model_dump(exclude_none=True))

    # Execute each tool call and append its result, matched by tool_call_id
    for tool_call in msg.tool_calls:
        args = json.loads(tool_call.function.arguments)
        # Stubbed tool result; a real implementation would look the weather up.
        result = f"Sunny, 24°C in {args['city']}"
        messages.append({
            "role": "tool",
            "tool_call_id": tool_call.id,
            "content": result
        })

    # Turn 2 — model summarizes with full context
    response2 = client.chat.completions.create(
        model="anthropic-main/claude-opus-4-1-20250805",
        messages=messages,
        tools=tools,
        reasoning_effort="high",
        max_tokens=8000
    )

    print(response2.choices[0].message.content)
else:
    # No tool was requested: the first response already holds the answer.
    print(msg.content)

Grounding with Google Search

Google Gemini models support grounding with Google Search, which allows the model to augment its responses with real-time web results. When grounding is enabled, the model can call a search tool during generation to retrieve up-to-date information and incorporate it into the final answer.

from openai import OpenAI

client = OpenAI(base_url="{GATEWAY_BASE_URL}", api_key="TFY_API_KEY")

# The search tool is declared by name only.
google_search_tool = {
    "type": "function",
    "function": {
        "name": "google_search",
    },
}

prompt = [{"role": "user", "content": "what date and time is right now?"}]

response = client.chat.completions.create(
    messages=prompt,
    tools=[google_search_tool],
    # Gemini model name as registered on the gateway
    model="tfy-ai-gemini/gemini-2-5-pro",
)

reply = response.choices[0].message
print(reply)

​Extended Thinking

​Provider Support

​Response Fields

​Basic Usage

​Streaming

​Multi-Turn Conversations

​Multi-Turn with Tool Calls

​Grounding with Google Search

Extended Thinking

Provider Support

Response Fields

Basic Usage

Streaming

Multi-Turn Conversations

Multi-Turn with Tool Calls

Grounding with Google Search