> ## Documentation Index
> Fetch the complete documentation index at: https://www.truefoundry.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Live API: Tool Calling

> Tool and function calling during Live API realtime sessions

## Tool calling

You can define tools (functions) that the model can invoke during a live session. The model will return a tool call when it decides to use a function, and you send the result back to continue the conversation. See the [Gemini Live API tools documentation](https://ai.google.dev/gemini-api/docs/live-api/tools) and [OpenAI Realtime API function calling documentation](https://platform.openai.com/docs/guides/realtime#function-calls) for more details.

You can adapt this pattern to call [TrueFoundry MCP tools](/ai-gateway/mcp/mcp-overview) as well — replace the dummy function with an MCP tool invocation via the [MCP Gateway SDK](/ai-gateway/mcp-gateway-sdk-usage).

```python theme={"dark"}
"""
Gemini Live API - Tool Calling with Audio
Ref: https://ai.google.dev/gemini-api/docs/live-api/tools

Requires Python 3.11+
pip install google-genai pyaudio
"""

import asyncio
import pyaudio
from google import genai
from google.genai import types

FORMAT = pyaudio.paInt16
CHANNELS = 1
SEND_SAMPLE_RATE = 16000
RECEIVE_SAMPLE_RATE = 24000
CHUNK_SIZE = 1024

API_KEY = "your-tfy-api-key"
MODEL = "gemini-live-2.5-flash"  # actual model id
BASE_URL = "{GATEWAY_BASE_URL}/live/{geminiProviderAccountName}"

client = genai.Client(
    http_options={
        "base_url": BASE_URL,
        "headers": {
            "Authorization": f"Bearer {API_KEY}",
        }
    },
    api_key=API_KEY,
)


# --- Dummy tool implementation (replace with your own logic) ---
def get_weather(location: str) -> dict:
    """Returns mock weather data for the given location."""
    return {
        "location": location,
        "temperature": "15°C",
        "condition": "Foggy",
        "humidity": "85%",
    }


TOOL_HANDLERS = {
    "get_weather": lambda args: get_weather(args["location"]),
}


def run_tool(name: str, args: dict | None) -> dict:
    """Run the tool registered under `name` and return its result.

    Unknown tools and handler failures are returned as an {"error": ...}
    payload instead of being raised, so that a bad tool call is reported back
    to the model rather than tearing down the whole live session.
    """
    handler = TOOL_HANDLERS.get(name)
    if handler is None:
        return {"error": f"Unknown tool: {name}"}
    try:
        return handler(args or {})
    except Exception as exc:  # e.g. the model omitted a required argument
        return {"error": f"{type(exc).__name__}: {exc}"}


# Define function declarations
get_weather_declaration = {
    "name": "get_weather",
    "description": "Gets the current weather for a given location.",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {
                "type": "string",
                "description": "The city or place to get weather for",
            }
        },
        "required": ["location"],
    },
}

tools = [{"function_declarations": [get_weather_declaration]}]

CONFIG = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Zephyr")
        )
    ),
    # Enable transcription to get text versions of user and model speech.
    # Remove these lines if transcription is not needed.
    input_audio_transcription=types.AudioTranscriptionConfig(),
    output_audio_transcription=types.AudioTranscriptionConfig(),
    tools=tools,
)

pya = pyaudio.PyAudio()


async def main():
    """Stream microphone audio to the model, play its replies, and serve tool calls."""
    try:
        async with client.aio.live.connect(model=MODEL, config=CONFIG) as session:
            # Try saying: "What's the weather in San Francisco?"
            print("Connected! Try saying: 'What's the weather in San Francisco?' to trigger tool calling.")

            # Record audio from microphone and send to session
            mic_info = pya.get_default_input_device_info()
            mic_stream = pya.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=SEND_SAMPLE_RATE,
                input=True,
                input_device_index=mic_info["index"],
                frames_per_buffer=CHUNK_SIZE,
            )

            # Speaker output for receiving audio
            speaker_stream = pya.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=RECEIVE_SAMPLE_RATE,
                output=True,
            )

            audio_in_queue = asyncio.Queue()
            current_speaker = None  # Track who is currently speaking

            async def send_audio():
                while True:
                    # The blocking microphone read runs in a worker thread so
                    # the event loop stays free for receiving and playback.
                    data = await asyncio.to_thread(
                        mic_stream.read, CHUNK_SIZE, exception_on_overflow=False
                    )
                    await session.send_realtime_input(audio={"data": data, "mime_type": "audio/pcm"})

            async def receive_audio():
                nonlocal current_speaker
                while True:
                    turn = session.receive()
                    was_interrupted = False
                    async for response in turn:
                        # Handle tool calls
                        if response.tool_call:
                            function_responses = []
                            for fc in response.tool_call.function_calls:
                                print(f"\n[Tool Call] {fc.name}({fc.args})")
                                result = run_tool(fc.name, fc.args)
                                print(f"[Tool Result] {result}")
                                function_responses.append(
                                    types.FunctionResponse(
                                        id=fc.id,
                                        name=fc.name,
                                        response={"result": result},
                                    )
                                )
                            await session.send_tool_response(
                                function_responses=function_responses
                            )
                            continue

                        if response.server_content and response.server_content.model_turn:
                            for part in response.server_content.model_turn.parts:
                                if part.inline_data:
                                    audio_in_queue.put_nowait(part.inline_data.data)
                                if part.text and not part.thought:
                                    print(part.text, end="", flush=True)

                        # Print transcriptions if enabled above
                        if hasattr(response, "server_content") and response.server_content:
                            sc = response.server_content
                            if hasattr(sc, "input_transcription") and sc.input_transcription and sc.input_transcription.text:
                                if current_speaker != "user":
                                    if current_speaker is not None:
                                        print()  # end previous line
                                    print("[You]: ", end="", flush=True)
                                    current_speaker = "user"
                                print(sc.input_transcription.text, end="", flush=True)
                            if hasattr(sc, "output_transcription") and sc.output_transcription and sc.output_transcription.text:
                                if current_speaker != "model":
                                    if current_speaker is not None:
                                        print()  # end previous line
                                    print("[Model]: ", end="", flush=True)
                                    current_speaker = "model"
                                print(sc.output_transcription.text, end="", flush=True)
                            if hasattr(sc, "interrupted") and sc.interrupted:
                                was_interrupted = True

                    # Only clear the audio queue on interruption.
                    # On normal turn completion, let play_audio finish playing
                    # all enqueued chunks to avoid losing audio.
                    if was_interrupted:
                        while not audio_in_queue.empty():
                            audio_in_queue.get_nowait()

            async def play_audio():
                while True:
                    data = await audio_in_queue.get()
                    await asyncio.to_thread(speaker_stream.write, data)

            print("Start speaking! (Ctrl+C to stop)\n")
            async with asyncio.TaskGroup() as tg:
                tg.create_task(send_audio())
                tg.create_task(receive_audio())
                tg.create_task(play_audio())
    except Exception as e:
        # asyncio.TaskGroup wraps task failures in an ExceptionGroup whose own
        # message hides the cause, so print each underlying error instead.
        causes = e.exceptions if isinstance(e, ExceptionGroup) else (e,)
        for cause in causes:
            print(f"Error: {cause}")
    finally:
        pya.terminate()


if __name__ == "__main__":
    asyncio.run(main())
```

```python theme={"dark"}
"""
Gemini Live API (Vertex AI) - Tool Calling with Audio
Ref: https://ai.google.dev/gemini-api/docs/live-api/tools

Requires Python 3.11+
pip install google-genai pyaudio google-auth
"""

import asyncio
import pyaudio
import google.auth.credentials
from google import genai
from google.genai import types

FORMAT = pyaudio.paInt16
CHANNELS = 1
SEND_SAMPLE_RATE = 16000
RECEIVE_SAMPLE_RATE = 24000
CHUNK_SIZE = 1024

API_KEY = "your-tfy-api-key"
MODEL = "gemini-live-2.5-flash"  # actual model id
BASE_URL = "{GATEWAY_BASE_URL}/live/{vertexProviderAccountName}"


class _GatewayCredentials(google.auth.credentials.Credentials):
    """Bypasses local ADC; the gateway handles Vertex AI authentication."""

    def __init__(self, token):
        super().__init__()
        self.token = token

    def refresh(self, request):
        # Nothing to refresh: the static gateway token is used as-is.
        pass

    @property
    def valid(self):
        return True


client = genai.Client(
    http_options={
        "base_url": BASE_URL,
        "headers": {"Authorization": f"Bearer {API_KEY}"},
    },
    vertexai=True,
    project="your-gcp-project",
    location="us-central1",
    credentials=_GatewayCredentials(API_KEY),
)


# --- Dummy tool implementation (replace with your own logic) ---
def get_weather(location: str) -> dict:
    """Returns mock weather data for the given location."""
    return {
        "location": location,
        "temperature": "15°C",
        "condition": "Foggy",
        "humidity": "85%",
    }


TOOL_HANDLERS = {
    "get_weather": lambda args: get_weather(args["location"]),
}


def run_tool(name: str, args: dict | None) -> dict:
    """Run the tool registered under `name` and return its result.

    Unknown tools and handler failures are returned as an {"error": ...}
    payload instead of being raised, so that a bad tool call is reported back
    to the model rather than tearing down the whole live session.
    """
    handler = TOOL_HANDLERS.get(name)
    if handler is None:
        return {"error": f"Unknown tool: {name}"}
    try:
        return handler(args or {})
    except Exception as exc:  # e.g. the model omitted a required argument
        return {"error": f"{type(exc).__name__}: {exc}"}


# Define function declarations
get_weather_declaration = {
    "name": "get_weather",
    "description": "Gets the current weather for a given location.",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {
                "type": "string",
                "description": "The city or place to get weather for",
            }
        },
        "required": ["location"],
    },
}

tools = [{"function_declarations": [get_weather_declaration]}]

CONFIG = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Zephyr")
        )
    ),
    # Enable transcription to get text versions of user and model speech.
    # Remove these lines if transcription is not needed.
    input_audio_transcription=types.AudioTranscriptionConfig(),
    output_audio_transcription=types.AudioTranscriptionConfig(),
    tools=tools,
)

pya = pyaudio.PyAudio()


async def main():
    """Stream microphone audio to the model, play its replies, and serve tool calls."""
    try:
        async with client.aio.live.connect(model=MODEL, config=CONFIG) as session:
            # Try saying: "What's the weather in San Francisco?"
            print("Connected! Try saying: 'What's the weather in San Francisco?' to trigger tool calling.")

            # Record audio from microphone and send to session
            mic_info = pya.get_default_input_device_info()
            mic_stream = pya.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=SEND_SAMPLE_RATE,
                input=True,
                input_device_index=mic_info["index"],
                frames_per_buffer=CHUNK_SIZE,
            )

            # Speaker output for receiving audio
            speaker_stream = pya.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=RECEIVE_SAMPLE_RATE,
                output=True,
            )

            audio_in_queue = asyncio.Queue()
            current_speaker = None  # Track who is currently speaking

            async def send_audio():
                while True:
                    # The blocking microphone read runs in a worker thread so
                    # the event loop stays free for receiving and playback.
                    data = await asyncio.to_thread(
                        mic_stream.read, CHUNK_SIZE, exception_on_overflow=False
                    )
                    await session.send_realtime_input(audio={"data": data, "mime_type": "audio/pcm"})

            async def receive_audio():
                nonlocal current_speaker
                while True:
                    turn = session.receive()
                    was_interrupted = False
                    async for response in turn:
                        # Handle tool calls
                        if response.tool_call:
                            function_responses = []
                            for fc in response.tool_call.function_calls:
                                print(f"\n[Tool Call] {fc.name}({fc.args})")
                                result = run_tool(fc.name, fc.args)
                                print(f"[Tool Result] {result}")
                                function_responses.append(
                                    types.FunctionResponse(
                                        id=fc.id,
                                        name=fc.name,
                                        response={"result": result},
                                    )
                                )
                            await session.send_tool_response(
                                function_responses=function_responses
                            )
                            continue

                        if response.server_content and response.server_content.model_turn:
                            for part in response.server_content.model_turn.parts:
                                if part.inline_data:
                                    audio_in_queue.put_nowait(part.inline_data.data)
                                if part.text and not part.thought:
                                    print(part.text, end="", flush=True)

                        # Print transcriptions if enabled above
                        if hasattr(response, "server_content") and response.server_content:
                            sc = response.server_content
                            if hasattr(sc, "input_transcription") and sc.input_transcription and sc.input_transcription.text:
                                if current_speaker != "user":
                                    if current_speaker is not None:
                                        print()  # end previous line
                                    print("[You]: ", end="", flush=True)
                                    current_speaker = "user"
                                print(sc.input_transcription.text, end="", flush=True)
                            if hasattr(sc, "output_transcription") and sc.output_transcription and sc.output_transcription.text:
                                if current_speaker != "model":
                                    if current_speaker is not None:
                                        print()  # end previous line
                                    print("[Model]: ", end="", flush=True)
                                    current_speaker = "model"
                                print(sc.output_transcription.text, end="", flush=True)
                            if hasattr(sc, "interrupted") and sc.interrupted:
                                was_interrupted = True

                    # Only clear the audio queue on interruption.
                    # On normal turn completion, let play_audio finish playing
                    # all enqueued chunks to avoid losing audio.
                    if was_interrupted:
                        while not audio_in_queue.empty():
                            audio_in_queue.get_nowait()

            async def play_audio():
                while True:
                    data = await audio_in_queue.get()
                    await asyncio.to_thread(speaker_stream.write, data)

            print("Start speaking! (Ctrl+C to stop)\n")
            async with asyncio.TaskGroup() as tg:
                tg.create_task(send_audio())
                tg.create_task(receive_audio())
                tg.create_task(play_audio())
    except Exception as e:
        # asyncio.TaskGroup wraps task failures in an ExceptionGroup whose own
        # message hides the cause, so print each underlying error instead.
        causes = e.exceptions if isinstance(e, ExceptionGroup) else (e,)
        for cause in causes:
            print(f"Error: {cause}")
    finally:
        pya.terminate()


if __name__ == "__main__":
    asyncio.run(main())
```

```python theme={"dark"}
"""
OpenAI Realtime API - Tool Calling with Audio
Ref: https://platform.openai.com/docs/guides/realtime#function-calls

Requires Python 3.11+
pip install "openai[realtime]" numpy sounddevice
"""

import json
import base64
import asyncio
import threading
import numpy as np
import sounddevice as sd
from openai import AsyncOpenAI
from openai.resources.realtime.realtime import AsyncRealtimeConnection

SAMPLE_RATE = 24000
CHANNELS = 1
CHUNK_LENGTH_S = 0.05
# How long to wait before re-checking the microphone when a full chunk is not
# buffered yet; well below the 20 ms chunk length, so no audio is delayed.
MIC_POLL_INTERVAL_S = 0.005

API_KEY = "your-tfy-api-key"
MODEL = "gpt-4o-realtime-preview"  # actual model id

client = AsyncOpenAI(
    api_key=API_KEY,
    websocket_base_url="wss://{GATEWAY_HOST}/live/{openaiProviderAccountName}",
)


# --- Dummy tool implementation (replace with your own logic) ---
def get_weather(location: str) -> dict:
    """Returns mock weather data for the given location."""
    return {
        "location": location,
        "temperature": "15°C",
        "condition": "Foggy",
        "humidity": "85%",
    }


TOOL_HANDLERS = {
    "get_weather": lambda args: get_weather(args["location"]),
}


def run_tool(name: str, args: dict | None) -> dict:
    """Run the tool registered under `name` and return its result.

    Unknown tools and handler failures are returned as an {"error": ...}
    payload instead of being raised, so that a bad tool call is reported back
    to the model rather than tearing down the whole realtime session.
    """
    handler = TOOL_HANDLERS.get(name)
    if handler is None:
        return {"error": f"Unknown tool: {name}"}
    try:
        return handler(args or {})
    except Exception as exc:  # e.g. the model omitted a required argument
        return {"error": f"{type(exc).__name__}: {exc}"}


class AudioPlayerAsync:
    """Plays queued 16-bit PCM audio through a sounddevice output stream."""

    def __init__(self):
        self.queue = []
        # Guards `queue`, which is shared with the stream callback.
        self.lock = threading.Lock()
        self.stream = sd.OutputStream(
            callback=self._callback,
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype=np.int16,
            blocksize=int(CHUNK_LENGTH_S * SAMPLE_RATE),
        )
        self.playing = False

    def _callback(self, outdata, frames, time, status):
        """Fill `outdata` with `frames` samples from the queue, padding with silence."""
        with self.lock:
            data = np.empty(0, dtype=np.int16)

            while len(data) < frames and self.queue:
                item = self.queue.pop(0)
                needed = frames - len(data)
                data = np.concatenate((data, item[:needed]))
                # Put back whatever did not fit into this output block.
                if len(item) > needed:
                    self.queue.insert(0, item[needed:])

            if len(data) < frames:
                data = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16)))

        outdata[:] = data.reshape(-1, 1)

    def add_data(self, data: bytes):
        """Queue raw PCM bytes for playback, starting the stream if it is idle."""
        with self.lock:
            self.queue.append(np.frombuffer(data, dtype=np.int16))
            if not self.playing:
                self.playing = True
                self.stream.start()

    def stop(self):
        """Stop playback and drop any audio still queued."""
        self.playing = False
        self.stream.stop()
        with self.lock:
            self.queue = []

    def terminate(self):
        """Close the output stream."""
        self.stream.close()


async def send_mic_audio(connection: AsyncRealtimeConnection):
    """Read 20 ms microphone chunks and append them to the input audio buffer."""
    read_size = int(SAMPLE_RATE * 0.02)
    stream = sd.InputStream(channels=CHANNELS, samplerate=SAMPLE_RATE, dtype="int16")
    stream.start()
    try:
        while True:
            if stream.read_available < read_size:
                # Sleep briefly instead of spinning on sleep(0), which would
                # keep a CPU core fully busy while waiting for audio.
                await asyncio.sleep(MIC_POLL_INTERVAL_S)
                continue
            data, _ = stream.read(read_size)
            await connection.input_audio_buffer.append(
                audio=base64.b64encode(data).decode("utf-8"),
            )
            await asyncio.sleep(0)
    except KeyboardInterrupt:
        pass
    finally:
        stream.stop()
        stream.close()


async def main():
    """Stream microphone audio to the model, play its replies, and serve tool calls."""
    player = AudioPlayerAsync()
    try:
        async with client.realtime.connect(model=MODEL) as connection:
            # Try saying: "What's the weather in San Francisco?"
            print("Connected! Try saying: 'What's the weather in San Francisco?' to trigger tool calling.")

            await connection.session.update(session={
                "type": "realtime",
                "output_modalities": ["audio"],
                "tools": [
                    {
                        "type": "function",
                        "name": "get_weather",
                        "description": "Gets the current weather for a given location.",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "location": {
                                    "type": "string",
                                    "description": "The city or place to get weather for",
                                }
                            },
                            "required": ["location"],
                        },
                    }
                ],
                "audio": {
                    "input": {
                        "turn_detection": {"type": "server_vad"},
                        # Enable input audio transcription (user speech to text).
                        # Remove this if input transcription is not needed.
                        "transcription": {"model": "gpt-4o-transcribe"},
                    },
                    "output": {
                        "voice": "alloy"
                    }
                }
            })

            # call_id -> {"name": ..., "arguments": <JSON text streamed so far>}
            pending_tool_calls: dict[str, dict] = {}

            async def receive_events():
                async for event in connection:
                    if event.type == "response.output_audio.delta":
                        player.add_data(base64.b64decode(event.delta))

                    # Output transcript (model speech to text), enabled by default
                    elif event.type == "response.output_audio_transcript.delta":
                        print(event.delta, end="", flush=True)

                    elif event.type == "response.output_audio_transcript.done":
                        print()

                    # Input transcript (user speech to text), requires transcription config above
                    elif event.type == "conversation.item.input_audio_transcription.completed":
                        print(f"\n[You]: {event.transcript}")

                    elif event.type == "input_audio_buffer.speech_started":
                        player.stop()

                    # Tool call handling
                    elif event.type == "response.output_item.added":
                        item = event.item
                        if item.type == "function_call":
                            pending_tool_calls[item.call_id] = {"name": item.name, "arguments": ""}

                    elif event.type == "response.function_call_arguments.delta":
                        if event.call_id in pending_tool_calls:
                            pending_tool_calls[event.call_id]["arguments"] += event.delta

                    elif event.type == "response.function_call_arguments.done":
                        call_id = event.call_id
                        if call_id in pending_tool_calls:
                            tool = pending_tool_calls.pop(call_id)
                            # No deltas may have arrived for a call without
                            # arguments; treat that as an empty JSON object.
                            raw_args = tool["arguments"] or "{}"
                            try:
                                args = json.loads(raw_args)
                            except json.JSONDecodeError as exc:
                                print(f"\n[Tool Call] {tool['name']}({raw_args})")
                                result = {"error": f"Invalid tool arguments: {exc}"}
                            else:
                                print(f"\n[Tool Call] {tool['name']}({args})")
                                result = run_tool(tool["name"], args)
                            print(f"[Tool Result] {result}")

                            await connection.conversation.item.create(item={
                                "type": "function_call_output",
                                "call_id": call_id,
                                "output": json.dumps(result),
                            })
                            # Ask the model to continue now that it has the tool output.
                            await connection.response.create()

                    elif event.type == "error":
                        print(f"\n[ERROR] {event}")

            print("Start speaking! (Ctrl+C to stop)\n")
            async with asyncio.TaskGroup() as tg:
                tg.create_task(send_mic_audio(connection))
                tg.create_task(receive_events())
    except Exception as e:
        # asyncio.TaskGroup wraps task failures in an ExceptionGroup whose own
        # message hides the cause, so print each underlying error instead.
        causes = e.exceptions if isinstance(e, ExceptionGroup) else (e,)
        for cause in causes:
            print(f"Error: {cause}")
    finally:
        player.terminate()


if __name__ == "__main__":
    asyncio.run(main())
```

```python theme={"dark"}
"""
OpenAI Realtime API via Azure AI Foundry - Tool Calling with Audio
Ref: https://platform.openai.com/docs/guides/realtime#function-calls

Requires Python 3.11+
pip install "openai[realtime]" numpy sounddevice
"""

import json
import base64
import asyncio
import threading
import numpy as np
import sounddevice as sd
from openai import AsyncOpenAI
from openai.resources.realtime.realtime import AsyncRealtimeConnection

SAMPLE_RATE = 24000
CHANNELS = 1
CHUNK_LENGTH_S = 0.05
# How long to wait before re-checking the microphone when a full chunk is not
# buffered yet; well below the 20 ms chunk length, so no audio is delayed.
MIC_POLL_INTERVAL_S = 0.005

API_KEY = "your-tfy-api-key"
MODEL = "gpt-4o-realtime-preview"  # actual model id

client = AsyncOpenAI(
    api_key=API_KEY,
    websocket_base_url="wss://{GATEWAY_HOST}/live/{azureFoundryProviderAccountName}",
)


# --- Dummy tool implementation (replace with your own logic) ---
def get_weather(location: str) -> dict:
    """Returns mock weather data for the given location."""
    return {
        "location": location,
        "temperature": "15°C",
        "condition": "Foggy",
        "humidity": "85%",
    }


TOOL_HANDLERS = {
    "get_weather": lambda args: get_weather(args["location"]),
}


def run_tool(name: str, args: dict | None) -> dict:
    """Run the tool registered under `name` and return its result.

    Unknown tools and handler failures are returned as an {"error": ...}
    payload instead of being raised, so that a bad tool call is reported back
    to the model rather than tearing down the whole realtime session.
    """
    handler = TOOL_HANDLERS.get(name)
    if handler is None:
        return {"error": f"Unknown tool: {name}"}
    try:
        return handler(args or {})
    except Exception as exc:  # e.g. the model omitted a required argument
        return {"error": f"{type(exc).__name__}: {exc}"}


class AudioPlayerAsync:
    """Plays queued 16-bit PCM audio through a sounddevice output stream."""

    def __init__(self):
        self.queue = []
        # Guards `queue`, which is shared with the stream callback.
        self.lock = threading.Lock()
        self.stream = sd.OutputStream(
            callback=self._callback,
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype=np.int16,
            blocksize=int(CHUNK_LENGTH_S * SAMPLE_RATE),
        )
        self.playing = False

    def _callback(self, outdata, frames, time, status):
        """Fill `outdata` with `frames` samples from the queue, padding with silence."""
        with self.lock:
            data = np.empty(0, dtype=np.int16)

            while len(data) < frames and self.queue:
                item = self.queue.pop(0)
                needed = frames - len(data)
                data = np.concatenate((data, item[:needed]))
                # Put back whatever did not fit into this output block.
                if len(item) > needed:
                    self.queue.insert(0, item[needed:])

            if len(data) < frames:
                data = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16)))

        outdata[:] = data.reshape(-1, 1)

    def add_data(self, data: bytes):
        """Queue raw PCM bytes for playback, starting the stream if it is idle."""
        with self.lock:
            self.queue.append(np.frombuffer(data, dtype=np.int16))
            if not self.playing:
                self.playing = True
                self.stream.start()

    def stop(self):
        """Stop playback and drop any audio still queued."""
        self.playing = False
        self.stream.stop()
        with self.lock:
            self.queue = []

    def terminate(self):
        """Close the output stream."""
        self.stream.close()


async def send_mic_audio(connection: AsyncRealtimeConnection):
    """Read 20 ms microphone chunks and append them to the input audio buffer."""
    read_size = int(SAMPLE_RATE * 0.02)
    stream = sd.InputStream(channels=CHANNELS, samplerate=SAMPLE_RATE, dtype="int16")
    stream.start()
    try:
        while True:
            if stream.read_available < read_size:
                # Sleep briefly instead of spinning on sleep(0), which would
                # keep a CPU core fully busy while waiting for audio.
                await asyncio.sleep(MIC_POLL_INTERVAL_S)
                continue
            data, _ = stream.read(read_size)
            await connection.input_audio_buffer.append(
                audio=base64.b64encode(data).decode("utf-8"),
            )
            await asyncio.sleep(0)
    except KeyboardInterrupt:
        pass
    finally:
        stream.stop()
        stream.close()


async def main():
    """Stream microphone audio to the model, play its replies, and serve tool calls."""
    player = AudioPlayerAsync()
    try:
        async with client.realtime.connect(model=MODEL) as connection:
            # Try saying: "What's the weather in San Francisco?"
            print("Connected! Try saying: 'What's the weather in San Francisco?' to trigger tool calling.")

            await connection.session.update(session={
                "type": "realtime",
                "output_modalities": ["audio"],
                "tools": [
                    {
                        "type": "function",
                        "name": "get_weather",
                        "description": "Gets the current weather for a given location.",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "location": {
                                    "type": "string",
                                    "description": "The city or place to get weather for",
                                }
                            },
                            "required": ["location"],
                        },
                    }
                ],
                "audio": {
                    "input": {
                        "turn_detection": {"type": "server_vad"},
                        # Enable input audio transcription (user speech to text).
                        # Remove this if input transcription is not needed.
                        "transcription": {"model": "gpt-4o-transcribe"},
                    },
                    "output": {
                        "voice": "alloy"
                    }
                }
            })

            # call_id -> {"name": ..., "arguments": <JSON text streamed so far>}
            pending_tool_calls: dict[str, dict] = {}

            async def receive_events():
                async for event in connection:
                    if event.type == "response.output_audio.delta":
                        player.add_data(base64.b64decode(event.delta))

                    # Output transcript (model speech to text), enabled by default
                    elif event.type == "response.output_audio_transcript.delta":
                        print(event.delta, end="", flush=True)

                    elif event.type == "response.output_audio_transcript.done":
                        print()

                    # Input transcript (user speech to text), requires transcription config above
                    elif event.type == "conversation.item.input_audio_transcription.completed":
                        print(f"\n[You]: {event.transcript}")

                    elif event.type == "input_audio_buffer.speech_started":
                        player.stop()

                    # Tool call handling
                    elif event.type == "response.output_item.added":
                        item = event.item
                        if item.type == "function_call":
                            pending_tool_calls[item.call_id] = {"name": item.name, "arguments": ""}

                    elif event.type == "response.function_call_arguments.delta":
                        if event.call_id in pending_tool_calls:
                            pending_tool_calls[event.call_id]["arguments"] += event.delta

                    elif event.type == "response.function_call_arguments.done":
                        call_id = event.call_id
                        if call_id in pending_tool_calls:
                            tool = pending_tool_calls.pop(call_id)
                            # No deltas may have arrived for a call without
                            # arguments; treat that as an empty JSON object.
                            raw_args = tool["arguments"] or "{}"
                            try:
                                args = json.loads(raw_args)
                            except json.JSONDecodeError as exc:
                                print(f"\n[Tool Call] {tool['name']}({raw_args})")
                                result = {"error": f"Invalid tool arguments: {exc}"}
                            else:
                                print(f"\n[Tool Call] {tool['name']}({args})")
                                result = run_tool(tool["name"], args)
                            print(f"[Tool Result] {result}")

                            await connection.conversation.item.create(item={
                                "type": "function_call_output",
                                "call_id": call_id,
                                "output": json.dumps(result),
                            })
                            # Ask the model to continue now that it has the tool output.
                            await connection.response.create()

                    elif event.type == "error":
                        print(f"\n[ERROR] {event}")

            print("Start speaking! (Ctrl+C to stop)\n")
            async with asyncio.TaskGroup() as tg:
                tg.create_task(send_mic_audio(connection))
                tg.create_task(receive_events())
    except Exception as e:
        # asyncio.TaskGroup wraps task failures in an ExceptionGroup whose own
        # message hides the cause, so print each underlying error instead.
        causes = e.exceptions if isinstance(e, ExceptionGroup) else (e,)
        for cause in causes:
            print(f"Error: {cause}")
    finally:
        player.terminate()


if __name__ == "__main__":
    asyncio.run(main())
```