From ec71b8157609a14897b0838ee75f548de041c420 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Tue, 5 May 2026 14:33:29 -0500 Subject: [PATCH 1/2] Handle multiple audio channels --- .../components/assist_pipeline/pipeline.py | 1 + .../components/esphome/assist_satellite.py | 36 +++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/homeassistant/components/assist_pipeline/pipeline.py b/homeassistant/components/assist_pipeline/pipeline.py index 61bf588c973d0c..a9b21d6fd85711 100644 --- a/homeassistant/components/assist_pipeline/pipeline.py +++ b/homeassistant/components/assist_pipeline/pipeline.py @@ -932,6 +932,7 @@ async def speech_to_text( { "engine": engine, "metadata": asdict(metadata), + "audio_processing": asdict(self.stt_provider.audio_processing), }, ) ) diff --git a/homeassistant/components/esphome/assist_satellite.py b/homeassistant/components/esphome/assist_satellite.py index 8f8b2b3fca934e..acfc607f2b9b73 100644 --- a/homeassistant/components/esphome/assist_satellite.py +++ b/homeassistant/components/esphome/assist_satellite.py @@ -146,6 +146,8 @@ def __init__(self, entry: ESPHomeConfigEntry) -> None: ) self._active_pipeline_index = 0 + self._active_audio_channel = 0 + self._has_multi_channel_audio = False def _get_entity_id(self, suffix: str) -> str | None: """Return the entity id for pipeline select, etc.""" @@ -291,6 +293,9 @@ async def async_added_to_hass(self) -> None: assist_satellite.AssistSatelliteEntityFeature.START_CONVERSATION ) + if feature_flags & VoiceAssistantFeature.MULTI_CHANNEL_AUDIO: + self._has_multi_channel_audio = True + # Update wake word select when config is updated self.async_on_remove( self._entry_data.async_register_assist_satellite_set_wake_words_callback( @@ -315,6 +320,18 @@ def on_pipeline_event(self, event: PipelineEvent) -> None: data_to_send: dict[str, Any] = {} if event_type == VoiceAssistantEventType.VOICE_ASSISTANT_STT_START: + if ( + self._has_multi_channel_audio + and event.data + and (audio_processing := event.data.get("audio_processing")) + ): + # Settings come from stt SpeechAudioProcessing + if (audio_processing.get("prefers_auto_gain_enabled") is False) and ( + audio_processing.get("prefers_noise_reduction_enabled") is False + ): + # Use non-enhanced audio + self._active_audio_channel = 1 + self._entry_data.async_set_assist_pipeline_state(True) elif event_type == VoiceAssistantEventType.VOICE_ASSISTANT_STT_END: assert event.data is not None @@ -533,6 +550,10 @@ async def handle_pipeline_start( # Try next wake word select maybe_pipeline_index += 1 + # Default to audio channel 0 (enhanced) + # May be changed when STT_START event arrives. + self._active_audio_channel = 0 + _LOGGER.debug( "Running pipeline %s from %s to %s", self._active_pipeline_index + 1, @@ -555,9 +576,20 @@ async def handle_pipeline_start( return port - async def handle_audio(self, data: bytes) -> None: + async def handle_audio(self, data: bytes, data2: bytes | None = None) -> None: """Handle incoming audio chunk from API.""" - self._audio_queue.put_nowait(data) + # Default to enhanced audio (channel 0) + active_data = data + + if ( + self._has_multi_channel_audio + and (data2 is not None) + and (self._active_audio_channel == 1) + ): + # Non-enhanced audio (channel 1) + active_data = data2 + + self._audio_queue.put_nowait(active_data) async def handle_pipeline_stop(self, abort: bool) -> None: """Handle request for pipeline to stop.""" From d344454c4ba3614cdeb3641facae4abbcb3cc2f5 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Tue, 5 May 2026 16:52:54 -0500 Subject: [PATCH 2/2] Update snapshots for tests --- .../assist_pipeline/snapshots/test_init.ambr | 25 ++++ .../snapshots/test_websocket.ambr | 40 ++++++ .../esphome/test_assist_satellite.py | 135 ++++++++++++++++++ 3 files changed, 200 insertions(+) diff --git a/tests/components/assist_pipeline/snapshots/test_init.ambr b/tests/components/assist_pipeline/snapshots/test_init.ambr index 5e77b7e9291407..acc7f708bb1ff1 100644 --- a/tests/components/assist_pipeline/snapshots/test_init.ambr +++ b/tests/components/assist_pipeline/snapshots/test_init.ambr @@ -17,6 +17,11 @@ }), dict({ 'data': dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': , @@ -119,6 +124,11 @@ }), dict({ 'data': dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': , @@ -221,6 +231,11 @@ }), dict({ 'data': dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'test', 'metadata': dict({ 'bit_rate': , @@ -347,6 +362,11 @@ }), dict({ 'data': dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': , @@ -449,6 +469,11 @@ }), dict({ 'data': dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': , diff --git a/tests/components/assist_pipeline/snapshots/test_websocket.ambr b/tests/components/assist_pipeline/snapshots/test_websocket.ambr index 4d5ae8e28e72f1..41b33e2dc3160f 100644 --- a/tests/components/assist_pipeline/snapshots/test_websocket.ambr +++ b/tests/components/assist_pipeline/snapshots/test_websocket.ambr @@ -18,6 +18,11 @@ # --- # name: test_audio_pipeline.1 dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': 16, @@ -112,6 +117,11 @@ # --- # name: test_audio_pipeline_debug.1 dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': 16, @@ -218,6 +228,11 @@ # --- # name: test_audio_pipeline_with_enhancements.1 dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': 16, @@ -334,6 +349,11 @@ # --- # name: test_audio_pipeline_with_wake_word_no_timeout.3 dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': 16, @@ -461,6 +481,11 @@ # --- # name: test_device_capture.1 dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': 16, @@ -488,6 +513,11 @@ # --- # name: test_device_capture_override.1 dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': 16, @@ -537,6 +567,11 @@ # --- # name: test_device_capture_queue_full.1 dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': 16, @@ -761,6 +796,11 @@ # --- # name: test_stt_stream_failed.1 dict({ + 'audio_processing': dict({ + 'prefers_auto_gain_enabled': True, + 'prefers_noise_reduction_enabled': True, + 'requires_external_vad': True, + }), 'engine': 'stt.mock_stt', 'metadata': dict({ 'bit_rate': 16, diff --git a/tests/components/esphome/test_assist_satellite.py b/tests/components/esphome/test_assist_satellite.py index 65703c0b72dd21..c51d5719d742dd 100644 --- a/tests/components/esphome/test_assist_satellite.py +++ b/tests/components/esphome/test_assist_satellite.py @@ -2290,3 +2290,138 @@ async def test_custom_wake_words( # Check non-existent wake word req = await http_client.get("/api/esphome/wake_words/wrong_wake_word.json") assert req.status == HTTPStatus.NOT_FOUND + + +async def test_multichannel_audio( + hass: HomeAssistant, + mock_client: APIClient, + mock_esphome_device: MockESPHomeDeviceType, +) -> None: + """Test that stt-start event can switch audio channels.""" + mock_device = await mock_esphome_device( + mock_client=mock_client, + device_info={ + "voice_assistant_feature_flags": VoiceAssistantFeature.VOICE_ASSISTANT + | VoiceAssistantFeature.SPEAKER + | VoiceAssistantFeature.API_AUDIO + | VoiceAssistantFeature.MULTI_CHANNEL_AUDIO + }, + ) + await hass.async_block_till_done() + + satellite = get_satellite_entity(hass, mock_device.device_info.mac_address) + assert satellite is not None + + pipeline_finished = asyncio.Event() + + async def async_pipeline_from_audio_stream(*args, **kwargs): + event_callback = kwargs["event_callback"] + + # STT + event_callback( + PipelineEvent( + type=PipelineEventType.STT_START, + data={ + "engine": "test-stt-engine", + "metadata": {}, + "audio_processing": { + # Request non-enhanced audio (channel 1) + "prefers_auto_gain_enabled": False, + "prefers_noise_reduction_enabled": False, + }, + }, + ) + ) + + stt_stream = kwargs["stt_stream"] + + chunks = [chunk async for chunk in stt_stream] + + # Verify correct channel + assert chunks == [b"channel 1"] + + pipeline_finished.set() + + with ( + patch( + "homeassistant.components.assist_satellite.entity.async_pipeline_from_audio_stream", + new=async_pipeline_from_audio_stream, + ), + ): + async with asyncio.timeout(1): + await satellite.handle_pipeline_start( + conversation_id="", + flags=VoiceAssistantCommandFlag(0), # stt + audio_settings=VoiceAssistantAudioSettings(), + wake_word_phrase=None, + ) + await satellite.handle_audio(b"channel 0", b"channel 1") + await satellite.handle_pipeline_stop(abort=False) + await pipeline_finished.wait() + + +async def test_multichannel_audio_fallback_channel_0( + hass: HomeAssistant, + mock_client: APIClient, + mock_esphome_device: MockESPHomeDeviceType, +) -> None: + """Test that channel 0 is used if multi-channel audio isn't supported.""" + mock_device = await mock_esphome_device( + mock_client=mock_client, + device_info={ + "voice_assistant_feature_flags": VoiceAssistantFeature.VOICE_ASSISTANT + | VoiceAssistantFeature.SPEAKER + | VoiceAssistantFeature.API_AUDIO + }, + ) + await hass.async_block_till_done() + + satellite = get_satellite_entity(hass, mock_device.device_info.mac_address) + assert satellite is not None + + pipeline_finished = asyncio.Event() + + async def async_pipeline_from_audio_stream(*args, **kwargs): + event_callback = kwargs["event_callback"] + + # STT + event_callback( + PipelineEvent( + type=PipelineEventType.STT_START, + data={ + "engine": "test-stt-engine", + "metadata": {}, + "audio_processing": { + # Request non-enhanced audio (channel 1) + "prefers_auto_gain_enabled": False, + "prefers_noise_reduction_enabled": False, + }, + }, + ) + ) + + stt_stream = kwargs["stt_stream"] + + chunks = [chunk async for chunk in stt_stream] + + # Non-enhanced audio (channel 1) was requested, but it isn't supported. + assert chunks == [b"channel 0"] + + pipeline_finished.set() + + with ( + patch( + "homeassistant.components.assist_satellite.entity.async_pipeline_from_audio_stream", + new=async_pipeline_from_audio_stream, + ), + ): + async with asyncio.timeout(1): + await satellite.handle_pipeline_start( + conversation_id="", + flags=VoiceAssistantCommandFlag(0), # stt + audio_settings=VoiceAssistantAudioSettings(), + wake_word_phrase=None, + ) + await satellite.handle_audio(b"channel 0", b"channel 1") + await satellite.handle_pipeline_stop(abort=False) + await pipeline_finished.wait()