diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index 5b4a81d4f..6b92181fd 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -295,6 +295,51 @@ You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the mult
 
 Full example: [examples/offline_inference/audio_language.py](../../examples/offline_inference/audio_language.py)
 
+#### Chunking Long Audio for Transcription
+
+Speech-to-text models like Whisper have a maximum audio length they can process (typically 30 seconds). For longer audio files, vLLM provides a utility that splits audio into chunks at quiet points to avoid cutting through speech.
+
+```python
+import librosa
+from vllm import LLM, SamplingParams
+from vllm.multimodal.audio import split_audio
+
+# Load a long audio file
+audio, sr = librosa.load("long_audio.wav", sr=16000)
+
+# Split into chunks at low-energy (quiet) regions
+chunks = split_audio(
+    audio_data=audio,
+    sample_rate=sr,
+    max_clip_duration_s=30.0,  # Maximum chunk length in seconds
+    overlap_duration_s=1.0,  # Search window for finding quiet split points
+    min_energy_window_size=1600,  # Window size for energy calculation (100 ms at 16 kHz)
+)
+
+# Initialize the Whisper model
+llm = LLM(model="openai/whisper-large-v3-turbo")
+sampling_params = SamplingParams(temperature=0, max_tokens=256)
+
+# Transcribe each chunk
+transcriptions = []
+for chunk in chunks:
+    outputs = llm.generate({
+        "prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
+        "multi_modal_data": {"audio": (chunk, sr)},
+    }, sampling_params)
+    transcriptions.append(outputs[0].outputs[0].text)
+
+# Combine the results
+full_transcription = " ".join(transcriptions)
+```
+
+The `split_audio` function:
+
+- Splits audio at quiet points to avoid cutting through speech
+- Uses RMS energy to find low-amplitude regions within the overlap window
+- Preserves all audio samples (no data loss)
+- Supports any sample rate
+
 #### Automatic Audio Channel Normalization
 
 vLLM automatically normalizes audio channels for models that require specific audio formats. When loading audio with libraries like `torchaudio`, stereo files return shape `[channels, time]`, but many audio models (particularly Whisper-based models) expect mono audio with shape `[time]`.
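+
+The effect on array shapes can be pictured with a minimal numpy sketch. This is illustrative only: averaging the channels is one common downmix strategy, and the exact strategy vLLM applies internally is not specified here.
+
+```python
+import numpy as np
+
+# torchaudio-style stereo audio: shape [channels, time]
+stereo = np.random.randn(2, 16000).astype(np.float32)
+
+# What the normalization amounts to: mono audio with shape [time],
+# e.g. obtained by averaging across the channel axis
+mono = stereo.mean(axis=0)
+assert mono.shape == (16000,)
+```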
diff --git a/tests/multimodal/test_audio.py b/tests/multimodal/test_audio.py
index dd3d7e27e..3cc6bcadb 100644
--- a/tests/multimodal/test_audio.py
+++ b/tests/multimodal/test_audio.py
@@ -16,6 +16,7 @@ from vllm.multimodal.audio import (
     normalize_audio,
     resample_audio_librosa,
     resample_audio_scipy,
+    split_audio,
 )
 
 
@@ -584,3 +585,186 @@ class TestAudioPipelineE2E:
         assert audio_output.ndim == 1
         assert audio_output.shape == (10,)
         np.testing.assert_array_almost_equal(audio_output, np.zeros(10))
+
+
+# ============================================================
+# Tests for Audio Chunking Utilities
+# ============================================================
+
+
+class TestAudioChunking:
+    """Tests for split_audio and find_split_point utilities in vllm.multimodal.audio."""
+
+    def test_split_audio_short_clip(self):
+        """Audio shorter than max_clip_duration_s should not be split."""
+
+        # 10 seconds of audio at 16kHz
+        audio = np.linspace(-1.0, 1.0, 160000, dtype=np.float32)
+
+        chunks = split_audio(
+            audio_data=audio,
+            sample_rate=16000,
+            max_clip_duration_s=30.0,
+            overlap_duration_s=1.0,
+            min_energy_window_size=1600,
+        )
+
+        assert len(chunks) == 1
+        np.testing.assert_array_equal(chunks[0], audio)
+
+    def test_split_audio_exact_length(self):
+        """Audio exactly at max_clip_duration_s should not be split."""
+
+        # Exactly 30 seconds at 16kHz
+        audio = np.linspace(-1.0, 1.0, 480000, dtype=np.float32)
+
+        chunks = split_audio(
+            audio_data=audio,
+            sample_rate=16000,
+            max_clip_duration_s=30.0,
+            overlap_duration_s=1.0,
+            min_energy_window_size=1600,
+        )
+
+        assert len(chunks) == 1
+        np.testing.assert_array_equal(chunks[0], audio)
+
+    def test_split_audio_long_clip(self):
+        """Long audio should be split into multiple chunks."""
+
+        # 65 seconds of audio at 16kHz
+        audio = np.linspace(-1.0, 1.0, 1040000, dtype=np.float32)
+
+        chunks = split_audio(
+            audio_data=audio,
+            sample_rate=16000,
+            max_clip_duration_s=30.0,
+            overlap_duration_s=1.0,
+            min_energy_window_size=1600,
+        )
+
+        assert len(chunks) > 1
+        # First sample preserved
+        assert chunks[0][0] == audio[0]
+        # Last sample preserved
+        assert chunks[-1][-1] == audio[-1]
+
+    def test_split_audio_chunks_have_correct_length(self):
+        """Each chunk (except the last) should be approximately max_clip_duration_s."""
+
+        # 65 seconds of audio at 16kHz
+        audio = np.linspace(-1.0, 1.0, 1040000, dtype=np.float32)
+
+        chunks = split_audio(
+            audio_data=audio,
+            sample_rate=16000,
+            max_clip_duration_s=30.0,
+            overlap_duration_s=1.0,
+            min_energy_window_size=1600,
+        )
+
+        max_samples = int(30.0 * 16000)
+        overlap_samples = int(1.0 * 16000)
+
+        for chunk in chunks[:-1]:
+            assert chunk.shape[0] >= max_samples - overlap_samples
+            assert chunk.shape[0] <= max_samples
+
+    def test_find_split_point_finds_quiet_region(self):
+        """find_split_point should identify low-energy regions."""
+        from vllm.multimodal.audio import find_split_point
+
+        # Create audio with a quiet section in the middle
+        segment = np.ones(32000, dtype=np.float32)
+        # Insert a quiet region at samples 16000-17600 (100ms)
+        segment[16000:17600] = 0.01
+
+        split_idx = find_split_point(
+            wav=segment,
+            start_idx=0,
+            end_idx=32000,
+            min_energy_window=1600,
+        )
+
+        # Split should be in or near the quiet region
+        assert 16000 <= split_idx <= 17600
+
+    def test_find_split_point_handles_uniform_audio(self):
+        """find_split_point should handle uniform-energy audio gracefully."""
+        from vllm.multimodal.audio import find_split_point
+
+        segment = np.ones(32000, dtype=np.float32) * 0.5
+
+        split_idx = find_split_point(
+            wav=segment,
+            start_idx=0,
+            end_idx=32000,
+            min_energy_window=1600,
+        )
+
+        assert 0 <= split_idx <= 32000
+
+    def test_find_split_point_silence(self):
+        """find_split_point should prefer the quietest scanned window."""
+        from vllm.multimodal.audio import find_split_point
+
+        # Deterministic signal: constant energy everywhere except silence.
+        segment = np.ones(32000, dtype=np.float32)
+        # Complete silence at 20000-21600.
+        segment[20000:21600] = 0.0
+
+        split_idx = find_split_point(
+            wav=segment,
+            start_idx=16000,
+            end_idx=28000,
+            min_energy_window=1600,
+        )
+
+        # The implementation evaluates non-overlapping 1600-sample windows
+        # from start_idx (16000, 17600, 19200, ...). The windows starting at
+        # 19200 and 20800 each cover half of the silent region and tie for
+        # minimum energy; the first of them wins.
+        assert split_idx == 19200
+
+    def test_split_audio_preserves_boundaries(self):
+        """Verify first and last samples are preserved when chunking."""
+
+        audio = np.arange(1120000, dtype=np.float32)  # 70s at 16kHz
+
+        chunks = split_audio(
+            audio_data=audio,
+            sample_rate=16000,
+            max_clip_duration_s=30.0,
+            overlap_duration_s=1.0,
+            min_energy_window_size=1600,
+        )
+
+        assert chunks[0][0] == audio[0]
+        assert chunks[-1][-1] == audio[-1]
+
+    def test_split_audio_with_different_sample_rates(self):
+        """Test chunking works with different sample rates."""
+
+        # 40 seconds at 8kHz
+        audio_8k = np.linspace(-1.0, 1.0, 320000, dtype=np.float32)
+
+        chunks_8k = split_audio(
+            audio_data=audio_8k,
+            sample_rate=8000,
+            max_clip_duration_s=30.0,
+            overlap_duration_s=1.0,
+            min_energy_window_size=800,
+        )
+
+        assert len(chunks_8k) >= 2
+
+        # 40 seconds at 48kHz
+        audio_48k = np.linspace(-1.0, 1.0, 1920000, dtype=np.float32)
+
+        chunks_48k = split_audio(
+            audio_data=audio_48k,
+            sample_rate=48000,
+            max_clip_duration_s=30.0,
+            overlap_duration_s=1.0,
+            min_energy_window_size=4800,
+        )
+
+        assert len(chunks_48k) >= 2
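+
+    def test_split_audio_concatenation_is_lossless(self):
+        """Chunks should exactly partition the input signal.
+
+        Concatenating the chunks must reproduce the original array, which
+        backs the "no data loss" claim in the docs: split points only
+        partition the signal, they never drop or duplicate samples.
+        """
+
+        audio = np.arange(1040000, dtype=np.float32)  # 65s at 16kHz
+
+        chunks = split_audio(
+            audio_data=audio,
+            sample_rate=16000,
+            max_clip_duration_s=30.0,
+            overlap_duration_s=1.0,
+            min_energy_window_size=1600,
+        )
+
+        np.testing.assert_array_equal(np.concatenate(chunks), audio)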
diff --git a/vllm/config/speech_to_text.py b/vllm/config/speech_to_text.py
index 0233d3657..e0d72eb20 100644
--- a/vllm/config/speech_to_text.py
+++ b/vllm/config/speech_to_text.py
@@ -33,4 +33,7 @@ class SpeechToTextConfig:
 
     @property
     def allow_audio_chunking(self) -> bool:
-        return self.min_energy_split_window_size is not None
+        return (
+            self.min_energy_split_window_size is not None
+            and self.max_audio_clip_s is not None
+        )
diff --git a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
index 134a9640a..780b96c6a 100644
--- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
@@ -45,6 +45,7 @@ from vllm.model_executor.models import (
     SupportsTranscription,
     supports_transcription,
 )
+from vllm.multimodal.audio import split_audio
 from vllm.outputs import RequestOutput
 from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt
 from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt
@@ -323,11 +324,24 @@ class OpenAISpeechToText(OpenAIServing):
             self.asr_config.allow_audio_chunking
             and duration > self.asr_config.max_audio_clip_s
         )
-        chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
+
+        if not do_split_audio:
+            chunks = [y]
+        else:
+            assert self.asr_config.max_audio_clip_s is not None
+            assert self.asr_config.min_energy_split_window_size is not None
+            chunks = split_audio(
+                audio_data=y,
+                sample_rate=int(sr),
+                max_clip_duration_s=self.asr_config.max_audio_clip_s,
+                overlap_duration_s=self.asr_config.overlap_chunk_second,
+                min_energy_window_size=self.asr_config.min_energy_split_window_size,
+            )
 
         if language is None and getattr(
             self.model_cls, "supports_explicit_language_detection", False
         ):
+            # Auto-detect language from the first chunk.
             language = await self._detect_language(
                 chunks[0], f"{request_id}-lang_detect"
             )
@@ -754,55 +768,3 @@ class OpenAISpeechToText(OpenAIServing):
             yield f"data: {data}\n\n"
         # Send the final done message after all response.n are finished
         yield "data: [DONE]\n\n"
-
-    def _split_audio(
-        self, audio_data: np.ndarray, sample_rate: int
-    ) -> list[np.ndarray]:
-        assert self.asr_config.max_audio_clip_s is not None, (
-            f"{self.asr_config.max_audio_clip_s=} cannot be None to"
-            " split audio into chunks."
-        )
-        chunk_size = sample_rate * self.asr_config.max_audio_clip_s
-        overlap_size = sample_rate * self.asr_config.overlap_chunk_second
-        chunks = []
-        i = 0
-        while i < audio_data.shape[-1]:
-            if i + chunk_size >= audio_data.shape[-1]:
-                # handle last chunk
-                chunks.append(audio_data[..., i:])
-                break
-
-            # Find the best split point in the overlap region
-            search_start = i + chunk_size - overlap_size
-            search_end = min(i + chunk_size, audio_data.shape[-1])
-            split_point = self._find_split_point(audio_data, search_start, search_end)
-
-            # Extract chunk up to the split point
-            chunks.append(audio_data[..., i:split_point])
-            i = split_point
-        return chunks
-
-    def _find_split_point(self, wav: np.ndarray, start_idx: int, end_idx: int) -> int:
-        """Find the best point to split audio by
-        looking for silence or low amplitude.
-        Args:
-            wav: Audio tensor [1, T]
-            start_idx: Start index of search region
-            end_idx: End index of search region
-        Returns:
-            Index of best splitting point
-        """
-        segment = wav[start_idx:end_idx]
-
-        # Calculate RMS energy in small windows
-        min_energy = math.inf
-        quietest_idx = 0
-        min_energy_window = self.asr_config.min_energy_split_window_size
-        assert min_energy_window is not None
-        for i in range(0, len(segment) - min_energy_window, min_energy_window):
-            window = segment[i : i + min_energy_window]
-            energy = (window**2).mean() ** 0.5
-            if energy < min_energy:
-                quietest_idx = i + start_idx
-                min_energy = energy
-        return quietest_idx
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index cccf7d1a6..28f066d11 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -216,3 +216,121 @@ class AudioResampler:
                 f"Invalid resampling method: {self.method}. "
                 "Supported methods are 'librosa' and 'scipy'."
             )
+
+
+# ============================================================
+# Audio Chunking / Splitting
+# ============================================================
+
+
+def split_audio(
+    audio_data: np.ndarray,
+    sample_rate: int,
+    max_clip_duration_s: float,
+    overlap_duration_s: float,
+    min_energy_window_size: int,
+) -> list[np.ndarray]:
+    """Split audio into chunks with intelligently chosen split points.
+
+    Splits long audio into smaller chunks at low-energy regions to minimize
+    cutting through speech. A search window at the end of each chunk is
+    scanned for a quiet moment to split at.
+
+    Args:
+        audio_data: Audio array to split. Can be 1D (mono) or multi-dimensional.
+            Splits along the last dimension (time axis).
+        sample_rate: Sample rate of the audio in Hz.
+        max_clip_duration_s: Maximum duration of each chunk in seconds.
+        overlap_duration_s: Duration in seconds of the search window at the
+            end of each chunk within which a quiet split point is chosen.
+            The chunks themselves do not overlap.
+        min_energy_window_size: Window size in samples for finding low-energy regions.
+
+    Returns:
+        List of audio chunks. Each chunk is a numpy array with the same shape
+        as the input except for the last (time) dimension.
+
+    Example:
+        >>> audio = np.random.randn(1040000)  # 65 seconds at 16kHz
+        >>> chunks = split_audio(
+        ...     audio_data=audio,
+        ...     sample_rate=16000,
+        ...     max_clip_duration_s=30.0,
+        ...     overlap_duration_s=1.0,
+        ...     min_energy_window_size=1600,
+        ... )
+        >>> len(chunks)
+        3
+    """
+    chunk_size = int(sample_rate * max_clip_duration_s)
+    overlap_size = int(sample_rate * overlap_duration_s)
+    chunks = []
+    i = 0
+
+    while i < audio_data.shape[-1]:
+        if i + chunk_size >= audio_data.shape[-1]:
+            # Handle the last chunk - take everything remaining
+            chunks.append(audio_data[..., i:])
+            break
+
+        # Find the best split point in the search window
+        search_start = i + chunk_size - overlap_size
+        search_end = min(i + chunk_size, audio_data.shape[-1])
+        split_point = find_split_point(
+            audio_data, search_start, search_end, min_energy_window_size
+        )
+
+        # Extract the chunk up to the split point
+        chunks.append(audio_data[..., i:split_point])
+        i = split_point
+
+    return chunks
+
+
+def find_split_point(
+    wav: np.ndarray,
+    start_idx: int,
+    end_idx: int,
+    min_energy_window: int,
+) -> int:
+    """Find the best point to split audio by looking for silence or low amplitude.
+
+    Searches for the quietest region within the given range by computing RMS
+    energy over consecutive, non-overlapping windows.
+
+    Args:
+        wav: Audio array. Can be 1D or multi-dimensional.
+        start_idx: Start index of search region (inclusive).
+        end_idx: End index of search region (exclusive).
+        min_energy_window: Window size in samples for energy calculation.
+
+    Returns:
+        Index of the quietest point within the search region. This is the
+        recommended split point to minimize audio artifacts.
+
+    Example:
+        >>> audio = np.random.randn(32000)
+        >>> # Insert quiet region
+        >>> audio[16000:17600] = 0.01
+        >>> split_idx = find_split_point(
+        ...     wav=audio,
+        ...     start_idx=0,
+        ...     end_idx=32000,
+        ...     min_energy_window=1600,
+        ... )
+        >>> 16000 <= split_idx <= 17600
+        True
+    """
+    # Slice along the time (last) axis so multi-channel audio works too.
+    segment = wav[..., start_idx:end_idx]
+
+    # Calculate RMS energy in small windows
+    min_energy = float("inf")
+    # Fall back to the start of the search region if the segment is too
+    # short to fit even one full window.
+    quietest_idx = start_idx
+
+    for i in range(0, segment.shape[-1] - min_energy_window, min_energy_window):
+        window = segment[..., i : i + min_energy_window]
+        energy = (window**2).mean() ** 0.5
+        if energy < min_energy:
+            quietest_idx = i + start_idx
+            min_energy = energy
+
+    return quietest_idx