[MM] Allow audio chunking for offline LLM (#34628)
Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
@@ -295,6 +295,51 @@ You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the mult
|
||||
|
||||
Full example: [examples/offline_inference/audio_language.py](../../examples/offline_inference/audio_language.py)
|
||||
|
||||
#### Chunking Long Audio for Transcription
|
||||
|
||||
Speech-to-text models such as Whisper can only process a limited amount of audio at once (typically 30 seconds). For longer recordings, vLLM provides `split_audio`, a utility that cuts the audio into chunks no longer than a given duration, placing each cut at the quietest point near the chunk boundary so that words are less likely to be split.
|
||||
|
||||
```python
|
||||
import librosa
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.multimodal.audio import split_audio
|
||||
|
||||
# Load long audio file
|
||||
audio, sr = librosa.load("long_audio.wav", sr=16000)
|
||||
|
||||
# Split into chunks at low-energy (quiet) regions
|
||||
chunks = split_audio(
|
||||
audio_data=audio,
|
||||
sample_rate=sr,
|
||||
max_clip_duration_s=30.0, # Maximum chunk length in seconds
|
||||
overlap_duration_s=1.0, # Search window for finding quiet split points
|
||||
min_energy_window_size=1600, # Window size for energy calculation (~100ms at 16kHz)
|
||||
)
|
||||
|
||||
# Initialize Whisper model
|
||||
llm = LLM(model="openai/whisper-large-v3-turbo")
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=256)
|
||||
|
||||
# Transcribe each chunk
|
||||
transcriptions = []
|
||||
for chunk in chunks:
|
||||
outputs = llm.generate({
|
||||
"prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
||||
"multi_modal_data": {"audio": (chunk, sr)},
|
||||
}, sampling_params)
|
||||
transcriptions.append(outputs[0].outputs[0].text)
|
||||
|
||||
# Combine results
|
||||
full_transcription = " ".join(transcriptions)
|
||||
```
|
||||
|
||||
The `split_audio` function:
|
||||
|
||||
- Splits audio at quiet points to avoid cutting through speech
|
||||
- Uses RMS energy to find low-amplitude regions within the overlap window
|
||||
- Preserves all audio samples (no data loss)
|
||||
- Supports any sample rate
|
||||
|
||||
#### Automatic Audio Channel Normalization
|
||||
|
||||
vLLM automatically normalizes audio channels for models that require specific audio formats. When loading audio with libraries like `torchaudio`, stereo files return shape `[channels, time]`, but many audio models (particularly Whisper-based models) expect mono audio with shape `[time]`.
|
||||
|
||||
@@ -16,6 +16,7 @@ from vllm.multimodal.audio import (
|
||||
normalize_audio,
|
||||
resample_audio_librosa,
|
||||
resample_audio_scipy,
|
||||
split_audio,
|
||||
)
|
||||
|
||||
|
||||
@@ -584,3 +585,186 @@ class TestAudioPipelineE2E:
|
||||
assert audio_output.ndim == 1
|
||||
assert audio_output.shape == (10,)
|
||||
np.testing.assert_array_almost_equal(audio_output, np.zeros(10))
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Tests for Audio Chunking Utilities
|
||||
# ============================================================
|
||||
|
||||
|
||||
class TestAudioChunking:
    """Tests for split_audio and find_split_point utilities in vllm.multimodal.audio."""

    @staticmethod
    def _assert_lossless(chunks, audio):
        """Check that the chunks, joined on the time axis, rebuild the input."""
        np.testing.assert_array_equal(np.concatenate(chunks, axis=-1), audio)

    def test_split_audio_short_clip(self):
        """Audio shorter than max_clip_duration_s should not be split."""

        # 10 seconds of audio at 16kHz
        audio = np.linspace(-1.0, 1.0, 160000, dtype=np.float32)

        chunks = split_audio(
            audio_data=audio,
            sample_rate=16000,
            max_clip_duration_s=30.0,
            overlap_duration_s=1.0,
            min_energy_window_size=1600,
        )

        assert len(chunks) == 1
        np.testing.assert_array_equal(chunks[0], audio)

    def test_split_audio_exact_length(self):
        """Audio exactly at max_clip_duration_s should not be split."""

        # Exactly 30 seconds at 16kHz
        audio = np.linspace(-1.0, 1.0, 480000, dtype=np.float32)

        chunks = split_audio(
            audio_data=audio,
            sample_rate=16000,
            max_clip_duration_s=30.0,
            overlap_duration_s=1.0,
            min_energy_window_size=1600,
        )

        assert len(chunks) == 1
        np.testing.assert_array_equal(chunks[0], audio)

    def test_split_audio_long_clip(self):
        """Long audio should be split into multiple chunks without losing samples."""

        # 65 seconds of audio at 16kHz
        audio = np.linspace(-1.0, 1.0, 1040000, dtype=np.float32)

        chunks = split_audio(
            audio_data=audio,
            sample_rate=16000,
            max_clip_duration_s=30.0,
            overlap_duration_s=1.0,
            min_energy_window_size=1600,
        )

        assert len(chunks) > 1
        # First sample preserved
        assert chunks[0][0] == audio[0]
        # Last sample preserved
        assert chunks[-1][-1] == audio[-1]
        # Nothing dropped or duplicated in between
        self._assert_lossless(chunks, audio)

    def test_split_audio_chunks_have_correct_length(self):
        """Each chunk (except last) should be approximately max_clip_duration_s."""

        # 65 seconds of audio at 16kHz
        audio = np.linspace(-1.0, 1.0, 1040000, dtype=np.float32)

        chunks = split_audio(
            audio_data=audio,
            sample_rate=16000,
            max_clip_duration_s=30.0,
            overlap_duration_s=1.0,
            min_energy_window_size=1600,
        )

        max_samples = int(30.0 * 16000)
        overlap_samples = int(1.0 * 16000)

        for chunk in chunks[:-1]:
            assert chunk.shape[0] >= max_samples - overlap_samples
            assert chunk.shape[0] <= max_samples

        # The trailing chunk holds the remainder and must also fit the limit.
        assert 0 < chunks[-1].shape[0] <= max_samples

    def test_find_split_point_finds_quiet_region(self):
        """find_split_point should identify low-energy regions."""
        from vllm.multimodal.audio import find_split_point

        # Create audio with a quiet section in the middle
        segment = np.ones(32000, dtype=np.float32)
        # Insert quiet region at sample 16000-17600 (100ms)
        segment[16000:17600] = 0.01

        split_idx = find_split_point(
            wav=segment,
            start_idx=0,
            end_idx=32000,
            min_energy_window=1600,
        )

        # Split should be in or near the quiet region
        assert 16000 <= split_idx <= 17600

    def test_find_split_point_handles_uniform_audio(self):
        """find_split_point should handle uniform energy audio gracefully."""
        from vllm.multimodal.audio import find_split_point

        segment = np.ones(32000, dtype=np.float32) * 0.5

        split_idx = find_split_point(
            wav=segment,
            start_idx=0,
            end_idx=32000,
            min_energy_window=1600,
        )

        assert 0 <= split_idx <= 32000

    def test_find_split_point_silence(self):
        """find_split_point should prefer the quietest scanned window."""
        from vllm.multimodal.audio import find_split_point

        # Deterministic signal: constant energy everywhere except silence.
        segment = np.ones(32000, dtype=np.float32)
        # Complete silence at 20000-21600.
        segment[20000:21600] = 0.0

        split_idx = find_split_point(
            wav=segment,
            start_idx=16000,
            end_idx=28000,
            min_energy_window=1600,
        )

        # Current implementation evaluates non-overlapping 1600-sample windows
        # from start_idx, so the quietest scanned window starts at 19200.
        assert split_idx == 19200

    def test_split_audio_preserves_boundaries(self):
        """Verify every sample, including first and last, survives chunking."""

        # Strictly increasing values make any dropped, duplicated or
        # reordered sample visible in the comparison below.
        audio = np.arange(1120000, dtype=np.float32)  # 70s at 16kHz

        chunks = split_audio(
            audio_data=audio,
            sample_rate=16000,
            max_clip_duration_s=30.0,
            overlap_duration_s=1.0,
            min_energy_window_size=1600,
        )

        assert chunks[0][0] == audio[0]
        assert chunks[-1][-1] == audio[-1]
        self._assert_lossless(chunks, audio)

    def test_split_audio_with_different_sample_rates(self):
        """Test chunking works with different sample rates."""

        # 40 seconds at 8kHz
        audio_8k = np.linspace(-1.0, 1.0, 320000, dtype=np.float32)

        chunks = split_audio(
            audio_data=audio_8k,
            sample_rate=8000,
            max_clip_duration_s=30.0,
            overlap_duration_s=1.0,
            min_energy_window_size=800,
        )

        assert len(chunks) >= 2
        self._assert_lossless(chunks, audio_8k)

        # 40 seconds at 48kHz
        audio_48k = np.linspace(-1.0, 1.0, 1920000, dtype=np.float32)

        chunks_48k = split_audio(
            audio_data=audio_48k,
            sample_rate=48000,
            max_clip_duration_s=30.0,
            overlap_duration_s=1.0,
            min_energy_window_size=4800,
        )

        assert len(chunks_48k) >= 2
        self._assert_lossless(chunks_48k, audio_48k)
|
||||
|
||||
@@ -33,4 +33,7 @@ class SpeechToTextConfig:
|
||||
|
||||
@property
def allow_audio_chunking(self) -> bool:
    """Whether long audio may be split into chunks before transcription.

    Chunking needs both the window size used to search for a low-energy
    split point and a maximum clip duration, so it is only allowed when
    both settings are configured.
    """
    return (
        self.min_energy_split_window_size is not None
        and self.max_audio_clip_s is not None
    )
|
||||
|
||||
@@ -45,6 +45,7 @@ from vllm.model_executor.models import (
|
||||
SupportsTranscription,
|
||||
supports_transcription,
|
||||
)
|
||||
from vllm.multimodal.audio import split_audio
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt
|
||||
from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt
|
||||
@@ -323,11 +324,24 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
self.asr_config.allow_audio_chunking
|
||||
and duration > self.asr_config.max_audio_clip_s
|
||||
)
|
||||
chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
|
||||
|
||||
if not do_split_audio:
|
||||
chunks = [y]
|
||||
else:
|
||||
assert self.asr_config.max_audio_clip_s is not None
|
||||
assert self.asr_config.min_energy_split_window_size is not None
|
||||
chunks = split_audio(
|
||||
audio_data=y,
|
||||
sample_rate=int(sr),
|
||||
max_clip_duration_s=self.asr_config.max_audio_clip_s,
|
||||
overlap_duration_s=self.asr_config.overlap_chunk_second,
|
||||
min_energy_window_size=self.asr_config.min_energy_split_window_size,
|
||||
)
|
||||
|
||||
if language is None and getattr(
|
||||
self.model_cls, "supports_explicit_language_detection", False
|
||||
):
|
||||
# Auto-detect language from the first chunk.
|
||||
language = await self._detect_language(
|
||||
chunks[0], f"{request_id}-lang_detect"
|
||||
)
|
||||
@@ -754,55 +768,3 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
yield f"data: {data}\n\n"
|
||||
# Send the final done message after all response.n are finished
|
||||
yield "data: [DONE]\n\n"
|
||||
|
||||
def _split_audio(
    self, audio_data: np.ndarray, sample_rate: int
) -> list[np.ndarray]:
    """Split audio along its last axis into consecutive chunks.

    Each chunk except the last ends at the index returned by
    ``self._find_split_point`` for the trailing ``overlap_chunk_second``
    seconds of a ``max_audio_clip_s``-long span; the last chunk takes
    whatever remains.

    Args:
        audio_data: Audio samples; the last axis is sliced as time.
        sample_rate: Sampling rate in Hz, used to convert the configured
            durations into sample counts.

    Returns:
        List of slices of ``audio_data`` along the last axis.
    """
    assert self.asr_config.max_audio_clip_s is not None, (
        f"{self.asr_config.max_audio_clip_s=} cannot be None to"
        " split audio into chunks."
    )
    # NOTE(review): these products are used as slice bounds without an int()
    # cast, so both config values are presumably integers - confirm.
    chunk_size = sample_rate * self.asr_config.max_audio_clip_s
    overlap_size = sample_rate * self.asr_config.overlap_chunk_second
    chunks = []
    i = 0
    while i < audio_data.shape[-1]:
        if i + chunk_size >= audio_data.shape[-1]:
            # Handle the last chunk: everything that remains fits in one clip.
            chunks.append(audio_data[..., i:])
            break

        # Find the best split point in the overlap region
        search_start = i + chunk_size - overlap_size
        search_end = min(i + chunk_size, audio_data.shape[-1])
        split_point = self._find_split_point(audio_data, search_start, search_end)

        # Extract chunk up to the split point
        chunks.append(audio_data[..., i:split_point])
        # NOTE(review): the loop only advances if split_point > i; nothing
        # here enforces that.
        i = split_point
    return chunks
|
||||
|
||||
def _find_split_point(self, wav: np.ndarray, start_idx: int, end_idx: int) -> int:
    """Find the best point to split audio by
    looking for silence or low amplitude.

    Scans ``wav[start_idx:end_idx]`` in consecutive, non-overlapping windows
    of ``asr_config.min_energy_split_window_size`` samples and keeps the
    start of the window with the lowest RMS energy (the first one on ties).

    Args:
        wav: Audio array.
            NOTE(review): the slice below indexes the first axis, so this
            only behaves as described for 1D input - confirm the expected
            shape against callers.
        start_idx: Start index of search region
        end_idx: End index of search region
    Returns:
        Index of best splitting point, or 0 if no window was scanned.
    """
    segment = wav[start_idx:end_idx]

    # Calculate RMS energy in small windows
    min_energy = math.inf
    # Returned unchanged when the loop below does not execute.
    quietest_idx = 0
    min_energy_window = self.asr_config.min_energy_split_window_size
    assert min_energy_window is not None
    for i in range(0, len(segment) - min_energy_window, min_energy_window):
        window = segment[i : i + min_energy_window]
        energy = (window**2).mean() ** 0.5
        if energy < min_energy:
            # Convert the segment-relative offset back to an index into wav.
            quietest_idx = i + start_idx
            min_energy = energy
    return quietest_idx
|
||||
|
||||
@@ -216,3 +216,121 @@ class AudioResampler:
|
||||
f"Invalid resampling method: {self.method}. "
|
||||
"Supported methods are 'librosa' and 'scipy'."
|
||||
)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Audio Chunking / Splitting
|
||||
# ============================================================
|
||||
|
||||
|
||||
def split_audio(
    audio_data: np.ndarray,
    sample_rate: int,
    max_clip_duration_s: float,
    overlap_duration_s: float,
    min_energy_window_size: int,
) -> list[np.ndarray]:
    """Split audio into chunks with intelligent split points.

    Splits long audio into consecutive, non-overlapping chunks, cutting at
    low-energy regions to minimize cutting through speech. For every cut, the
    last ``overlap_duration_s`` seconds of the maximum-length span are
    searched for the quietest window, and the chunk ends where that window
    starts. Concatenating the chunks along the last axis reproduces the input.

    Args:
        audio_data: Audio array to split. Can be 1D (mono) or multi-dimensional.
            Splits along the last dimension (time axis).
        sample_rate: Sample rate of the audio in Hz.
        max_clip_duration_s: Maximum duration of each chunk in seconds.
        overlap_duration_s: Length in seconds of the search region at the end
            of each maximum-length span in which the split point is chosen.
        min_energy_window_size: Window size in samples for finding low-energy regions.

    Returns:
        List of audio chunks. Each chunk is a numpy array with the same shape
        as the input except for the last (time) dimension. Empty input yields
        an empty list.

    Raises:
        ValueError: If ``sample_rate * max_clip_duration_s`` is less than one
            sample, or ``min_energy_window_size`` is not positive.

    Example:
        >>> audio = np.random.randn(1040000)  # 65 seconds at 16kHz
        >>> chunks = split_audio(
        ...     audio_data=audio,
        ...     sample_rate=16000,
        ...     max_clip_duration_s=30.0,
        ...     overlap_duration_s=1.0,
        ...     min_energy_window_size=1600,
        ... )
        >>> len(chunks)
        3
    """
    chunk_size = int(sample_rate * max_clip_duration_s)
    if chunk_size <= 0:
        # A zero-length chunk could never make progress through the audio.
        raise ValueError(
            "max_clip_duration_s must cover at least one sample, got "
            f"{max_clip_duration_s=} at {sample_rate=}."
        )
    overlap_size = int(sample_rate * overlap_duration_s)
    total_samples = audio_data.shape[-1]
    chunks = []
    i = 0

    while i < total_samples:
        if i + chunk_size >= total_samples:
            # Handle last chunk - take everything remaining
            chunks.append(audio_data[..., i:])
            break

        # Find the best split point in the overlap region. The region never
        # starts before the current chunk, even if the overlap is oversized.
        search_end = i + chunk_size
        search_start = max(i, search_end - overlap_size)
        split_point = find_split_point(
            audio_data, search_start, search_end, min_energy_window_size
        )

        # Guarantee forward progress and the maximum chunk length: a split
        # point outside (i, search_end] would produce an empty or oversized
        # chunk, or loop forever, so fall back to a hard cut at full length.
        if not i < split_point <= search_end:
            split_point = search_end

        # Extract chunk up to the split point
        chunks.append(audio_data[..., i:split_point])
        i = split_point

    return chunks


def find_split_point(
    wav: np.ndarray,
    start_idx: int,
    end_idx: int,
    min_energy_window: int,
) -> int:
    """Find the best point to split audio by looking for silence or low amplitude.

    Scans the search region in consecutive, non-overlapping windows of
    ``min_energy_window`` samples, starting at ``start_idx``, and returns the
    start of the window with the lowest RMS energy (the first one on ties).
    For multi-dimensional input the energy is averaged over all leading
    dimensions.

    Args:
        wav: Audio array. Can be 1D or multi-dimensional; the last dimension
            is the time axis.
        start_idx: Start index of search region (inclusive).
        end_idx: End index of search region (exclusive).
        min_energy_window: Window size in samples for energy calculation.

    Returns:
        Index of the quietest point within the search region. This is the
        recommended split point to minimize audio artifacts. If the region is
        too short to hold a scanned window, ``start_idx`` is returned.

    Raises:
        ValueError: If ``min_energy_window`` is not positive.

    Example:
        >>> audio = np.random.randn(32000)
        >>> # Insert quiet region
        >>> audio[16000:17600] = 0.01
        >>> split_idx = find_split_point(
        ...     wav=audio,
        ...     start_idx=0,
        ...     end_idx=32000,
        ...     min_energy_window=1600,
        ... )
        >>> 16000 <= split_idx <= 17600
        True
    """
    if min_energy_window <= 0:
        raise ValueError(f"min_energy_window must be positive, got {min_energy_window}.")

    # Slice the time (last) axis so multi-channel audio is handled the same
    # way split_audio slices it.
    segment = wav[..., start_idx:end_idx]
    segment_len = segment.shape[-1]

    # Calculate RMS energy in small windows
    min_energy = math.inf
    # Default to the start of the region so the result always lies inside it,
    # even when no window is scanned.
    quietest_idx = start_idx

    for offset in range(0, segment_len - min_energy_window, min_energy_window):
        window = segment[..., offset : offset + min_energy_window]
        energy = (window**2).mean() ** 0.5
        # Strict comparison keeps the earliest window when energies tie.
        if energy < min_energy:
            quietest_idx = offset + start_idx
            min_energy = energy

    return quietest_idx
|
||||
|
||||
Reference in New Issue
Block a user