openai_lid_client.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Language Identification (LID) demo using the FireRedLID model on vLLM.

FireRedLID is an audio encoder-decoder model that identifies the spoken
language of an audio clip. Unlike ASR models that output full transcriptions,
FireRedLID outputs at most 2 tokens representing the detected language
(e.g. "en", "zh mandarin").

Start the vLLM server:

    vllm serve PatchyTisa/FireRedLID-vllm

Then run this script:

    # Use the built-in sample audio
    python examples/online_serving/openai_lid_client.py

    # Use your own audio file(s)
    python examples/online_serving/openai_lid_client.py \
        --audio_paths audio_en.wav audio_zh.wav audio_fr.wav

    # Batch-identify multiple files in one run
    python examples/online_serving/openai_lid_client.py \
        --audio_paths /path/to/dir/*.wav

Requirements:
- vLLM with audio support
- openai Python SDK
- kaldi_native_fbank (pulled in by the model)
"""

import argparse
import json
import os

from openai import OpenAI

from vllm.assets.audio import AudioAsset

# ──────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────


def identify_language(
    audio_path: str,
    client: OpenAI,
    model: str,
) -> str:
    """
    Send a single audio file to the vLLM transcription endpoint and return
    the detected language tag.

    FireRedLID re-uses the OpenAI-compatible ``/v1/audio/transcriptions``
    endpoint. The "transcription" it returns is actually the language label
    (e.g. ``"en"`` or ``"zh mandarin"``).
    """
    with open(audio_path, "rb") as f:
        result = client.audio.transcriptions.create(
            file=f,
            model=model,
            response_format="json",
            temperature=0.0,
        )
    return result.text.strip()


def identify_language_raw(
    audio_path: str,
    model: str,
    api_base: str,
) -> str:
    """
    Same as :func:`identify_language` but uses raw HTTP so that the demo
    works without the ``openai`` SDK (useful for quick debugging).
    """
    import requests

    url = f"{api_base}/audio/transcriptions"
    with open(audio_path, "rb") as f:
        files = {"file": (os.path.basename(audio_path), f)}
        data = {
            "model": model,
            "response_format": "json",
        }
        resp = requests.post(url, files=files, data=data)
        resp.raise_for_status()
    return resp.json()["text"].strip()


def identify_language_streaming(
    audio_path: str,
    model: str,
    api_base: str,
) -> str:
    """
    Streaming variant – demonstrates the streaming transcription endpoint.
    For a 1-2 token output the stream finishes almost instantly, but this
    shows that the API path works end-to-end.
    """
    import requests

    url = f"{api_base}/audio/transcriptions"
    with open(audio_path, "rb") as f:
        files = {"file": (os.path.basename(audio_path), f)}
        data = {
            "stream": "true",
            "model": model,
            "response_format": "json",
        }
        response = requests.post(url, files=files, data=data, stream=True)
        response.raise_for_status()

        tokens: list[str] = []
        for chunk in response.iter_lines(
            chunk_size=8192, decode_unicode=False, delimiter=b"\n"
        ):
            if not chunk:
                continue
            payload = json.loads(chunk[len("data: ") :].decode("utf-8"))
            choice = payload["choices"][0]
            delta = choice.get("delta", {}).get("content", "")
            if delta:
                tokens.append(delta)
            if choice.get("finish_reason") is not None:
                break

    return "".join(tokens).strip()


# ──────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────


def main(args: argparse.Namespace) -> None:
    api_base = args.api_base.rstrip("/")
    client = OpenAI(api_key="EMPTY", base_url=api_base)
    model = client.models.list().data[0].id
    print(f"Model : {model}")
    print(f"Server: {api_base}\n")

    # Resolve audio paths ------------------------------------------------
    if args.audio_paths:
        audio_paths = args.audio_paths
    else:
        # Fall back to the built-in vLLM sample audios (both are English).
        audio_paths = [
            str(AudioAsset("mary_had_lamb").get_local_path()),
            str(AudioAsset("winning_call").get_local_path()),
        ]

    # Run LID for each file ----------------------------------------------
    print(f"{'Audio File':<50} {'Language (sync)':<20} {'Language (stream)'}")
    print("-" * 90)

    for path in audio_paths:
        basename = os.path.basename(path)

        # 1) Synchronous via OpenAI SDK
        lang_sync = identify_language(path, client, model)

        # 2) Streaming via raw HTTP
        lang_stream = identify_language_streaming(path, model, api_base)

        print(f"{basename:<50} {lang_sync:<20} {lang_stream}")

    print()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="FireRedLID – Language Identification demo via vLLM",
    )
    parser.add_argument(
        "--audio_paths",
        nargs="+",
        default=None,
        help=(
            "One or more audio files to identify. "
            "If omitted, uses vLLM's built-in sample audios."
        ),
    )
    parser.add_argument(
        "--api_base",
        type=str,
        default="http://localhost:8000/v1",
        help="vLLM API base URL (default: http://localhost:8000/v1)",
    )
    args = parser.parse_args()
    main(args)