Skip to main content

🎙️ WER Evaluation

This guide walks you through running automatic speech recognition (ASR) on a dataset of audio files using the Smallest.ai Lightning ASR API, normalizing the text output, and computing Word Error Rate (WER) against reference transcripts. The guide includes examples for both WebSocket streaming and HTTP POST methods (raw audio bytes and URL-based).

📂 Input CSV Format

Your dataset should be a CSV file with the following columns:

For Local Audio Files (Raw Audio method):

| Column     | Description                            |
|------------|----------------------------------------|
| audio_path | Path to the audio file                 |
| text       | Ground-truth reference text transcript |
Example:
audio_path,text
data/en_001.wav,Hello how are you doing today
data/en_002.wav,This is a test of speech recognition

For Remote Audio Files (URL method):

| Column    | Description                                         |
|-----------|-----------------------------------------------------|
| audio_url | URL to the audio file (must be publicly accessible) |
| text      | Ground-truth reference text transcript              |
Example:
audio_url,text
https://example.com/audio/en_001.wav,Hello how are you doing today
https://example.com/audio/en_002.wav,This is a test of speech recognition

Installation

Make sure you install dependencies:
pip install websockets jiwer whisper-normalizer

Usage

Save the script as asr_eval.py and update your API key.
import asyncio
import websockets
import json
import csv
from whisper_normalizer.english import EnglishTextNormalizer
from whisper_normalizer.basic import BasicTextNormalizer
from jiwer import wer

# Text normalizers applied to both reference and hypothesis before WER is
# computed (see calculate_wer): English-specific rules for "en", a basic
# language-agnostic normalizer for everything else.
english_normalizer = EnglishTextNormalizer()
other_language_normalizer = BasicTextNormalizer()

async def transcribe_audio(api_key, audio_file):
    """Stream a local audio file to the Smallest.ai Lightning ASR WebSocket
    API and return the full transcript.

    Args:
        api_key: Smallest.ai API key.
        audio_file: Path to the audio file; the query parameters below assume
            16 kHz, 16-bit (linear16) mono PCM.

    Returns:
        All partial transcripts received from the server, space-joined.
    """
    with open(audio_file, 'rb') as f:
        audio_data = f.read()

    params = {
        "audioLanguage": "en",         # Change to your language
        "audioEncoding": "linear16",   # 16-bit PCM
        "audioSampleRate": "16000",    # sample rate of the audio file
        "audioChannels": "1",
        "addPunctuation": "true",
        "api_key": api_key
    }
    query_string = "&".join(f"{k}={v}" for k, v in params.items())
    url = f"wss://waves-api.smallest.ai/api/v1/asr?{query_string}"

    transcription = []

    async with websockets.connect(url) as ws:
        async def listen():
            # Collect transcript fragments as the server emits them.
            async for message in ws:
                response = json.loads(message)
                if "text" in response:
                    transcription.append(response["text"])

        listen_task = asyncio.create_task(listen())

        # Send audio in near-real-time chunks: 16 kHz x 2 bytes x 0.3 s.
        chunk_size = int(16000 * 2 * 0.3)
        while audio_data:
            chunk, audio_data = audio_data[:chunk_size], audio_data[chunk_size:]
            await ws.send(chunk)
            await asyncio.sleep(0.3)

        await ws.send(b'')      # Empty frame signals end of stream
        await asyncio.sleep(2)  # Grace period for trailing transcripts
        listen_task.cancel()
        # Await the cancelled task so the CancelledError is consumed here
        # instead of surfacing as a "task exception was never retrieved".
        try:
            await listen_task
        except asyncio.CancelledError:
            pass

    return " ".join(transcription)

def calculate_wer(reference, hypothesis, language="en"):
    """Normalize both texts and return their Word Error Rate.

    Args:
        reference: Ground-truth transcript.
        hypothesis: ASR model output.
        language: Language code; "en" selects the English-specific
            normalizer, any other value the basic language-agnostic one.

    Returns:
        WER as a float (0.0 means a perfect match).
    """
    if language == "en":
        ref_normalized = english_normalizer(reference)
        hyp_normalized = english_normalizer(hypothesis)
    else:
        ref_normalized = other_language_normalizer(reference)
        hyp_normalized = other_language_normalizer(hypothesis)
    return wer(ref_normalized, hyp_normalized)

async def main():
    """Transcribe every row of the input CSV, compute per-row WER against the
    reference text, write an augmented CSV, and print the average WER."""
    api_key = "your_api_key_here"
    input_csv = "fleurs_dataset.csv"  # input CSV
    output_csv = "transcription_results_streaming.csv"

    results = []

    with open(input_csv, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            audio_file = row['audio_path']
            reference_text = row.get('text', '')

            transcript = await transcribe_audio(api_key, audio_file)
            row['transcript'] = transcript

            # WER is only meaningful when a reference transcript exists;
            # leave the cell blank otherwise.
            if reference_text:
                row['wer'] = calculate_wer(reference_text, transcript)
            else:
                row['wer'] = ''
            results.append(row)

    if results:
        with open(output_csv, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=list(results[0].keys()))
            writer.writeheader()
            writer.writerows(results)

    # Average only over rows that actually have a WER value.
    wer_scores = [row['wer'] for row in results if row['wer'] != '']
    if wer_scores:
        avg_wer = sum(wer_scores) / len(wer_scores)
        print(f"Average WER: {avg_wer:.3f}")

if __name__ == "__main__":
    asyncio.run(main())


Output

A new CSV (transcription_results_streaming.csv) will be generated with the following columns:
| Column     | Description                        |
|------------|------------------------------------|
| audio_path | Path to the audio file             |
| text       | Reference ground-truth transcript  |
| transcript | ASR model output                   |
| wer        | Word Error Rate (if text provided) |
Example:
audio_path,text,transcript,wer
data/en_001.wav,Hello how are you doing today,hello how are you doing today,0.000
data/en_002.wav,This is a test of speech recognition,this is a test speech recognition,0.167

📊 Metrics

  • WER = Word Error Rate = (Substitutions + Deletions + Insertions) ÷ Reference words
  • Normalization is applied using whisper-normalizer before computing WER.
  • For English, EnglishTextNormalizer is used, otherwise a more general one BasicTextNormalizer.