Below is a complete Python example demonstrating audio preprocessing, transcription with age and gender detection, emotion detection, speaker diarization, and sentence-level timestamps (utterances).
import os
from pydub import AudioSegment
from smallestai.waves import WavesClient

client = WavesClient(api_key=os.getenv("SMALLEST_API_KEY"))

def preprocess_audio(input_path, output_path):
    """
    Preprocess audio file to optimal format for Lightning STT:
    - Convert to 16 kHz mono WAV
    - Normalize audio levels
    - Remove leading/trailing silence
    """
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio = audio.normalize()
    audio = audio.strip_silence(silence_len=100, silence_thresh=-40)
    audio.export(output_path, format="wav")
    print(f"Preprocessed audio saved to: {output_path}")
    return output_path

def transcribe_with_features(audio_path):
    """
    Transcribe audio with age/gender detection, emotion detection,
    speaker diarization, and sentence-level utterances.
    """
    response = client.transcribe(
        file_path=audio_path,
        model="lightning",
        language="en",
        word_timestamps=True,
        age_detection=True,
        gender_detection=True,
        emotion_detection=True,
        diarize=True
    )
    
    return response

def process_results(response):
    """
    Extract and display transcription results with all metadata.
    """
    print("=" * 60)
    print("TRANSCRIPTION RESULTS")
    print("=" * 60)
    
    print(f"\nTranscription: {response.get('transcription', 'N/A')}")
    
    if 'age' in response:
        print(f"\nAge: {response['age']}")
    if 'gender' in response:
        print(f"Gender: {response['gender']}")
    
    if 'emotions' in response:
        print("\nEmotion Scores:")
        emotions = response['emotions']
        for emotion, score in emotions.items():
            print(f"  {emotion.capitalize()}: {score:.2f}")
    
    if 'utterances' in response:
        print("\nUtterances (Sentence-level timestamps):")
        for i, utterance in enumerate(response['utterances'], 1):
            speaker = utterance.get('speaker', 'unknown')
            start = utterance.get('start', 0)
            end = utterance.get('end', 0)
            text = utterance.get('text', '')
            print(f"\n  [{i}] Speaker: {speaker}")
            print(f"      Time: {start:.2f}s - {end:.2f}s")
            print(f"      Text: {text}")
    
    if 'word_timestamps' in response:
        print(f"\nWord-level timestamps: {len(response['word_timestamps'])} words")
    
    if 'metadata' in response:
        metadata = response['metadata']
        print(f"\nMetadata:")
        print(f"  Duration: {metadata.get('duration', 'N/A')}s")
        print(f"  Filename: {metadata.get('filename', 'N/A')}")

if __name__ == "__main__":
    input_audio = "input_audio.mp3"
    preprocessed_audio = "preprocessed_audio.wav"
    
    try:
        print("Preprocessing audio...")
        preprocess_audio(input_audio, preprocessed_audio)
        
        print("\nTranscribing audio with age, emotion, and utterance detection...")
        result = transcribe_with_features(preprocessed_audio)
        
        process_results(result)
        
        if os.path.exists(preprocessed_audio):
            os.remove(preprocessed_audio)
            print("\nCleaned up temporary preprocessed file.")
            
    except FileNotFoundError:
        print(f"Error: Audio file '{input_audio}' not found.")
    except Exception as e:
        print(f"Error: {str(e)}")

Prerequisites

Install required dependencies:
pip install smallestai pydub
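
pydub decodes compressed formats such as MP3 via FFmpeg, so make sure it is installed and on your PATH, for example:

brew install ffmpeg          # macOS (Homebrew)
sudo apt-get install ffmpeg  # Debian/Ubuntu

The client reads its credentials from the SMALLEST_API_KEY environment variable used in the script:

export SMALLEST_API_KEY="your-api-key"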

Key Features Demonstrated

  1. Audio Preprocessing: Converts audio to 16 kHz mono WAV, normalizes levels, and removes silence
  2. Age & Gender Detection: Predicts speaker age and gender for demographic analysis
  3. Emotion Detection: Captures emotional tone with confidence scores
  4. Utterances: Retrieves sentence-level timestamps with speaker labels (see the SRT sketch after this list)
  5. Diarization: Separates speakers for multi-speaker audio
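
As a worked example of consuming utterances and diarization together, the sketch below turns the utterances list into an SRT subtitle file. It assumes the response shape handled in process_results above (speaker, start, end, and text keys on each utterance):

def utterances_to_srt(utterances, output_path="transcript.srt"):
    """Write diarized utterances as SRT cues, one per utterance."""
    def fmt(seconds):
        # SRT timestamps use the form HH:MM:SS,mmm
        ms = int(round(seconds * 1000))
        h, rem = divmod(ms, 3_600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1_000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    with open(output_path, "w", encoding="utf-8") as f:
        for i, utt in enumerate(utterances, 1):
            f.write(f"{i}\n")
            f.write(f"{fmt(utt.get('start', 0))} --> {fmt(utt.get('end', 0))}\n")
            f.write(f"[{utt.get('speaker', 'unknown')}] {utt.get('text', '')}\n\n")

For example, call utterances_to_srt(result.get('utterances', [])) after the transcription step in the main script.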

Expected Output

The script will output:
  • Full transcription text
  • Age and gender predictions
  • Emotion scores (happiness, sadness, disgust, fear, anger)
  • Sentence-level utterances with timestamps and speaker IDs
  • Audio metadata (duration, filename)
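
If you want to keep these results for downstream analysis rather than only printing them, the minimal sketch below persists the raw response as JSON. It assumes the response is a plain, JSON-serializable dict, as the dict-style access in process_results implies:

import json

def save_results(response, output_path="transcription_results.json"):
    """Persist the raw API response so the transcript and metadata can be reloaded later."""
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(response, f, indent=2, ensure_ascii=False)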