Below is a complete Python example demonstrating audio preprocessing, transcription with age and gender detection, emotion detection, speaker diarization, and sentence-level timestamps (utterances).
import os
from pydub import AudioSegment
from smallestai.waves import WavesClient

client = WavesClient(api_key=os.getenv("SMALLEST_API_KEY"))

def preprocess_audio(input_path, output_path):
    """
    Preprocess audio file to optimal format for Lightning STT:
    - Convert to 16 kHz mono WAV
    - Normalize audio levels
    - Remove leading/trailing silence
    """
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio = audio.normalize()
    audio = audio.strip_silence(silence_len=100, silence_thresh=-40)
    audio.export(output_path, format="wav")
    print(f"Preprocessed audio saved to: {output_path}")
    return output_path

def transcribe_with_features(audio_path):
    """
    Transcribe audio with age/gender detection, emotion detection,
    speaker diarization, and sentence-level utterances.
    """
    response = client.transcribe(
        file_path=audio_path,
        model="lightning",
        language="en",
        word_timestamps=True,
        age_detection=True,
        gender_detection=True,
        emotion_detection=True,
        diarize=True
    )
    
    return response

def process_results(response):
    """
    Extract and display transcription results with all metadata.
    """
    print("=" * 60)
    print("TRANSCRIPTION RESULTS")
    print("=" * 60)
    
    print(f"\nTranscription: {response.get('transcription', 'N/A')}")
    
    if 'age' in response:
        print(f"\nAge: {response['age']}")
    if 'gender' in response:
        print(f"Gender: {response['gender']}")
    
    if 'emotions' in response:
        print("\nEmotion Scores:")
        emotions = response['emotions']
        for emotion, score in emotions.items():
            print(f"  {emotion.capitalize()}: {score:.2f}")
    
    if 'utterances' in response:
        print("\nUtterances (Sentence-level timestamps):")
        for i, utterance in enumerate(response['utterances'], 1):
            speaker = utterance.get('speaker', 'unknown')
            start = utterance.get('start', 0)
            end = utterance.get('end', 0)
            text = utterance.get('text', '')
            print(f"\n  [{i}] Speaker: {speaker}")
            print(f"      Time: {start:.2f}s - {end:.2f}s")
            print(f"      Text: {text}")
    
    if 'word_timestamps' in response:
        print(f"\nWord-level timestamps: {len(response['word_timestamps'])} words")
    
    if 'metadata' in response:
        metadata = response['metadata']
        print(f"\nMetadata:")
        print(f"  Duration: {metadata.get('duration', 'N/A')}s")
        print(f"  Filename: {metadata.get('filename', 'N/A')}")

if __name__ == "__main__":
    input_audio = "input_audio.mp3"
    preprocessed_audio = "preprocessed_audio.wav"
    
    try:
        print("Preprocessing audio...")
        preprocess_audio(input_audio, preprocessed_audio)
        
        print("\nTranscribing audio with age, emotion, and utterance detection...")
        result = transcribe_with_features(preprocessed_audio)
        
        process_results(result)
        
        if os.path.exists(preprocessed_audio):
            os.remove(preprocessed_audio)
            print("\nCleaned up temporary preprocessed file.")
            
    except FileNotFoundError:
        print(f"Error: Audio file '{input_audio}' not found.")
    except Exception as e:
        print(f"Error: {str(e)}")

Prerequisites

Install required dependencies:
pip install smallestai pydub
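
pydub decodes compressed formats such as MP3 via FFmpeg, so make sure it is installed and on your PATH, for example:

brew install ffmpeg          # macOS (Homebrew)
sudo apt-get install ffmpeg  # Debian/Ubuntu

The client reads its credentials from the SMALLEST_API_KEY environment variable used in the script:

export SMALLEST_API_KEY="your-api-key"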

Key Features Demonstrated

  1. Audio Preprocessing: Converts audio to 16 kHz mono WAV, normalizes levels, and removes silence
  2. Age & Gender Detection: Predicts speaker age and gender for demographic analysis
  3. Emotion Detection: Captures emotional tone with confidence scores
  4. Utterances: Retrieves sentence-level timestamps with speaker labels (see the SRT sketch after this list)
  5. Diarization: Separates speakers for multi-speaker audio
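
As a worked example of consuming utterances and diarization together, the sketch below turns the utterances list into an SRT subtitle file. It assumes the response shape handled in process_results above (speaker, start, end, and text keys on each utterance):

def utterances_to_srt(utterances, output_path="transcript.srt"):
    """Write diarized utterances as SRT cues, one per utterance."""
    def fmt(seconds):
        # SRT timestamps use the form HH:MM:SS,mmm
        ms = int(round(seconds * 1000))
        h, rem = divmod(ms, 3_600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1_000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    with open(output_path, "w", encoding="utf-8") as f:
        for i, utt in enumerate(utterances, 1):
            f.write(f"{i}\n")
            f.write(f"{fmt(utt.get('start', 0))} --> {fmt(utt.get('end', 0))}\n")
            f.write(f"[{utt.get('speaker', 'unknown')}] {utt.get('text', '')}\n\n")

For example, call utterances_to_srt(result.get('utterances', [])) after the transcription step in the main script.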

Expected Output

The script will output:
  • Full transcription text
  • Age and gender predictions
  • Emotion scores (happiness, sadness, disgust, fear, anger)
  • Sentence-level utterances with timestamps and speaker IDs
  • Audio metadata (duration, filename)
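
If you want to keep these results for downstream analysis rather than only printing them, the minimal sketch below persists the raw response as JSON. It assumes the response is a plain, JSON-serializable dict, as the dict-style access in process_results implies:

import json

def save_results(response, output_path="transcription_results.json"):
    """Persist the raw API response so the transcript and metadata can be reloaded later."""
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(response, f, indent=2, ensure_ascii=False)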