Docs
Speaker Diarization

Speaker Diarization

Separate speakers in an audio file

Argmax Pro SDK includes the SpeakerKitPro framework, which supports two speaker diarization engines:

  • Pyannote (macOS 13+, iOS 16+): inference-optimized version of Pyannote. For more details on architecture and competitive benchmarks, please see our Interspeech 2025 paper.
  • Sortformer (macOS 15+, iOS 18+): end-to-end Nvidia Sortformer engine with streaming support. See Real-time Transcription with Speakers for the streaming example.

Basic Example

This is a complete and self-contained CLI example project that demonstrates the usage of Argmax Pro SDK for speaker diarization on an audio file.

Step 0: Verify Pro SDK setup

Argmax Pro SDK access must be set up with SwiftPM before going through this example. If unsure, please see Installation (Step 1 only).

Step 1: Create project directory

Create a project directory with the layout shown below, then copy the code listed in this step into Package.swift and ArgmaxTestCommand.swift:

ArgmaxSpeakerDiarizationBasicExample
├── Package.swift
└── Sources
    └── ArgmaxTestCLI
        └── ArgmaxTestCommand.swift

Package.swift:

// swift-tools-version: 6.0
// The swift-tools-version declares the minimum version of Swift required to build this package.
 
import PackageDescription
 
// Identifiers that are referenced from more than one place in the manifest.
let sdkPackageID = "argmaxinc.argmax-sdk-swift"
let cliTargetName = "ArgmaxTestCLI"

// Manifest for the example CLI: a single executable product backed by a single
// executable target that links the Argmax SDK and swift-argument-parser.
let package = Package(
    name: "Argmax Test CLI",
    platforms: [.macOS(.v13)],
    products: [
        .executable(name: "argmax-test-cli", targets: [cliTargetName]),
    ],
    dependencies: [
        // Declared by package identifier rather than by URL.
        .package(id: sdkPackageID, .upToNextMinor(from: "2.0.9")),
        // Pinned to one exact release.
        .package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.3.0"),
    ],
    targets: [
        .executableTarget(
            name: cliTargetName,
            dependencies: [
                .product(name: "Argmax", package: sdkPackageID),
                .product(name: "ArgumentParser", package: "swift-argument-parser"),
            ]
        ),
    ]
)
 

ArgmaxTestCommand.swift:

import Foundation
@preconcurrency import ArgumentParser
import Argmax
 
/// Root command of the example CLI.
///
/// It declares no `run()` of its own; it registers the `Pyannote` and
/// `Sortformer` subcommands, one per diarization engine.
@main
struct ArgmaxTestCommand: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "An example CLI tool for Argmax Pro SDK",
        subcommands: [Pyannote.self, Sortformer.self]
    )

    /// Diarizes an audio file with the Pyannote engine and emits the result as RTTM.
    struct Pyannote: AsyncParsableCommand {
        static let configuration = CommandConfiguration(
            abstract: "Speaker diarization using Pyannote (macOS 13+)"
        )

        @Option(name: .long, help: "Argmax Pro SDK API key")
        var apiKey: String

        @Option(help: "Path to the audio file to process")
        var audioPath: String

        @Option(help: "Path to save the diarization output in RTTM format")
        var rttmPath: String?

        @Option(help: "Number of speakers to detect if known (default: automatic)")
        var numSpeakers: Int?

        /// Configures the SDK, loads the audio, downloads and loads the Pyannote
        /// models, runs diarization, and writes or prints the RTTM output.
        ///
        /// - Throws: Any error raised while loading the audio, preparing the
        ///   models, constructing `SpeakerKitPro`, or diarizing.
        mutating func run() async throws {
            print("Initializing Argmax Pro SDK...")
            let sdkConfig = ArgmaxConfig(apiKey: apiKey)
            await ArgmaxSDK.with(sdkConfig)

            print("Loading audio file from \(audioPath)...")
            let audioFrames = try AudioProcessor.loadAudioAsFloatArray(fromPath: audioPath)

            print("Initializing SpeakerKitPro (Pyannote)...")
            let diarizer = SpeakerKitDiarizer.pyannote()
            try await diarizer.downloadModels()
            try await diarizer.loadModels()
            let config = PyannoteConfig(diarizer: diarizer)
            let speakerKit = try await SpeakerKitPro(config)

            print("Running diarization...")
            // Pass `nil` options when no speaker count was supplied on the command line.
            let options: (any DiarizationOptions)? = numSpeakers.map {
                PyannoteDiarizationOptions(numberOfSpeakers: $0)
            }

            // Unloading is awaited explicitly on both exit paths. A detached
            // `Task` started from `defer` is never awaited, and the process ends
            // as soon as `run()` returns, so that cleanup might never execute.
            do {
                let result = try await speakerKit.diarize(audioArray: audioFrames, options: options)

                outputRTTM(result: result, audioPath: audioPath, rttmPath: rttmPath)

                if let timings = result.timings {
                    print(timings.debugDescription)
                }
            } catch {
                await speakerKit.unloadModels()
                throw error
            }
            await speakerKit.unloadModels()
        }
    }

    /// Diarizes an audio file with the Sortformer engine and emits the result as RTTM.
    struct Sortformer: AsyncParsableCommand {
        static let configuration = CommandConfiguration(
            abstract: "Speaker diarization using Sortformer (macOS 15+)"
        )

        @Option(name: .long, help: "Argmax Pro SDK API key")
        var apiKey: String

        @Option(help: "Path to the audio file to process")
        var audioPath: String

        @Option(help: "Path to save the diarization output in RTTM format")
        var rttmPath: String?

        /// Checks the OS version, configures the SDK, loads the audio, runs
        /// diarization with Sortformer options, and writes or prints the RTTM output.
        ///
        /// - Throws: `ValidationError` when the OS is older than the versions
        ///   listed in the availability check; otherwise any error raised while
        ///   loading the audio, constructing `SpeakerKitPro`, or diarizing.
        mutating func run() async throws {
            // `guard #available` (rather than `if #unavailable`) also refines
            // availability for the remainder of this method at compile time.
            guard #available(macOS 15, iOS 18, watchOS 11, visionOS 2, *) else {
                throw ValidationError("Sortformer requires macOS 15 / iOS 18 or later.")
            }

            print("Initializing Argmax Pro SDK...")
            let sdkConfig = ArgmaxConfig(apiKey: apiKey)
            await ArgmaxSDK.with(sdkConfig)

            print("Loading audio file from \(audioPath)...")
            let audioFrames = try AudioProcessor.loadAudioAsFloatArray(fromPath: audioPath)

            print("Initializing SpeakerKitPro (Sortformer)...")
            // NOTE(review): no explicit configuration is passed here; presumably
            // the engine is selected by the Sortformer options below — confirm
            // against the SDK reference.
            let speakerKit = try await SpeakerKitPro()

            print("Running diarization...")
            let options = SortformerDiarizationOptions(sortformerMode: .prerecorded)

            // Unloading is awaited explicitly on both exit paths. A detached
            // `Task` started from `defer` is never awaited, and the process ends
            // as soon as `run()` returns, so that cleanup might never execute.
            do {
                let result = try await speakerKit.diarize(audioArray: audioFrames, options: options)

                outputRTTM(result: result, audioPath: audioPath, rttmPath: rttmPath)

                if let timings = result.timings {
                    print(timings.debugDescription)
                }
            } catch {
                await speakerKit.unloadModels()
                throw error
            }
            await speakerKit.unloadModels()
        }
    }
}
 
/// Renders a diarization result as RTTM text and either saves it or prints it.
///
/// - Parameters:
///   - result: The diarization result to serialize.
///   - audioPath: Path of the processed audio file; its last path component,
///     minus the extension, is passed to the RTTM generator as the file name.
///   - rttmPath: Destination file for the RTTM text. When `nil`, the text is
///     printed to standard output instead.
///
/// A failed write is reported with `print` and is not propagated to the caller.
private func outputRTTM(result: DiarizationResult, audioPath: String, rttmPath: String?) {
    let recordingName = URL(filePath: audioPath).deletingPathExtension().lastPathComponent
    let serialized = SpeakerKit.generateRTTM(from: result, fileName: recordingName)
        .map(\.description)
        .joined(separator: "\n")

    // No destination given: dump the RTTM text to stdout and finish.
    guard let rttmPath else {
        print(serialized)
        return
    }

    let destination = URL(filePath: rttmPath)
    do {
        try serialized.write(to: destination, atomically: true, encoding: .utf8)
        print("RTTM saved to \(rttmPath)")
    } catch {
        print("Failed to write RTTM: \(error)")
    }
}
 

Step 2: Build and run in Terminal

Run one of the following commands in your terminal from the top-level project directory:

# Replace <API_KEY> with your Argmax Pro SDK API key and <AUDIO_PATH> with the
# path to the audio file to process.
# Optional flags: --rttm-path <RTTM_PATH> saves the RTTM output to a file instead
# of printing it (both subcommands); --num-speakers <N> sets a known speaker
# count (pyannote only).
# Pyannote (macOS 13+)
swift run argmax-test-cli pyannote --api-key <API_KEY> --audio-path <AUDIO_PATH>
 
# Sortformer (macOS 15+)
swift run argmax-test-cli sortformer --api-key <API_KEY> --audio-path <AUDIO_PATH>

If you see the error "no registry configured for 'argmaxinc' scope", go back to Step 0.