Docs
Speaker Diarization
Speaker Diarization
Separate speakers in an audio file
Argmax Pro SDK includes the SpeakerKitPro framework which supports two speaker diarization engines:
- Pyannote (macOS 13+, iOS 16+): inference-optimized version of Pyannote. For more details on architecture and competitive benchmarks, please see our Interspeech 2025 paper.
- Sortformer (macOS 15+, iOS 18+): end-to-end Nvidia Sortformer engine with streaming support. See Real-time Transcription with Speakers for the streaming example.
Basic Example
This is a complete and self-contained CLI example project that demonstrates the usage of Argmax Pro SDK for speaker diarization on an audio file.
Step 0: Verify Pro SDK setup
Argmax Pro SDK access must be set up with SwiftPM before going through this example. If unsure, please see Installation (Step 1 only).
Step 1: Create project directory
Create the project directory shown below, then copy the code that follows into Package.swift and ArgmaxTestCommand.swift:
ArgmaxSpeakerDiarizationBasicExample
├── Package.swift
└── Sources
    └── ArgmaxTestCLI
        └── ArgmaxTestCommand.swift

Package.swift:
// swift-tools-version: 6.0
// The swift-tools-version declares the minimum version of Swift required to build this package.
import PackageDescription
// Identifiers that appear more than once in the manifest, named once so the
// product, target, and dependency declarations cannot drift apart.
let cliTargetName = "ArgmaxTestCLI"
let argmaxPackageID = "argmaxinc.argmax-sdk-swift"

// Manifest for the example CLI: a single executable product backed by a single
// executable target that links the Argmax SDK and swift-argument-parser.
let package = Package(
    name: "Argmax Test CLI",
    platforms: [.macOS(.v13)],
    products: [
        .executable(name: "argmax-test-cli", targets: [cliTargetName])
    ],
    dependencies: [
        // Resolved by package identity, i.e. through a configured package registry.
        .package(id: argmaxPackageID, .upToNextMinor(from: "2.0.9")),
        .package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.3.0"),
    ],
    targets: [
        .executableTarget(
            name: cliTargetName,
            dependencies: [
                .product(name: "Argmax", package: argmaxPackageID),
                .product(name: "ArgumentParser", package: "swift-argument-parser"),
            ]
        )
    ]
)
ArgmaxTestCommand.swift:
import Foundation
@preconcurrency import ArgumentParser
import Argmax
/// Root command of the example CLI.
///
/// It has no `run()` of its own; it only declares the `Pyannote` and
/// `Sortformer` subcommands, one per diarization engine.
@main
struct ArgmaxTestCommand: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "An example CLI tool for Argmax Pro SDK",
        subcommands: [Pyannote.self, Sortformer.self]
    )

    /// Diarizes an audio file with the Pyannote engine.
    struct Pyannote: AsyncParsableCommand {
        static let configuration = CommandConfiguration(
            abstract: "Speaker diarization using Pyannote (macOS 13+)"
        )

        // NOTE(review): a key passed as an argument ends up in shell history and
        // process listings; consider also accepting it from the environment.
        @Option(name: .long, help: "Argmax Pro SDK API key")
        var apiKey: String

        @Option(help: "Path to the audio file to process")
        var audioPath: String

        @Option(help: "Path to save the diarization output in RTTM format")
        var rttmPath: String?

        @Option(help: "Number of speakers to detect if known (default: automatic)")
        var numSpeakers: Int?

        /// Rejects a non-positive speaker count before any model work starts.
        ///
        /// - Throws: `ValidationError` when `numSpeakers` is given and is below 1.
        func validate() throws {
            if let numSpeakers, numSpeakers < 1 {
                throw ValidationError("--num-speakers must be at least 1.")
            }
        }

        /// Configures the SDK, loads the audio, prepares the Pyannote models,
        /// runs diarization, and emits the result through `outputRTTM`.
        ///
        /// - Throws: Any error from audio loading, model download/load,
        ///   `SpeakerKitPro` initialization, or diarization.
        mutating func run() async throws {
            print("Initializing Argmax Pro SDK...")
            let sdkConfig = ArgmaxConfig(apiKey: apiKey)
            await ArgmaxSDK.with(sdkConfig)

            print("Loading audio file from \(audioPath)...")
            let audioFrames = try AudioProcessor.loadAudioAsFloatArray(fromPath: audioPath)

            print("Initializing SpeakerKitPro (Pyannote)...")
            let diarizer = SpeakerKitDiarizer.pyannote()
            try await diarizer.downloadModels()
            try await diarizer.loadModels()
            let config = PyannoteConfig(diarizer: diarizer)
            let speakerKit = try await SpeakerKitPro(config)

            print("Running diarization...")
            // `nil` options are passed through when no speaker count was given.
            let options: (any DiarizationOptions)? = numSpeakers.map {
                PyannoteDiarizationOptions(numberOfSpeakers: $0)
            }

            // The unload is awaited on both paths instead of being spawned from a
            // `defer` in an unstructured Task, which the process could exit
            // without ever running.
            do {
                let result = try await speakerKit.diarize(audioArray: audioFrames, options: options)
                outputRTTM(result: result, audioPath: audioPath, rttmPath: rttmPath)
                if let timings = result.timings {
                    print(timings.debugDescription)
                }
            } catch {
                await speakerKit.unloadModels()
                throw error
            }
            await speakerKit.unloadModels()
        }
    }

    /// Diarizes an audio file with the Sortformer engine in prerecorded mode.
    struct Sortformer: AsyncParsableCommand {
        static let configuration = CommandConfiguration(
            abstract: "Speaker diarization using Sortformer (macOS 15+)"
        )

        // NOTE(review): a key passed as an argument ends up in shell history and
        // process listings; consider also accepting it from the environment.
        @Option(name: .long, help: "Argmax Pro SDK API key")
        var apiKey: String

        @Option(help: "Path to the audio file to process")
        var audioPath: String

        @Option(help: "Path to save the diarization output in RTTM format")
        var rttmPath: String?

        /// Checks the OS version, configures the SDK, loads the audio, runs
        /// diarization, and emits the result through `outputRTTM`.
        ///
        /// - Throws: `ValidationError` on an OS older than the versions checked
        ///   below; otherwise any error from audio loading, `SpeakerKitPro`
        ///   initialization, or diarization.
        mutating func run() async throws {
            // The package targets macOS 13, so the newer-OS requirement of this
            // engine is enforced at run time.
            guard #available(macOS 15, iOS 18, watchOS 11, visionOS 2, *) else {
                throw ValidationError("Sortformer requires macOS 15 / iOS 18 or later.")
            }

            print("Initializing Argmax Pro SDK...")
            let sdkConfig = ArgmaxConfig(apiKey: apiKey)
            await ArgmaxSDK.with(sdkConfig)

            print("Loading audio file from \(audioPath)...")
            let audioFrames = try AudioProcessor.loadAudioAsFloatArray(fromPath: audioPath)

            print("Initializing SpeakerKitPro (Sortformer)...")
            let speakerKit = try await SpeakerKitPro()

            print("Running diarization...")
            let options = SortformerDiarizationOptions(sortformerMode: .prerecorded)

            // The unload is awaited on both paths instead of being spawned from a
            // `defer` in an unstructured Task, which the process could exit
            // without ever running.
            do {
                let result = try await speakerKit.diarize(audioArray: audioFrames, options: options)
                outputRTTM(result: result, audioPath: audioPath, rttmPath: rttmPath)
                if let timings = result.timings {
                    print(timings.debugDescription)
                }
            } catch {
                await speakerKit.unloadModels()
                throw error
            }
            await speakerKit.unloadModels()
        }
    }
}
/// Renders a diarization result as RTTM text and either saves or prints it.
///
/// A failed write is reported on standard error rather than thrown, so the
/// caller continues after the failure.
///
/// - Parameters:
///   - result: Diarization result passed to `SpeakerKit.generateRTTM(from:fileName:)`.
///   - audioPath: Path of the source audio; its last path component, minus the
///     extension, is used as the RTTM file name.
///   - rttmPath: Destination file for the RTTM text. When `nil`, the text is
///     printed to standard output instead.
private func outputRTTM(result: DiarizationResult, audioPath: String, rttmPath: String?) {
    let fileName = URL(filePath: audioPath).deletingPathExtension().lastPathComponent
    let rttmContent = SpeakerKit.generateRTTM(from: result, fileName: fileName)
        .map { $0.description }
        .joined(separator: "\n")

    // No destination given: the RTTM itself is the program's output.
    guard let rttmPath else {
        print(rttmContent)
        return
    }

    do {
        try rttmContent.write(to: URL(filePath: rttmPath), atomically: true, encoding: .utf8)
        print("RTTM saved to \(rttmPath)")
    } catch {
        // Diagnostics belong on stderr so they stay out of redirected stdout.
        FileHandle.standardError.write(Data("Failed to write RTTM: \(error)\n".utf8))
    }
}
Step 2: Build and run in Terminal
Run one of the following commands in your Terminal from within the top-level project directory:
# Pyannote (macOS 13+)
swift run argmax-test-cli pyannote --api-key <API_KEY> --audio-path <AUDIO_PATH>
# Sortformer (macOS 15+)
swift run argmax-test-cli sortformer --api-key <API_KEY> --audio-path <AUDIO_PATH>

If you observe "error: no registry configured for 'argmaxinc' scope", go back to Step 0.