Docs
Speaker Diarization
Speaker Diarization
Separate speakers in an audio file
Basic Example
This is a complete and self-contained CLI example project that demonstrates the usage of Argmax Pro SDK for speaker diarization on an audio file.
Step 0: Verify Pro SDK setup
Argmax Pro SDK access must be set up with SwiftPM before going through this example. If unsure, please see Upgrading to Pro SDK (Step 1 only).
Step 1: Create project directory
Create a project directory with the layout shown below, then copy the code from the following listings into Package.swift and ArgmaxTestCommand.swift.
ArgmaxSpeakerDiarizationAdvancedExample
├── Package.swift
└── Sources
    └── ArgmaxTestCLI
        └── ArgmaxTestCommand.swift
Package.swift:
// swift-tools-version: 5.10
// The swift-tools-version declares the minimum version of Swift required to build this package.
import PackageDescription
// Manifest for the example CLI: a single executable target that links the
// Argmax SDK (resolved by registry identity) and Swift Argument Parser.
let package = Package(
    name: "Argmax Test CLI",
    platforms: [
        .macOS(.v14)
    ],
    products: [
        // The product name is what `swift run` is invoked with.
        .executable(
            name: "argmax-test-cli",
            targets: ["ArgmaxTestCLI"]
        )
    ],
    dependencies: [
        // Registry dependency, referenced by package identity rather than URL.
        .package(id: "argmaxinc.argmax-sdk-swift", .upToNextMinor(from: "1.3.3")),
        .package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.3.0")
    ],
    targets: [
        .executableTarget(
            name: "ArgmaxTestCLI",
            dependencies: [
                .product(name: "Argmax", package: "argmaxinc.argmax-sdk-swift"),
                .product(name: "ArgumentParser", package: "swift-argument-parser")
            ]
        )
    ]
)
ArgmaxTestCommand.swift:
import Foundation
import ArgumentParser
import Argmax
/// Entry point of the example CLI. It exposes a single `diarize` subcommand.
@main
struct ArgmaxTestCommand: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "An example CLI tool for Argmax Pro SDK",
        subcommands: [Diarize.self]
    )

    /// Runs speaker diarization on one audio file and emits the result in RTTM
    /// format, either to a file (`--rttm-path`) or to standard output.
    struct Diarize: AsyncParsableCommand {
        static let configuration = CommandConfiguration(
            abstract: "Speaker diarization of an audio file"
        )

        @Option(name: .long, help: "Argmax Pro SDK API key")
        var apiKey: String

        @Option(help: "Path to the audio file to process")
        var audioPath: String

        @Option(help: "Path to save the diarization output in RTTM format")
        var rttmPath: String?

        @Option(help: "Number of speakers to detect if known (default: automatic)")
        var numSpeakers: Int?

        /// Executes the full pipeline: SDK setup, audio loading, model
        /// initialization, diarization, and RTTM output.
        ///
        /// - Throws: Any error propagated by audio loading, SpeakerKitPro
        ///   initialization, diarization, RTTM generation, or writing the
        ///   RTTM file.
        mutating func run() async throws {
            print("Initializing Argmax Pro SDK...")
            let sdkConfig = ArgmaxConfig(apiKey: apiKey)
            await ArgmaxSDK.with(sdkConfig)
            Logging.shared.logLevel = .debug

            print("Loading audio file from \(audioPath)")
            let audioFrames = try AudioProcessor.loadAudioAsFloatArray(fromPath: audioPath)

            print("Initializing SpeakerKitPro...")
            let config = SpeakerKitProConfig()
            print("SpeakerKitPro Config: \(config)")
            // The trailing closure logs every model state transition.
            let speakerKit = try await SpeakerKitPro(config) { (oldState: ModelState?, newState: ModelState) in
                let previousState = oldState?.description ?? "unknown"
                print("SpeakerKit model state changed from \(previousState) to \(newState.description)")
            }
            // Unload the models on every exit path, including thrown errors.
            defer { speakerKit.unloadModels() }

            print("Starting diarization...")
            try await speakerKit.initializeDiarization(audioArray: audioFrames) { audioClip in
                print("Running Speaker Segmenter model...")
                // NOTE(review): each clip is processed in an unstructured Task that is
                // not awaited here, and failures are only printed. Confirm that the SDK
                // waits for these segments before `diarize` is called below.
                Task {
                    do {
                        try await speakerKit.processSpeakerSegment(audioArray: audioClip)
                    } catch {
                        print("Error running Speaker Segmenter model: \(error)")
                    }
                }
            }

            print("Processing diarization results...")
            // `options` stays nil when --num-speakers is not supplied.
            let options = numSpeakers.map { DiarizationOptions(numberOfSpeakers: $0) }
            let diarizationResult = try await speakerKit.diarize(options: options)

            print("Generating RTTM...")
            let audioURL = URL(filePath: audioPath)
            // The audio file's base name (without extension) is passed as the RTTM file name.
            let fileName = audioURL.deletingPathExtension().lastPathComponent
            let rttmLines = try speakerKit.generateRTTM(
                from: diarizationResult,
                fileName: fileName
            )
            let rttmContent = rttmLines.map { $0.toString() }.joined(separator: "\n")

            if let rttmPath = rttmPath {
                try rttmContent.write(
                    to: URL(filePath: rttmPath),
                    atomically: true,
                    encoding: String.Encoding.utf8
                )
                print("RTTM file saved to \(rttmPath)")
            } else {
                print(rttmContent)
            }

            // `timings` is optional. Bind it instead of force-unwrapping so a result
            // without timings does not crash the tool after the RTTM has been emitted.
            if let timings = diarizationResult.timings {
                print(timings.debugDescription)
            } else {
                print("No timing information available.")
            }
        }
    }
}
Step 2: Build and run in Terminal
Run the following command in your Terminal from within the top-level project directory:
swift run argmax-test-cli diarize --api-key <API_KEY> --audio-path <AUDIO_PATH>
If you see the error "no registry configured for 'argmaxinc' scope", go back to Step 0.