Private Beta
Docs
Speaker Diarization

Speaker Diarization

Separate speakers in an audio file

Basic Example

This is a complete and self-contained CLI example project that demonstrates the usage of Argmax Pro SDK for speaker diarization on an audio file.

Step 0: Verify Pro SDK setup

Argmax Pro SDK access must be set up with SwiftPM before going through this example. If unsure, please see Upgrading to Pro SDK (Step 1 only).

Step 1: Create project directory

Create a project directory with the layout shown below, then copy the code from the following listings into Package.swift and ArgmaxTestCommand.swift.

ArgmaxSpeakerDiarizationBasicExample
├── Package.swift
└── Sources
    └── ArgmaxTestCLI
        └── ArgmaxTestCommand.swift

Package.swift:

// swift-tools-version: 5.10
// The swift-tools-version declares the minimum version of Swift required to build this package.
 
import PackageDescription
 
// External packages: the Argmax SDK (referenced by registry identity) and
// Swift Argument Parser (referenced by Git URL, pinned to an exact version).
let packageDependencies: [Package.Dependency] = [
    .package(id: "argmaxinc.argmax-sdk-swift", .upToNextMinor(from: "1.3.3")),
    .package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.3.0"),
]

// The single executable target that holds the CLI sources.
let cliTarget: Target = .executableTarget(
    name: "ArgmaxTestCLI",
    dependencies: [
        .product(name: "Argmax", package: "argmaxinc.argmax-sdk-swift"),
        .product(name: "ArgumentParser", package: "swift-argument-parser"),
    ]
)

// Manifest: one macOS 14+ executable product built from the CLI target.
let package = Package(
    name: "Argmax Test CLI",
    platforms: [.macOS(.v14)],
    products: [
        .executable(name: "argmax-test-cli", targets: [cliTarget.name]),
    ],
    dependencies: packageDependencies,
    targets: [cliTarget]
)
 

ArgmaxTestCommand.swift:

import Foundation
import ArgumentParser
import Argmax
 
 
/// Root command of the example CLI.
///
/// It has no behavior of its own; it only exposes the `diarize` subcommand.
@main
struct ArgmaxTestCommand: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "An example CLI tool for Argmax Pro SDK",
        subcommands: [Diarize.self]
    )

    /// Runs speaker diarization on a single audio file and emits the result in RTTM format.
    struct Diarize: AsyncParsableCommand {
        static let configuration = CommandConfiguration(
            abstract: "Speaker diarization of an audio file"
        )

        /// API key passed to `ArgmaxConfig`.
        @Option(name: .long, help: "Argmax Pro SDK API key")
        var apiKey: String

        /// Location of the input audio file.
        @Option(help: "Path to the audio file to process")
        var audioPath: String

        /// Destination for the RTTM output; when omitted, the RTTM text is printed to stdout.
        @Option(help: "Path to save the diarization output in RTTM format")
        var rttmPath: String?

        /// Optional speaker-count hint; when omitted, no `DiarizationOptions` are passed.
        @Option(help: "Number of speakers to detect if known (default: automatic)")
        var numSpeakers: Int?

        /// Executes the full pipeline: SDK setup, audio loading, model initialization,
        /// segmentation, diarization, RTTM generation, and timing report.
        ///
        /// - Throws: Any error raised by audio loading, SpeakerKitPro initialization,
        ///   diarization, RTTM generation, or writing the RTTM file.
        mutating func run() async throws {
            print("Initializing Argmax Pro SDK...")
            let sdkConfig = ArgmaxConfig(apiKey: apiKey)
            await ArgmaxSDK.with(sdkConfig)

            Logging.shared.logLevel = .debug

            print("Loading audio file from \(audioPath)")
            let audioFrames = try AudioProcessor.loadAudioAsFloatArray(fromPath: audioPath)

            print("Initializing SpeakerKitPro...")

            let config = SpeakerKitProConfig()
            print("SpeakerKitPro Config: \(config)")

            // The trailing closure reports every model state transition to stdout.
            let speakerKit = try await SpeakerKitPro(config) { (oldState: ModelState?, newState: ModelState) in
                let previousState = oldState?.description ?? "unknown"
                print("SpeakerKit model state changed from \(previousState) to \(newState.description)")
            }

            // Release the models on every exit path, including thrown errors.
            defer { speakerKit.unloadModels() }

            print("Starting diarization...")
            try await speakerKit.initializeDiarization(audioArray: audioFrames) { audioClip in
                print("Running Speaker Segmenter model...")
                // NOTE(review): each clip is processed in an unstructured Task that is not
                // awaited here; presumably the SDK waits for outstanding segment work before
                // `diarize` returns its result — confirm against the SpeakerKitPro documentation.
                Task {
                    do {
                        try await speakerKit.processSpeakerSegment(audioArray: audioClip)
                    } catch {
                        print("Error running Speaker Segmenter model: \(error)")
                    }
                }
            }

            print("Processing diarization results...")
            // `options` stays nil unless the user supplied a speaker count.
            let options = numSpeakers.map { DiarizationOptions(numberOfSpeakers: $0) }
            let diarizationResult = try await speakerKit.diarize(options: options)

            print("Generating RTTM...")
            // The RTTM file identifier is the audio file name without its extension.
            let audioURL = URL(filePath: audioPath)
            let fileName = audioURL.deletingPathExtension().lastPathComponent
            let rttmLines = try speakerKit.generateRTTM(
                from: diarizationResult,
                fileName: fileName
            )

            let rttmContent = rttmLines.map { $0.toString() }.joined(separator: "\n")
            if let rttmPath = rttmPath {
                try rttmContent.write(
                    to: URL(filePath: rttmPath),
                    atomically: true,
                    encoding: String.Encoding.utf8
                )
                print("RTTM file saved to \(rttmPath)")
            } else {
                print(rttmContent)
            }

            // `timings` is optional: print the report only when present instead of
            // force-unwrapping, which would crash after the results were already produced.
            if let timings = diarizationResult.timings {
                print(timings.debugDescription)
            }
        }
    }
}
 

Step 2: Build and run in Terminal

Run the following command in your Terminal from within the top-level project directory:

swift run argmax-test-cli diarize --api-key <API_KEY> --audio-path <AUDIO_PATH>

If you see "error: no registry configured for 'argmaxinc' scope", the Pro SDK package registry has not been set up yet; go back to Step 0.