
Real-time Transcription

Argmax Pro SDK includes the WhisperKitPro framework, which implements an advanced streaming inference algorithm described in our ICML 2025 paper.

Key features:

  • Accuracy in real-time mode is identical to pre-recorded transcription.
  • Dual output text streams: build trust in stable and accurate results with Confirmed Text while maximizing responsiveness with Hypothesis Text.
  • Streaming API design that exposes event-based callbacks, minimizing the burden on the caller.

Introduction

Real-time transcription streams input audio and the corresponding output text continuously during a live recording session:

  1. Input audio stream: Capturing audio in small user-defined intervals
  2. Inference: Incremental speech-to-text inference on the input stream
  3. Output text streams:
    • Confirmed: Finalized portion of the transcript that gets longer over time.
    • Hypothesis: Preliminary transcript that may still be refined as more audio context arrives.

This approach creates an ultra low-latency user experience where words appear on the screen almost as they're spoken, with occasional refinements to the most recent words as the model gathers more context.
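
For illustration, here is a minimal sketch of how a caller might combine the two output streams, using the result cases of the LiveTranscriber API shown in the examples below: the display text is the accumulated Confirmed Text followed by the latest Hypothesis Text.

// Sketch: building display text from the two output streams.
// `results` stands for the async result stream returned by LiveTranscriber
// (see the Basic Example below for the full setup).
var confirmedText = ""   // finalized transcript, only ever grows
var hypothesisText = ""  // preliminary tail, replaced as more audio arrives
for try await result in results {
    switch result {
    case .confirm(let text, _, _):
        confirmedText += " " + text
        hypothesisText = ""
    case .hypothesis(let text, _, _):
        hypothesisText = text
    }
    print(confirmedText + " " + hypothesisText)
}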



Playground Apps

You may find our end-to-end open-source example apps at:

Basic Example

Step 0: Verify Pro SDK setup

Argmax Pro SDK access must be set up with SwiftPM before going through this example. If unsure, please see Installation (Step 1 only).

Step 1: Create project

This is a complete and self-contained CLI example project that demonstrates the usage of Argmax Pro SDK for real-time transcription from a microphone input stream.

Create the project directory structure shown below, then insert the code that follows into ArgmaxTestCommand.swift and Package.swift.

ArgmaxSDKRealTimeTranscriptionBasicExample
├── Package.swift
└── Sources
    └── ArgmaxTestCLI
        └── ArgmaxTestCommand.swift

Package.swift:

// swift-tools-version: 5.10
// The swift-tools-version declares the minimum version of Swift required to build this package.
 
import PackageDescription
 
let package = Package(
    name: "Argmax Test CLI",
    platforms: [
        .macOS(.v14)
    ],
    products: [
        .executable(
            name: "argmax-test-cli",
            targets: ["ArgmaxTestCLI"]
        )
    ],
    dependencies: [
        .package(id: "argmaxinc.argmax-sdk-swift", .upToNextMinor(from: "1.7.0")),
        .package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.3.0")
    ],
    targets: [
        .executableTarget(
            name: "ArgmaxTestCLI",
            dependencies: [
                .product(name: "Argmax", package: "argmaxinc.argmax-sdk-swift"),
                .product(name: "ArgumentParser", package: "swift-argument-parser")
            ]
        ),
    ]
)

ArgmaxTestCommand.swift:

import Foundation
@preconcurrency import ArgumentParser
@preconcurrency import Argmax
import Combine
 
@main
struct ArgmaxTestCommand: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "An example CLI tool for Argmax Pro SDK",
        subcommands: [Transcribe.self]
    )
 
    struct Transcribe: AsyncParsableCommand {
        static let configuration = CommandConfiguration(
            abstract: "Real-time transcription using system microphone"
        )
 
        @Option(help: "Argmax Pro SDK API key")
        var apiKey: String
 
        @Option(help: "Model name: e.g. `parakeet-v2_476MB`, `tiny.en`, `large-v3-v20240930_626MB`. Default: `parakeet-v2_476MB`")
        var modelName: String = "parakeet-v2_476MB"
        
        @Option(help: "Mode: e.g. `alwaysOn`, `voiceTriggered`, `batteryOptimized`. Default: `voiceTriggered`")
        var mode: String = "voiceTriggered"
 
        func run() async throws {
 
            print("Initializing Argmax Pro SDK...")
 
            let sdkConfig = ArgmaxConfig(apiKey: apiKey)
            sdkConfig.keychainPersistence = false
            await ArgmaxSDK.with(sdkConfig)
 
            print("Downloading \(modelName) model using ModelStore...")
            let modelStore = ModelStore()
            let repoType: RepoType
            
            if modelName.lowercased().contains("parakeet") {
                repoType = .parakeetRepo
            } else if #available(macOS 15, *) {
                repoType = .proRepo
            } else if #available(macOS 13, *) {
                repoType = .openSourceRepo
            } else {
                fatalError("Oldest supported macOS is 13.")
            }
            
            // Track download progress; keep a reference to the sink so the
            // Combine subscription stays alive for the duration of the download
            let progressSink = modelStore.$progress.sink { progress in
                if let progress = progress {
                    let percentage = Int(progress.fractionCompleted * 100)
                    print("\rDownload progress: \(percentage)%", terminator: "")
                    fflush(stdout)
                }
            }
            
            let downloadURL = try await modelStore.downloadModel(
                name: modelName,
                repo: repoType
            )
            progressSink.cancel()
            
            let modelFolder = downloadURL.path(percentEncoded: false)
            print("\nDownload completed: \(modelFolder)")
 
            let liveTranscriber = try await setupLiveTranscriber(modelFolder: modelFolder)
 
            let streamMode: StreamTranscriptionMode
            switch mode {
            case "alwaysOn":
                streamMode = .alwaysOn
            case "voiceTriggered":
                streamMode = .voiceTriggered(minProcessInterval: 0.3)
            case "batteryOptimized":
                streamMode = .batteryOptimized
            default:
                streamMode = .voiceTriggered(minProcessInterval: 0.3)
            }
            print("\nStream mode: \(streamMode)")
 
            try await transcribeStream(liveTranscriber: liveTranscriber, mode: streamMode)
        }
 
        private func setupLiveTranscriber(modelFolder: String) async throws -> LiveTranscriber {
            let whisperConfig = WhisperKitProConfig(
                modelFolder: modelFolder,
                verbose: true,
                logLevel: .debug
            )
            let whisperKitPro = try await WhisperKitPro(whisperConfig)
            try await whisperKitPro.loadModels()
            let liveTranscriber = LiveTranscriber(whisperKit: whisperKitPro)
            return liveTranscriber
        }
 
        private func transcribeStream(liveTranscriber: LiveTranscriber, mode: StreamTranscriptionMode) async throws {
            let macbookMicrophone = AudioProcessor.getAudioDevices().first(where: { $0.name == "MacBook Pro Microphone" })
            let deviceSource = ArgmaxSource(streamType: .device(macbookMicrophone?.id))
 
            let options = DecodingOptionsPro(
                base: .init(
                        task: .transcribe,
                        wordTimestamps: true,
                        chunkingStrategy: .vad
                    ),
                transcribeInterval: 0.1,
                streamTranscriptionMode: mode,
                alignTimestampsToGlobal: true
            )
 
            try await liveTranscriber.registerStream(
                streamSource: deviceSource,
                options: options
            )
 
            let deviceResults = try await liveTranscriber.startTranscription(for: deviceSource)
            
            let dateFormatter = DateFormatter()
            dateFormatter.dateFormat = "HH:mm:ss.SSS"
            let transcribeTask = Task {
                var accumulatedConfirmedText = ""
                for try await result in deviceResults {
                    let timestamp = dateFormatter.string(from: Date())
                    var hypothesisText = ""
                    switch result {
                    case .confirm(let text, _, _):
                        accumulatedConfirmedText += " " + text
                    case .hypothesis(let text, _, _):
                        hypothesisText = text
                    }
                    print("[\(timestamp)] \(accumulatedConfirmedText)\u{001B}[34m\(hypothesisText)\u{001B}[0m")
                }
                return accumulatedConfirmedText
            }
            
            signal(SIGINT, SIG_IGN)
            let signalSource = DispatchSource.makeSignalSource(signal: SIGINT, queue: DispatchQueue.main)
            signalSource.setEventHandler(handler: DispatchWorkItem(block: {
                print("Stop recording...")
                Task.detached {
                    try? await liveTranscriber.stopAndRemoveStream(for: deviceSource)
                    let accumulatedConfirmedText = try! await transcribeTask.value
                    print("\n\nTranscription: \n\n\(accumulatedConfirmedText)\n")
                }
            }))
            
            signalSource.resume()
            let _ = try! await transcribeTask.value
        }
    }
}

Step 2: Build and run

Run the following command in your Terminal from within the top-level project directory:

swift run argmax-test-cli transcribe --api-key <API_KEY>

If you observe error: no registry configured for 'argmaxinc' scope, go back to Step 0.

Here is an example output upon successful build and launch with --model-name large-v3-v20240930_turbo:


Advanced Example

The key differences between the Basic Example and Advanced Example are:

  • Instance Management: the Basic Example automatically manages the WhisperKitPro instance via LiveTranscriber; the Advanced Example requires manual setup and configuration of WhisperKitPro.
  • Session Lifecycle: the Basic Example handles session creation and lifecycle; the Advanced Example requires manual TranscribeStreamSession lifecycle management.
  • Audio Source Abstractions: the Basic Example provides convenient abstractions (ArgmaxSource, CustomSource); the Advanced Example gives direct access to audioProcessor for custom audio handling.
  • Cleanup: the Basic Example simplifies cleanup with stopAndRemoveAllTranscriptions(); the Advanced Example requires manual cleanup of streams and sessions.
  • API Surface: the Basic Example offers a user-friendly, higher-level API; the Advanced Example is lower-level and more flexible but requires more code.

For most use cases, the LiveTranscriber API is recommended. Use the low-level API when you need the additional control and are comfortable managing the complexity.

To set up the Advanced Example, modify the ArgmaxTestCommand.swift file from the Basic Example to the following:

ArgmaxTestCommand.swift

import Foundation
@preconcurrency import ArgumentParser
@preconcurrency import Argmax
import Combine
 
@main
struct ArgmaxTestCommand: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "An example CLI tool for Argmax Pro SDK",
        subcommands: [Transcribe.self]
    )
 
    struct Transcribe: AsyncParsableCommand {
        static let configuration = CommandConfiguration(
            abstract: "Real-time transcription using system microphone"
        )
 
        @Option(help: "Argmax Pro SDK API key")
        var apiKey: String
 
        @Option(help: "Model name: e.g. `parakeet-v2_476MB`, `tiny.en`, `large-v3-v20240930_626MB`. Default: `parakeet-v2_476MB`")
        var modelName: String = "parakeet-v2_476MB"
 
        @Option(help: "Mode: e.g. `alwaysOn`, `voiceTriggered`, `batteryOptimized`. Default: `voiceTriggered`")
        var mode: String = "voiceTriggered"
 
        func run() async throws {
 
            print("Initializing Argmax Pro SDK...")
 
            let sdkConfig = ArgmaxConfig(apiKey: apiKey)
            // Temporarily disable keychain access for debug
            sdkConfig.keychainPersistence = false
            await ArgmaxSDK.with(sdkConfig)
 
            print("Downloading \(modelName) model using ModelStore...")
            let modelStore = ModelStore()
            let repoType: RepoType
 
            // Pick the right model repo
            if modelName.lowercased().contains("parakeet") {
                // Use Nvidia Parakeet models
                repoType = .parakeetRepo
            } else if #available(macOS 15, *) {
                // Use Pro Whisper models
                repoType = .proRepo
            } else if #available(macOS 13, *) {
                // Use Open-source Whisper models
                repoType = .openSourceRepo
            } else {
                fatalError("Oldest supported macOS is 13.")
            }
 
            // Track download progress; keep a reference to the sink so the
            // Combine subscription stays alive for the duration of the download
            let progressSink = modelStore.$progress.sink { progress in
                if let progress = progress {
                    let percentage = Int(progress.fractionCompleted * 100)
                    print("\rDownload progress: \(percentage)%", terminator: "")
                    fflush(stdout)
                }
            }
 
            let downloadURL = try await modelStore.downloadModel(
                name: modelName,
                repo: repoType
            )
            progressSink.cancel()
 
            // To cancel the download if needed:
            // modelStore.cancelDownload()
 
            let modelFolder = downloadURL.path(percentEncoded: false)
            print("\nDownload completed: \(modelFolder)")
 
            let whisperKitPro = try await setupWhisperKitPro(modelFolder: modelFolder)
 
            // Set stream transcription mode
            let streamMode: StreamTranscriptionMode
            switch mode {
            case "alwaysOn":
                streamMode = .alwaysOn
            case "voiceTriggered":
                streamMode = .voiceTriggered(minProcessInterval: 0.3)
            case "batteryOptimized":
                streamMode = .batteryOptimized
            default:
                streamMode = .voiceTriggered(minProcessInterval: 0.3) // fallback to default
            }
            print("\nStream mode: \(streamMode)")
 
            try await transcribeStream(whisperKitPro: whisperKitPro, mode: streamMode)
        }
 
        private func setupWhisperKitPro(modelFolder: String) async throws -> WhisperKitPro {
            let config = WhisperKitProConfig(
                modelFolder: modelFolder,
                verbose: false,
                logLevel: .debug
            )
 
            let whisperKitPro = try await WhisperKitPro(config)
            try await whisperKitPro.loadModels()
            return whisperKitPro
        }
 
        private func transcribeStream(whisperKitPro: WhisperKitPro, mode: StreamTranscriptionMode) async throws {
            print("Transcribing with low-level WhisperKitPro API...")
 
            // 1. Configure decoding options
            let options = DecodingOptionsPro(
                base: .init(
                        verbose: true,
                        task: .transcribe,
                        skipSpecialTokens: true,
                        wordTimestamps: true,
                        chunkingStrategy: .vad
                    ),
                transcribeInterval: 0.1,
                streamTranscriptionMode: mode
            )
 
            // 2. Create audio stream
            let (stream, continuation) = whisperKitPro.audioProcessor.startStreamingRecordingLive()
 
            // 3. Create transcription session
            let session = whisperKitPro.makeStreamSession(options: options)
 
            // 4. Start processing
            await session.start(audioInputStream: stream)
 
            // 5. Process results
            let dateFormatter = DateFormatter()
            dateFormatter.dateFormat = "HH:mm:ss.SSS"
            let transcribeTask = Task {
                for try await result in session.results {
                    let timestamp = dateFormatter.string(from: Date())
                    if let hypothesis = result.hypothesisText {
                        print("[\(timestamp)] Hypothesis: \(hypothesis)")
                    }
                    if !result.text.isEmpty {
                        print("[\(timestamp)] Confirmed: \(result.text)")
                    }
 
//                    // Access word-level timestamps for words in Hypothesis Text
//                    if !result.hypothesisSegments.isEmpty {
//                        print("\nHypothesis Text Word Timestamps:")
//                        result.hypothesisSegments.forEach { segment in
//                            segment.words?.forEach { word in
//                                print("\(word.word) - \(word.start), \(word.end)")
//                            }
//                        }
//                    }
                }
            }
 
            // 6. Handle termination signal
            signal(SIGINT, SIG_IGN)
            let signalSource = DispatchSource.makeSignalSource(signal: SIGINT, queue: DispatchQueue.main)
            signalSource.setEventHandler(handler: DispatchWorkItem(block: {
                print("Stop recording...")
                Task {
                    // Cleanup - finish the stream
                    continuation.finish()
                }
            }))
            signalSource.resume()
 
            // Wait for transcription task to complete
            try await transcribeTask.value
        }
    }
}

Advanced Features

Modes

Without adaptive behavior, real-time transcription would continuously process audio as fast as each inference request completes. Downsides may include:

  • False positive predictions from background noise when there is no active speaker
  • Device resources being utilized indiscriminately and potentially impacting battery life

For this purpose, we built StreamTranscriptionMode, which lets developers configure adaptive behavior based on the input audio and other use-case intent.

.voiceTriggered (Default)

This mode processes audio only when there is sufficiently high energy in the input audio. To see it in action, refer to the CLI example video above.

This mode's behavior is customizable via silenceThreshold, maxBufferLength, and minProcessInterval, but the default values work well for a wide range of use cases.
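
For example, the mode is selected when building DecodingOptionsPro, as in the CLI example above. A minimal sketch that only customizes minProcessInterval (the other parameters are left at their defaults):

// Sketch: voice-triggered mode with a custom minProcessInterval.
// silenceThreshold and maxBufferLength keep their default values here.
let options = DecodingOptionsPro(
    base: .init(
        task: .transcribe,
        wordTimestamps: true,
        chunkingStrategy: .vad
    ),
    transcribeInterval: 0.1,
    streamTranscriptionMode: .voiceTriggered(minProcessInterval: 0.3)
)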

.batteryOptimized

Battery-optimized mode is built for use cases where battery life and thermal sustainability are the top optimization objectives. It builds on the voice-triggered mode but inserts additional adaptive delays to throttle inference while keeping latency as low as possible.

.alwaysOn

This mode disables adaptive behavior entirely and is not recommended unless a specific use case requires it.

Background Processing on iOS

Real-time transcription can be sustained even after your app is backgrounded if the following is inserted into your app's Info.plist:

	<key>UIBackgroundModes</key>
	<array>
		<string>audio</string>
	</array>

This works for whisperKitPro.audioProcessor.startStreamingRecordingLive() because it uses AVAudioSession under the hood. Other audio sources built with AVAudioSession will also work in the background with this change.
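
If you capture audio yourself instead of using startStreamingRecordingLive(), the setup below is a minimal sketch using standard AVFoundation API on iOS (not part of the Argmax Pro SDK); combined with the UIBackgroundModes entry above, it keeps delivering audio after the app is backgrounded.

import AVFoundation

// Sketch: AVAudioSession setup for a custom capture pipeline on iOS.
func configureAudioSessionForBackgroundCapture() throws {
    let session = AVAudioSession.sharedInstance()
    // A record-capable category is required for background audio capture.
    try session.setCategory(.playAndRecord, mode: .default, options: [.mixWithOthers])
    try session.setActive(true)
}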

Bring Your Own Audio Stream

If you have your own ([Float]) -> Void audio callback, you can wrap it in an AsyncStream and feed that stream to TranscribeStreamSession.start. Argmax uses the same pattern internally to convert WhisperKit AudioProcessor's callback into an AsyncStream.
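
A minimal sketch of that conversion, assuming your capture layer lets you register a ([Float]) -> Void handler (registerHandler below is a placeholder for your own API, not an SDK symbol):

// Wraps a callback-based audio source into an AsyncStream<[Float]> that can be
// passed to TranscribeStreamSession.start(audioInputStream:).
// `registerHandler` stands in for however your capture code installs a
// per-buffer callback; replace it with your own mechanism.
func makeAudioStream(
    registerHandler: @escaping (@escaping ([Float]) -> Void) -> Void
) -> AsyncStream<[Float]> {
    AsyncStream<[Float]> { continuation in
        registerHandler { samples in
            // Forward each buffer of samples into the async stream.
            continuation.yield(samples)
        }
    }
}

The resulting stream can then be passed to session.start(audioInputStream:) exactly like the stream returned by startStreamingRecordingLive().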

Multiple Audio Streams

This feature allows multiple input audio streams to be real-time transcribed by the same LiveTranscriber instance. An example use case is concurrent real-time transcription of system audio and microphone for meeting transcriptions.

Before implementing multi-stream transcription, ensure that the ArgmaxTestCommand from Step 1 works correctly, particularly its transcribeStream function which demonstrates the basic LiveTranscriber usage.

Multi-Stream Architecture

The same LiveTranscriber instance can efficiently handle multiple audio streams simultaneously. Each stream gets its own registered source that shares the same LiveTranscriber instance but maintains independent processing context, allowing them to run concurrently without interference.

Example Implementation

private func transcribeMultipleStreams() async throws {
    // 1. Setup LiveTranscriber (same as single stream example)
    let liveTranscriber = try await setupLiveTranscriber()
    
    // 2. Create stream sources for each audio input
    
    // System audio stream (custom stream - for capturing system/app audio)
    let (systemStream, systemContinuation) = createSystemAudioStream() // Your implementation
    let systemSource = CustomSource(
        id: "system-audio",
        audioStream: systemStream,
        audioContinuation: systemContinuation
    )
    
    // Device microphone stream (using built-in device source)
    let deviceSource = ArgmaxSource(streamType: .device())
    
    // 3. Configure decoding options
    let options = DecodingOptionsPro(
        base: .init(
                verbose: true,
                task: .transcribe,
                skipSpecialTokens: true,
                wordTimestamps: true,
                chunkingStrategy: .vad
            ),
        transcribeInterval: 0.1
    )
    
    // 4. Register both streams with LiveTranscriber
    try await liveTranscriber.registerStream(streamSource: systemSource, options: options)
    try await liveTranscriber.registerStream(streamSource: deviceSource, options: options)
    
    // 5. Start transcription and get results stream for each source
    let systemResults = try await liveTranscriber.startTranscription(for: systemSource)
    let deviceResults = try await liveTranscriber.startTranscription(for: deviceSource)
    
    // 6. Process results from both streams concurrently
    // LiveResult parameters:
    // - text: Transcribed text (confirmed or hypothesis)
    // - seconds: Elapsed time since transcription started (monotonic)
    // - result: Complete TranscriptionResultPro (segments, timings, metadata)
    try await withThrowingTaskGroup(of: Void.self) { group in
        // System audio results processing
        group.addTask {
            for try await result in systemResults {
                switch result {
                case .confirm(let text, let seconds, _):
                    print("[SYSTEM] Confirmed: \(text) at \(seconds)s")
                case .hypothesis(let text, let seconds, _):
                    print("[SYSTEM] Hypothesis: \(text) at \(seconds)s")
                }
            }
        }
        
        // Device microphone results processing
        group.addTask {
            for try await result in deviceResults {
                switch result {
                case .confirm(let text, let seconds, _):
                    print("[DEVICE] Confirmed: \(text) at \(seconds)s")
                case .hypothesis(let text, let seconds, _):
                    print("[DEVICE] Hypothesis: \(text) at \(seconds)s")
                }
            }
        }
        
        // Wait for all processing to complete
        try await group.waitForAll()
    }
    
    // 7. Cleanup - stop and remove all transcriptions
    try await liveTranscriber.stopAndRemoveAllTranscriptions()
}
 
 

Key Considerations

  • Stream Management: Each audio stream requires its own ArgmaxSource (either built-in like .device() or custom like CustomSource) registered with the LiveTranscriber.

  • Individual Results Streams: Each source gets its own results stream from startTranscription(for: source) - no need to filter by stream ID.

  • Resource Management: The shared LiveTranscriber instance efficiently manages computational resources across all streams while maintaining independent processing contexts.

  • Graceful Termination: Call stopAndRemoveAllTranscriptions() to stop and remove all registered streams at once, ensuring proper cleanup and graceful termination of transcription sessions.

With Speakers

Argmax Pro SDK 2 and newer support real-time transcription with speakers. Real-time speaker diarization is implemented with Nvidia Sortformer, which currently supports a maximum of 4 speakers.



combinedSession.results emits two distinct result types via TranscribeDiarizeResultType:

  • .newTranscription — new transcription text arrived; confirmedWordsWithSpeakers, hypothesisWordsWithSpeakers, and seekTime are all updated.
  • .speakerRevision — Sortformer retroactively revised speaker labels on already-confirmed words without new text. Use result.seekTime as a key to find and update (not append) a previously stored batch. In a UI app, replace the speaker assignments in-place; in a CLI context, reprint the revised grouping.
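
For example, a consumer can key its stored batches by seekTime so that both cases reduce to the same in-place update. A minimal sketch, assuming a simple in-memory store (TranscriptStore is illustrative, not an SDK type; it only relies on the seekTime and confirmedWordsWithSpeakers fields described above):

// Illustrative store: one confirmed batch per seekTime, so a .speakerRevision
// can replace speaker labels in place without appending new text.
struct TranscriptStore {
    private var batches: [Float: [WordWithSpeaker]] = [:]

    // Call with result.seekTime and result.confirmedWordsWithSpeakers for both
    // .newTranscription and .speakerRevision results.
    mutating func upsert(seekTime: Float, words: [WordWithSpeaker]) {
        batches[seekTime] = words
    }

    // Flattened view in time order, e.g. for rendering the transcript.
    func orderedWords() -> [WordWithSpeaker] {
        batches.keys.sorted().flatMap { batches[$0] ?? [] }
    }
}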

Basic Example

Use the same Package.swift from the Basic Example above, but set the platform to .macOS(.v15) and the argmaxinc.argmax-sdk-swift dependency version to 2.0.9 (required for Sortformer). The relevant Package.swift changes are:
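
// In Package.swift from the Basic Example, update the platform and the
// SDK dependency version (the .upToNextMinor constraint style follows the Basic Example):
    platforms: [
        .macOS(.v15)
    ],
    dependencies: [
        .package(id: "argmaxinc.argmax-sdk-swift", .upToNextMinor(from: "2.0.9")),
        .package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.3.0")
    ],

Then replace ArgmaxTestCommand.swift with: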

import Foundation
import ArgumentParser
import Argmax
 
@main
struct ArgmaxTestCommand: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "Real-time transcription with Speakers",
        subcommands: [Transcribe.self]
    )
 
    struct Transcribe: AsyncParsableCommand {
        static let configuration = CommandConfiguration(
            abstract: "Real-time transcription with Speakers from system microphone"
        )
 
        @Option(help: "Argmax Pro SDK API key")
        var apiKey: String
 
        func run() async throws {
            let sdkConfig = ArgmaxConfig(apiKey: apiKey)
            sdkConfig.keychainPersistence = false
            await ArgmaxSDK.with(sdkConfig)
 
            // SortformerConfig(load: true) handles model download and load automatically.
            let whisperKitPro = try await WhisperKitPro(WhisperKitProConfig(load: true))
            let speakerKit = try await SpeakerKitPro(SortformerConfig(load: true))
 
            let transcribeSession = whisperKitPro.makeStreamSession()
            let combinedSession = try await speakerKit.makeStreamSession(transcriptionSession: transcribeSession)
 
            let (micStream, micContinuation) = whisperKitPro.audioProcessor.startStreamingRecordingLive()
 
            signal(SIGINT, SIG_IGN)
            let signalSource = DispatchSource.makeSignalSource(signal: SIGINT, queue: DispatchQueue.main)
            signalSource.setEventHandler(handler: DispatchWorkItem(block: { micContinuation.finish() }))
            signalSource.resume()
 
            await combinedSession.start(audioInputStream: micStream)
 
            for try await result in combinedSession.results {
                for word in result.confirmedWordsWithSpeakers {
                    let speaker = word.speaker.map { "Speaker \($0)" } ?? "Unknown"
                    print("\(speaker): \(word.wordTiming.word)")
                }
            }
        }
    }
}

Run with:

swift run argmax-test-cli transcribe --api-key <API_KEY>

Advanced Example

The key differences between the Basic Example and Advanced Example are:

  • Model setup: the Basic Example uses SortformerConfig(load: true), which auto-downloads and loads the models; the Advanced Example downloads explicitly via ModelStore with progress reporting and uses WhisperKitProConfig(modelFolder:).
  • Result handling: the Basic Example prints confirmed words only; the Advanced Example branches on result.type and prints hypothesis and confirmed output with seekTime.
  • Speaker revision: not shown in the Basic Example; the Advanced Example's .speakerRevision case reprints the confirmed batch with revised speaker labels.
  • Diarization tuning: the Basic Example uses default options; the Advanced Example exposes maxWordGapInterval and tolerance.

To set up the Advanced Example, use the same Package.swift from the Basic Example above. Replace ArgmaxTestCommand.swift with:

ArgmaxTestCommand.swift

import Foundation
import ArgumentParser
import Argmax
import Combine
 
@main
struct ArgmaxTestCommand: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "Argmax Pro SDK example: Real-time transcription with Speakers",
        subcommands: [Transcribe.self]
    )
 
    struct Transcribe: AsyncParsableCommand {
        static let configuration = CommandConfiguration(
            abstract: "Real-time transcription with Speakers from system microphone with Argmax SDK"
        )
 
        @Option(help: "Argmax Pro SDK API key")
        var apiKey: String
 
        @Option(help: "Transcription model name (e.g. `parakeet-v2_476MB`, `large-v3-v20240930_626MB`)")
        var modelName: String = "parakeet-v2_476MB"
 
        func run() async throws {
            print("Initializing Argmax Pro SDK...")
            let sdkConfig = ArgmaxConfig(apiKey: apiKey)
            sdkConfig.keychainPersistence = false
            await ArgmaxSDK.with(sdkConfig)
 
            let whisperKitPro = try await setupWhisperKitPro(modelName: modelName)
            let (speakerKit, streamingMode) = try await setupSpeakerKitPro()
 
            print("\nRecording from microphone... (Ctrl+C to stop)\n")
            try await transcribeMic(whisperKitPro: whisperKitPro, speakerKit: speakerKit, streamingMode: streamingMode)
        }
 
        private func transcribeMic(whisperKitPro: WhisperKitPro, speakerKit: SpeakerKitPro, streamingMode: SortformerStreamingConfig) async throws {
            let (micStream, micContinuation) = whisperKitPro.audioProcessor.startStreamingRecordingLive()
 
            let transcribeTask = Task {
                try await runTranscription(audioStream: micStream, whisperKitPro: whisperKitPro, speakerKit: speakerKit, streamingMode: streamingMode)
            }
 
            signal(SIGINT, SIG_IGN)
            let signalSource = DispatchSource.makeSignalSource(signal: SIGINT, queue: DispatchQueue.main)
            signalSource.setEventHandler(handler: DispatchWorkItem(block: {
                print("\nStop recording...")
                micContinuation.finish()
            }))
            signalSource.resume()
 
            try await transcribeTask.value
        }
 
        private func runTranscription<S: AsyncSequence>(
            audioStream: S,
            whisperKitPro: WhisperKitPro,
            speakerKit: SpeakerKitPro,
            streamingMode: SortformerStreamingConfig
        ) async throws where S.Element == [Float] {
            let options = DecodingOptionsPro(
                base: DecodingOptions(
                    task: .transcribe,
                    wordTimestamps: true,
                    chunkingStrategy: .vad
                ),
                transcribeInterval: 0.1,
                streamTranscriptionMode: .voiceTriggered(minProcessInterval: 0.3),
                alignTimestampsToGlobal: true
            )
 
            let dateFormatter = DateFormatter()
            dateFormatter.dateFormat = "HH:mm:ss.SSS"
 
            let transcribeSession = whisperKitPro.makeStreamSession(options: options)
 
            var diarizationOptions = SortformerDiarizationOptions(sortformerMode: streamingMode)
            // diarizationOptions.maxWordGapInterval = 2.0  // tune speaker-word alignment gap (seconds)
            // diarizationOptions.tolerance = 0.5           // tune speaker boundary tolerance
 
            let combinedSession = try await speakerKit.makeStreamSession(
                transcriptionSession: transcribeSession,
                diarizationConfig: diarizationOptions
            )
 
            await combinedSession.start(audioInputStream: audioStream)
 
            var accumulatedText = ""
            var confirmedBatches: [(seekTime: Float, text: String)] = []
 
            for try await result in combinedSession.results {
                let timestamp = dateFormatter.string(from: Date())
 
                switch result.type {
                case .newTranscription:
                    if let hypothesisText = result.hypothesisText, !hypothesisText.isEmpty {
                        let formatted = groupWordsBySpeaker(result.hypothesisWordsWithSpeakers).joined(separator: "\n  ")
                        if !formatted.isEmpty {
                            print("[\(timestamp)] [t=\(String(format: "%.2f", result.seekTime))s] [Speakers: \(result.speakerCount)] Hypothesis:\n  \(formatted)")
                        }
                    }
                    if !result.confirmedWordsWithSpeakers.isEmpty {
                        let formatted = groupWordsBySpeaker(result.confirmedWordsWithSpeakers).joined(separator: "\n  ")
                        print("[\(timestamp)] [t=\(String(format: "%.2f", result.seekTime))s] [Speakers: \(result.speakerCount)] Confirmed:\n  \(formatted)")
                        confirmedBatches.append((result.seekTime, result.text))
                        accumulatedText = result.text
                    }
 
                case .speakerRevision:
                    // Sortformer revised speaker labels on previously confirmed words — no new text.
                    // In a UI app: find the batch where batch.seekTime == result.seekTime and replace its speaker assignments.
                    if !result.confirmedWordsWithSpeakers.isEmpty {
                        let formatted = groupWordsBySpeaker(result.confirmedWordsWithSpeakers).joined(separator: "\n  ")
                        print("[\(timestamp)] [t=\(String(format: "%.2f", result.seekTime))s] [Speaker revision]:\n  \(formatted)")
                    }
                }
            }
 
            print("\n\nFinal transcript:\n\(accumulatedText)")
        }
    }
 
}
 
// MARK: - Shared helpers
 
private func setupWhisperKitPro(modelName: String) async throws -> WhisperKitPro {
    print("Downloading \(modelName) transcription model...")
    let modelStore = ModelStore()
 
    let repoType: RepoType
    if modelName.lowercased().contains("parakeet") {
        repoType = .parakeetRepo
    } else if #available(macOS 15, *) {
        repoType = .proRepo
    } else if #available(macOS 13, *) {
        repoType = .openSourceRepo
    } else {
        fatalError("Oldest supported macOS is 13.")
    }
 
    let progressSink = modelStore.$progress.sink { progress in
        if let progress = progress {
            let pct = Int(progress.fractionCompleted * 100)
            print("\rTranscription model download: \(pct)%", terminator: "")
            fflush(stdout)
        }
    }
 
    let modelURL = try await modelStore.downloadModel(name: modelName, repo: repoType)
    progressSink.cancel()
    print("\nTranscription model ready: \(modelURL.lastPathComponent)")
 
    print("Loading transcription model...")
    let whisperKitPro = try await WhisperKitPro(WhisperKitProConfig(
        modelFolder: modelURL.path(percentEncoded: false),
        verbose: false,
        logLevel: .none
    ))
    try await whisperKitPro.loadModels()
    print("Transcription model loaded.")
 
    return whisperKitPro
}
 
private func setupSpeakerKitPro() async throws -> (SpeakerKitPro, SortformerStreamingConfig) {
    print("Downloading diarization models...")
    let sortformerConfig = SortformerConfig(load: true, streamingConfig: .realtime)
    let speakerKit = try await SpeakerKitPro(sortformerConfig)
    print("Diarization model loaded.")
    return (speakerKit, .realtime)
}
 
// Groups consecutive words from the same speaker into labeled runs.
private func groupWordsBySpeaker(_ words: [WordWithSpeaker]) -> [String] {
    var lines: [String] = []
    var currentSpeaker: Int? = nil
    var currentText = ""
 
    for wordWithSpeaker in words {
        if wordWithSpeaker.speaker != currentSpeaker {
            if !currentText.isEmpty {
                let label = currentSpeaker.map { "Speaker \($0)" } ?? "Unknown"
                lines.append("[\(label)]: \(currentText.trimmingCharacters(in: .whitespaces))")
            }
            currentSpeaker = wordWithSpeaker.speaker
            currentText = wordWithSpeaker.wordTiming.word
        } else {
            currentText += wordWithSpeaker.wordTiming.word
        }
    }
 
    if !currentText.isEmpty {
        let label = currentSpeaker.map { "Speaker \($0)" } ?? "Unknown"
        lines.append("[\(label)]: \(currentText.trimmingCharacters(in: .whitespaces))")
    }
 
    return lines
}