Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix audio processing edge case #237

Merged
merged 9 commits into from
Nov 1, 2024
4 changes: 2 additions & 2 deletions .github/workflows/development-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ jobs:
name: "Build and Test"
uses: ./.github/workflows/unit-tests.yml
with:
ios-version: "17.2"
macos-runner: "macos-14"
ios-version: "18.1"
macos-runner: "macos-15"

check-approvals:
runs-on: ubuntu-latest
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pre-release-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ jobs:
include:
- os: macos-13-xlarge
ios-version: "16.1" # Oldest available version
- os: macos-14
ios-version: "17.2" # Latest available version
- os: macos-15
ios-version: "18.1" # Latest available version
uses: ./.github/workflows/unit-tests.yml
with:
ios-version: ${{ matrix.ios-version }}
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,17 @@ jobs:
name: "iOS",
condition: true,
clean-destination: "generic/platform=iOS",
test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=iPhone 15",
test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=iPhone 16",
}
- {
name: "watchOS",
condition: "${{ inputs.macos-runner == 'macos-14' }}",
condition: "${{ inputs.macos-runner == 'macos-15' }}",
clean-destination: "generic/platform=watchOS",
test-destination: "platform=watchOS Simulator,OS=10.2,name=Apple Watch Ultra 2 (49mm)",
test-destination: "platform=watchOS Simulator,OS=11.1,name=Apple Watch Ultra 2 (49mm)",
}
- {
name: "visionOS",
condition: "${{ inputs.macos-runner == 'macos-14' }}",
condition: "${{ inputs.macos-runner == 'macos-15' }}",
clean-destination: "generic/platform=visionOS",
test-destination: "platform=visionOS Simulator,name=Apple Vision Pro",
}
Expand All @@ -46,7 +46,7 @@ jobs:
- uses: actions/checkout@v4
- uses: maxim-lobanov/setup-xcode@v1
with:
xcode-version: "15.2"
xcode-version: latest-stable
- name: Setup environment
run: make setup
- name: Setup Cache
Expand All @@ -66,7 +66,7 @@ jobs:
echo "Destinations for testing:"
xcodebuild test-without-building -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -showdestinations
- name: Boot Simulator and Wait
if: ${{ matrix.run-config['name'] != 'macOS' }} && ${{ inputs.macos-runner == 'macos-14' }}
# Keep both operands inside ONE expression: two separate ${{ }} blocks joined by a
# literal " && " render to a non-empty string (e.g. "false && true"), which is always truthy.
if: ${{ matrix.run-config['name'] != 'macOS' && inputs.macos-runner == 'macos-15' }}
# Slower runners require some time to fully boot the simulator
# Parse the simulator name from the destination string, boot it, and wait
run: |
Expand Down
12 changes: 7 additions & 5 deletions Sources/WhisperKit/Core/Audio/AudioProcessor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -349,13 +349,15 @@ public class AudioProcessor: NSObject, AudioProcessing {
}

let inputBuffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: maxReadFrameSize)!

while audioFile.framePosition < endFramePosition {
let remainingFrames = AVAudioFrameCount(endFramePosition - audioFile.framePosition)
var nextPosition = inputStartFrame
while nextPosition < endFramePosition {
let framePosition = audioFile.framePosition
let remainingFrames = AVAudioFrameCount(endFramePosition - framePosition)
let framesToRead = min(remainingFrames, maxReadFrameSize)
nextPosition = framePosition + Int64(framesToRead)

let currentPositionInSeconds = Double(audioFile.framePosition) / inputSampleRate
let nextPositionInSeconds = (Double(audioFile.framePosition) + Double(framesToRead)) / inputSampleRate
let currentPositionInSeconds = Double(framePosition) / inputSampleRate
let nextPositionInSeconds = Double(nextPosition) / inputSampleRate
Logging.debug("Resampling \(String(format: "%.2f", currentPositionInSeconds))s - \(String(format: "%.2f", nextPositionInSeconds))s")

do {
Expand Down
10 changes: 10 additions & 0 deletions Tests/WhisperKitTests/Evaluate/WERUtils.swift
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,14 @@ enum WERUtils {
let (_, diff) = evaluate(originalTranscript: originalTranscript, generatedTranscript: generatedTranscript)
return diff
}

/// Renders a word-level diff as a single space-separated string.
///
/// Each entry is read positionally: index 0 is the word, index 1 is an optional
/// change marker that is prefixed directly onto the word when present.
///
/// - Parameter diff: Diff entries, each expected to look like `[word, changeType]`.
///   Entries whose word is missing, `nil`, or a single space are skipped; an entry
///   without a change-type slot is emitted as the bare word.
/// - Returns: The remaining words (with any change marker prefixed) joined by `" "`.
static func diffString(from diff: [[String?]]) -> String {
    return diff.compactMap { entry -> String? in
        // `first` instead of `entry[0]`: an empty entry is skipped rather than trapping.
        guard let wordSlot = entry.first, let word = wordSlot, word != " " else { return nil }
        // `dropFirst().first` instead of `entry[1]`: tolerates a missing change-type slot.
        if let changeSlot = entry.dropFirst().first, let changeType = changeSlot {
            return "\(changeType)\(word)"
        }
        return word
    }.joined(separator: " ")
}
}
28 changes: 14 additions & 14 deletions Tests/WhisperKitTests/UnitTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1416,31 +1416,31 @@ final class UnitTests: XCTestCase {
}

/// Transcribes `ted_60.m4a` with the `.tiny` model using `.vad` chunking, checks the
/// chunk seek times against `[0, 22.9, 39]`, transcribes the same file again with
/// explicit `clipTimestamps`, and asserts a WER of 0 between the two transcripts.
///
/// NOTE(review): this span is a rendered diff hunk — removed and added lines appear
/// interleaved without +/- markers, so the repeated `options`, `chunkedResult` and
/// `testResult` declarations are old/new variants of the same statement, not
/// redeclarations. Confirm against the merged file before relying on statement order.
func testVADAudioChunkerAccuracy() async throws {
let testResult = try await XCTUnwrapAsync(
await transcribe(with: .tiny, options: DecodingOptions(), audioFile: "ted_60.m4a"),
let options = DecodingOptions(temperatureFallbackCount: 0, chunkingStrategy: .vad)

let chunkedResult = try await XCTUnwrapAsync(
await transcribe(with: .tiny, options: options, audioFile: "ted_60.m4a"),
"Failed to transcribe"
)

let options = DecodingOptions(chunkingStrategy: .vad)
let clipTimestamps = chunkedResult.compactMap(\.seekTime)
XCTAssertEqual(clipTimestamps, [0, 22.9, 39], "Clip timestamps should match the expected values, found \(clipTimestamps)")

let chunkedResult = try await XCTUnwrapAsync(
await transcribe(with: .tiny, options: options, audioFile: "ted_60.m4a"),
// Run the test using same seek values for accuracy comparison
let testResult = try await XCTUnwrapAsync(
await transcribe(with: .tiny, options: DecodingOptions(temperatureFallbackCount: 0, clipTimestamps: [0, 22.9, 22.9, 39, 39, 60]), audioFile: "ted_60.m4a"),
"Failed to transcribe"
)

XCTAssertFalse(testResult.text.isEmpty, "The test text should not be empty")
XCTAssertFalse(chunkedResult.text.isEmpty, "The chunked text should not be empty")

// Select few sentences to compare at VAD border
// TODO: test that WER is in acceptable range
// XCTAssertTrue(testResult.text.normalized.contains("I would kind".normalized), "Expected text not found in \(testResult.text.normalized)")
// XCTAssertTrue(chunkedResult.text.normalized.contains("I would kind".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
//
// XCTAssertTrue(testResult.text.normalized.contains("every single paper".normalized), "Expected text not found in \(testResult.text.normalized)")
// XCTAssertTrue(chunkedResult.text.normalized.contains("every single paper".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
// Check WER for the full audio and the chunked audio
let (wer, diff) = WERUtils.evaluate(originalTranscript: testResult.text, generatedTranscript: chunkedResult.text)

let diffDescription = WERUtils.diffString(from: diff)

XCTAssertTrue(testResult.text.normalized.contains("But then came my 90 page senior".normalized), "Expected text not found in \(testResult.text.normalized)")
XCTAssertTrue(chunkedResult.text.normalized.contains("But then came my 90 page senior".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
XCTAssertEqual(wer, 0.0, "Transcripts should match with a WER of 0, found \(wer). Full diff: \(diffDescription)")
}

#if !os(watchOS) // FIXME: This test times out on watchOS when run on low compute runners
Expand Down