ai/ios-skills/ios-axiom-ios-ml/coreml-ref/SKILL.md
CoreML API reference - MLModel lifecycle, MLTensor operations, coremltools conversion, compression APIs, state management, compute device availability, performance profiling.
npx skillsauth add kurko/dotfiles coreml-ref
Install this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
// The three loads below are alternatives; each declares its own `model`, so use only one.
// Synchronous load (blocks thread)
let model = try MLModel(contentsOf: compiledModelURL)
// Async load (preferred)
let model = try await MLModel.load(contentsOf: compiledModelURL)
// With configuration
let config = MLModelConfiguration()
config.computeUnits = .all // .cpuOnly, .cpuAndGPU, .cpuAndNeuralEngine
let model = try await MLModel.load(contentsOf: url, configuration: config)
| Type | Extension | Purpose |
|------|-----------|---------|
| Source | .mlmodel, .mlpackage | Development, editing |
| Compiled | .mlmodelc | Runtime execution |
Note: Xcode compiles source models automatically. At runtime, use compiled models.
First load triggers device specialization (can be slow). Subsequent loads use cache.
Load flow:
├─ Check cache for (model path + configuration + device)
│ ├─ Found → Cached load (fast)
│ └─ Not found → Device specialization
│ ├─ Parse model
│ ├─ Optimize operations
│ ├─ Segment for compute units
│ ├─ Compile for each unit
│ └─ Cache result
Cache invalidated by: system updates, low disk space, model modification.
// Load specific function
let config = MLModelConfiguration()
config.functionName = "sticker" // Function name from model
let model = try MLModel(contentsOf: url, configuration: config)
// Check available compute devices
let devices = MLModel.availableComputeDevices

// Check for Neural Engine
let hasNeuralEngine = devices.contains { device in
    if case .neuralEngine = device { return true }
    return false
}

// Describe each available device
for device in devices {
    switch device {
    case .cpu:
        print("CPU available")
    case .gpu(let gpu):
        // MLGPUComputeDevice has no `name` of its own; the name is on the
        // underlying Metal device.
        print("GPU: \(gpu.metalDevice.name)")
    case .neuralEngine(let ne):
        print("Neural Engine: \(ne.totalCoreCount) cores")
    @unknown default:
        break
    }
}
| Value | Behavior |
|-------|----------|
| .all | Best performance (default) |
| .cpuOnly | CPU only |
| .cpuAndGPU | Exclude Neural Engine |
| .cpuAndNeuralEngine | Exclude GPU |
// Single prediction (NOT thread-safe)
let output = try model.prediction(from: input)

// Batch prediction — `batch` is an MLBatchProvider.
// The single-argument form is `predictions(fromBatch:)`; the `from:` label
// is only used by the overload that also takes `options:`.
let outputs = try model.predictions(fromBatch: batch)

// Single prediction (thread-safe, supports concurrency)
let output = try await model.prediction(from: input)

// With cancellation
let output = try await withTaskCancellationHandler {
    try await model.prediction(from: input)
} onCancel: {
    // Prediction will be cancelled
}
// Create state from model
let state = model.makeState()
// Prediction with state (state updated in-place)
let output = try model.prediction(from: input, using: state)
// Async with state
let output = try await model.prediction(from: input, using: state)
import CoreML
// From MLShapedArray
let shapedArray = MLShapedArray<Float>(scalars: [1, 2, 3, 4], shape: [2, 2])
let tensor = MLTensor(shapedArray)
// From nested collections
let tensor = MLTensor([[1.0, 2.0], [3.0, 4.0]])
// Zeros/ones
let zeros = MLTensor(zeros: [3, 3], scalarType: Float.self)
// Element-wise
let sum = tensor1 + tensor2
let product = tensor1 * tensor2
let scaled = tensor * 2.0
// Reductions
let mean = tensor.mean()
let sum = tensor.sum()
let max = tensor.max()
// Comparison
let mask = tensor .> mean // Boolean mask
// Softmax
let probs = tensor.softmax()
// Slicing (Python-like syntax)
let row = tensor[0] // First row
let col = tensor[.all, 0] // First column
let slice = tensor[0..<2, 1..<3]
// Reshaping
let reshaped = tensor.reshaped(to: [4])
let expanded = tensor.expandingShape(at: 0)
Critical: Tensor operations are async. Must materialize to access data.
// Materialize to MLShapedArray (the `await` suspends the caller until the result is ready)
let array = await tensor.shapedArray(of: Float.self)
// Access scalars
let values = array.scalars
import coremltools as ct
import torch
# Trace PyTorch model
model.eval()
traced = torch.jit.trace(model, example_input)
# Convert
mlmodel = ct.convert(
traced,
inputs=[ct.TensorType(shape=example_input.shape)],
outputs=[ct.TensorType(name="output")],
minimum_deployment_target=ct.target.iOS18
)
mlmodel.save("Model.mlpackage")
# Fixed shape
ct.TensorType(shape=(1, 3, 224, 224))
# Range dimension
ct.TensorType(shape=(1, ct.RangeDim(1, 2048)))
# Enumerated shapes
ct.TensorType(shape=ct.EnumeratedShapes(shapes=[(1, 256), (1, 512), (1, 1024)]))
# For stateful models (KV-cache)
states = [
ct.StateType(
name="keyCache",
wrapped_type=ct.TensorType(shape=(1, 32, 2048, 128))
),
ct.StateType(
name="valueCache",
wrapped_type=ct.TensorType(shape=(1, 32, 2048, 128))
)
]
mlmodel = ct.convert(traced, inputs=inputs, states=states, ...)
from coremltools.optimize.coreml import (
OpPalettizerConfig,
OptimizationConfig,
palettize_weights
)
# Per-tensor (iOS 17+)
config = OpPalettizerConfig(mode="kmeans", nbits=4)
# Per-grouped-channel (iOS 18+, better accuracy)
config = OpPalettizerConfig(
mode="kmeans",
nbits=4,
granularity="per_grouped_channel",
group_size=16
)
opt_config = OptimizationConfig(global_config=config)
compressed = palettize_weights(model, opt_config)
from coremltools.optimize.coreml import (
OpLinearQuantizerConfig,
OptimizationConfig,
linear_quantize_weights
)
# INT8 per-channel (iOS 17+)
config = OpLinearQuantizerConfig(mode="linear", dtype="int8")
# INT4 per-block (iOS 18+)
config = OpLinearQuantizerConfig(
mode="linear",
dtype="int4",
granularity="per_block",
block_size=32
)
opt_config = OptimizationConfig(global_config=config)
compressed = linear_quantize_weights(model, opt_config)
from coremltools.optimize.coreml import (
OpMagnitudePrunerConfig,
OptimizationConfig,
prune_weights
)
config = OpMagnitudePrunerConfig(target_sparsity=0.5)
opt_config = OptimizationConfig(global_config=config)
sparse = prune_weights(model, opt_config)
from coremltools.optimize.torch.palettization import (
DKMPalettizerConfig,
DKMPalettizer
)
config = DKMPalettizerConfig(global_config={"n_bits": 4})
palettizer = DKMPalettizer(model, config)
# Prepare (inserts palettization layers)
prepared = palettizer.prepare()
# Training loop
for epoch in range(epochs):
train_one_epoch(prepared, data_loader)
palettizer.step()
# Finalize
final = palettizer.finalize()
from coremltools.optimize.torch.layerwise_compression import (
    LayerwiseCompressor,
    LayerwiseCompressorConfig,
)

# Calibration-based (post-training) pruning. LayerwiseCompressor is configured
# with LayerwiseCompressorConfig, not MagnitudePrunerConfig; the algorithm and
# its sparsity go in the global config, the sample count at the top level.
config = LayerwiseCompressorConfig.from_dict(
    {
        "global_config": {
            "algorithm": "sparse_gpt",
            "target_sparsity": 0.4,
        },
        "calibration_nsamples": 128,
    }
)
compressor = LayerwiseCompressor(model, config)
# `device` is where calibration runs; use "cuda"/"mps" if available.
compressed = compressor.compress(calibration_loader, device="cpu")
from coremltools.models.utils import MultiFunctionDescriptor, save_multifunction

# Create descriptor
desc = MultiFunctionDescriptor()
# add_function(model_path, src_function_name, target_function_name):
# the first argument is the source .mlpackage, not the new function name.
desc.add_function(
    "model_a.mlpackage",
    src_function_name="main",
    target_function_name="function_a",
)
desc.add_function(
    "model_b.mlpackage",
    src_function_name="main",
    target_function_name="function_b",
)
# A default function must be chosen before saving.
desc.default_function_name = "function_a"
# Merge (deduplicates shared weights)
save_multifunction(desc, "merged.mlpackage")
Open model in Xcode → Predictions tab → Functions listed above inputs.
// MLComputePlan.load requires an explicit configuration.
let plan = try await MLComputePlan.load(
    contentsOf: modelURL,
    configuration: MLModelConfiguration()
)

// `modelStructure` is an enum (.program / .neuralNetwork / .pipeline);
// per-operation inspection applies to ML Programs, via a function's block.
if case let .program(program) = plan.modelStructure,
   let mainFunction = program.functions["main"] {
    // Inspect operations
    for op in mainFunction.block.operations {
        print("Op: \(op.operatorName)")
        // Both lookups are optional: not every operation has usage/cost info.
        if let usage = plan.deviceUsage(for: op) {
            print(" Preferred: \(usage.preferred)")
        }
        if let cost = plan.estimatedCost(of: op) {
            print(" Estimated cost: \(cost.weight)")
        }
    }
}
New in iOS 18: Shows estimated time per operation, compute device support hints.
Instruments → Core ML template
├─ Load events: "cached" vs "prepare and cache"
├─ Prediction intervals
├─ Compute unit usage
└─ Neural Engine activity
| Target | Key Features |
|--------|--------------|
| iOS 16 | Weight compression (palettization, quantization, pruning) |
| iOS 17 | Async prediction, MLComputeDevice, activation quantization |
| iOS 18 | MLTensor, State, SDPA fusion, per-block quantization, multi-function |
Recommendation: Always set minimum_deployment_target=ct.target.iOS18 for best optimizations.
# Default pipeline
mlmodel = ct.convert(traced, ...)
# With palettization support
mlmodel = ct.convert(
traced,
pass_pipeline=ct.PassPipeline.DEFAULT_PALETTIZATION,
...
)
WWDC: 2023-10047, 2023-10049, 2024-10159, 2024-10161
Docs: /coreml, /coreml/mlmodel, /coreml/mltensor, /documentation/coremltools
Skills: coreml, coreml-diag
data-ai
Merge the current worktree branch into main and sync main back. Use when the user says "merge to main", "ship it", "merge and continue", or after completing a task in a worktree and wanting to continue with the next one.
tools
Synchronize AI agent skills, commands, configs, permissions, hooks, and instructions across Claude Code, Codex CLI, and other Agent Skills-compatible tools. Use when the user asks to pull skills from Claude into Codex, sync Codex work back to Claude, migrate agent commands, reconcile frontmatter, update permissions, or keep agent setup files in parity.
testing
Write or update UI-independent use cases for QA. Use when the user says "write use cases", "add use cases", "QA use cases", "update use cases", "compose use cases", or when starting implementation of a new feature (after plan approval). Also activates for "what should we test", "regression cases", or "use cases for QA".
documentation
Skill on how to write a task. Use when user asks you to write a task (for Asana, Linear, Jira, Notion and equivalent). Also activates when user says "create task", "write task", or similar task creation workflow requests.