agentic/code/frameworks/media-curator/skills/acquire/SKILL.md
Download media from discovered sources with format selection and progress tracking
npx skillsauth add jmagly/aiwg acquire
Install this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
Download media from discovered sources with intelligent format selection, parallel execution, and comprehensive progress tracking.
The /acquire command orchestrates media downloads from various sources (YouTube, Internet Archive, Bandcamp, direct links) using the Acquisition Manager agent. It handles:
--plan <sources.yaml>
  /find-sources command
  .curator/sources/plan-001.yaml
--url <URL>
  https://youtube.com/watch?v=abc123
--format <audio|video|best>
  audio: Extract audio only (Opus 128K default)
  video: Best video up to 1080p with audio
  best: Let agent decide based on content type (default)
  --format audio
--output <directory>
  ./downloads/<timestamp>
  <output>/<artist>/<era>/audio/ and /video/
  --output /mnt/archive/music
--parallel <N>
  --parallel 5
--local-work <directory>
  /tmp/curator-work-$$
  --local-work /fast-local-disk/tmp
--verify-after
  --verify-after
--extract-audio
  --extract-audio
--resume <session-id>
  .curator/sessions/<session-id>/state.json
  --resume 20260214-143022
# Download single YouTube video (audio only)
/acquire --url "https://youtube.com/watch?v=abc123" --format audio
# Download single video with metadata
/acquire --url "https://youtube.com/watch?v=abc123" \
--format video \
--output /mnt/archive/concerts \
--verify-after
# Download entire source plan (generated by /find-sources)
/acquire --plan .curator/sources/plan-001.yaml
# Download to network mount (uses local working dir)
/acquire --plan .curator/sources/plan-001.yaml \
--output /mnt/network-storage/archive \
--local-work /fast-ssd/curator-work \
--parallel 3
# Download videos and auto-extract audio
/acquire --plan .curator/sources/plan-001.yaml \
--format video \
--extract-audio \
--verify-after
# Result structure:
# downloads/artist/era/video/concert.mkv
# downloads/artist/era/audio/concert.opus
# Resume after network failure
/acquire --resume 20260214-143022
# Resume with different output (move completed files)
/acquire --resume 20260214-143022 \
--output /new/location
# Validate parameters
- Check --plan exists OR --url provided
- Verify output directory is writable
- Check required tools (yt-dlp, wget, curl, ffmpeg)
- Test network mount performance if applicable
- Create session directory structure
# Session setup
# Creates .curator/sessions/<timestamp>/{logs,metadata} and seeds state.json
# with the run parameters and an empty downloads list.
SESSION_ID=$(date +%Y%m%d-%H%M%S)
SESSION_DIR=".curator/sessions/$SESSION_ID"
if ! mkdir -p "$SESSION_DIR"/{logs,metadata}; then
  echo "ERROR: Cannot create session directory: $SESSION_DIR" >&2
  exit 1
fi

# Render a shell string as a JSON string literal.
# Only backslashes and double quotes are escaped; control characters
# (newlines, tabs) in a value are not handled.
json_string() {
  local raw=$1
  raw=${raw//\\/\\\\}
  raw=${raw//\"/\\\"}
  printf '"%s"' "$raw"
}

# "parallel" is emitted as a bare JSON number, so an unset or non-numeric
# PARALLEL would make state.json unparseable. Fall back to 3, the same
# default the download orchestration uses for MAX_CONCURRENT.
state_parallel=${PARALLEL:-3}
[[ "$state_parallel" =~ ^[0-9]+$ ]] || state_parallel=3

# Initialize state file
cat > "$SESSION_DIR/state.json" <<EOF
{
  "session_id": "$SESSION_ID",
  "started_at": "$(date -Iseconds)",
  "status": "initializing",
  "parameters": {
    "plan": $(json_string "$PLAN_FILE"),
    "format": $(json_string "$FORMAT"),
    "output": $(json_string "$OUTPUT_DIR"),
    "parallel": $state_parallel
  },
  "downloads": []
}
EOF
# If --plan provided, parse YAML
# Sets SOURCES (newline-separated URLs) and TOTAL_SOURCES (count of usable
# URLs), then probes each URL with yt-dlp. Inaccessible plan URLs only warn.
if [[ -n "$PLAN_FILE" ]]; then
  # Extract URLs and metadata; a plan that cannot be parsed is fatal because
  # every later step depends on the URL list.
  if ! SOURCES=$(yq eval '.sources[] | .url' "$PLAN_FILE"); then
    echo "ERROR: Cannot parse source plan: $PLAN_FILE" >&2
    exit 1
  fi

  # Count while iterating instead of `echo | wc -l`, which reports 1 for an
  # empty list and may pad its output with spaces on some platforms.
  TOTAL_SOURCES=0

  # Validate each URL is accessible
  while IFS= read -r url; do
    # Skip blank lines (an empty SOURCES still yields one empty read) and
    # the literal "null" yq prints for entries without a url key.
    [[ -n "$url" && "$url" != "null" ]] || continue
    TOTAL_SOURCES=$((TOTAL_SOURCES + 1))

    # stdin is redirected so the probe can never consume the URL list.
    if yt-dlp --dump-json "$url" >/dev/null 2>&1 </dev/null; then
      echo "VALID: $url"
    else
      echo "WARNING: Cannot access $url"
    fi
  done <<< "$SOURCES"

  if [[ $TOTAL_SOURCES -eq 0 ]]; then
    echo "WARNING: No source URLs found in $PLAN_FILE" >&2
  fi
fi

# If --url provided, validate single URL
# Unlike plan entries, a single unreachable URL aborts the run.
if [[ -n "$SINGLE_URL" ]]; then
  if ! yt-dlp --dump-json "$SINGLE_URL" >/dev/null 2>&1; then
    echo "ERROR: Cannot access URL: $SINGLE_URL"
    exit 1
  fi
fi
#######################################
# Create the <output>/<artist>/<era>/{audio,video} tree and write
# <output>/<artist>/.curator/artist-info.json.
# Globals:   SESSION_ID (read)
# Arguments: $1 - output base directory
#            $2 - artist name (free text)
#            $3 - era label (free text)
# Outputs:   "<audio_dir>:<video_dir>" on stdout
# Returns:   0 on success, 1 if a directory or the metadata file cannot
#            be created
#######################################
create_acquisition_structure() {
  local output_base="$1"
  local artist="$2"
  local era="$3"

  # Sanitize directory names: anything outside [a-zA-Z0-9_-] becomes "_"
  local safe_artist="${artist//[^a-zA-Z0-9_-]/_}"
  local safe_era="${era//[^a-zA-Z0-9_-]/_}"

  # Create directory tree
  local audio_dir="$output_base/$safe_artist/$safe_era/audio"
  local video_dir="$output_base/$safe_artist/$safe_era/video"
  local artist_meta_dir="$output_base/$safe_artist/.curator"
  mkdir -p "$audio_dir/.curator" "$video_dir/.curator" "$artist_meta_dir" \
    || return 1

  # The metadata keeps the original (unsanitized) names, so escape
  # backslashes and double quotes to keep the file valid JSON.
  local json_artist="${artist//\\/\\\\}"
  json_artist="${json_artist//\"/\\\"}"
  local json_era="${era//\\/\\\\}"
  json_era="${json_era//\"/\\\"}"

  # Write artist metadata
  cat > "$artist_meta_dir/artist-info.json" <<EOF || return 1
{
  "name": "$json_artist",
  "era": "$json_era",
  "created_at": "$(date -Iseconds)",
  "session_id": "$SESSION_ID"
}
EOF

  echo "$audio_dir:$video_dir"
}
# Download orchestration with concurrency control
# ACTIVE_DOWNLOADS holds "<download_id>:<pid>" entries for running jobs.
ACTIVE_DOWNLOADS=()
MAX_CONCURRENT=${PARALLEL:-3}
# A zero or non-numeric limit would make the slot-wait loop in
# launch_download spin forever, so fall back to the default.
if ! [[ "$MAX_CONCURRENT" =~ ^[1-9][0-9]*$ ]]; then
  echo "WARNING: Invalid parallel count '$MAX_CONCURRENT' - using 3" >&2
  MAX_CONCURRENT=3
fi
# Monotonic counter that keeps download IDs unique within a session.
DOWNLOAD_SEQ=0

#######################################
# Start one download as a background job, blocking first until fewer than
# MAX_CONCURRENT jobs are active.
# Globals:   SESSION_ID, SESSION_DIR, MAX_CONCURRENT (read)
#            ACTIVE_DOWNLOADS, DOWNLOAD_SEQ (written)
# Arguments: $1 - source URL
#            $2 - output directory containing audio/ and video/
#            $3 - format; "audio" and "bestaudio" go to audio/, anything
#                 else goes to video/
# Outputs:   appends "COMPLETED:<id>:<status>" to $SESSION_DIR/completion.log
#            when the job finishes
#######################################
launch_download() {
  local url="$1"
  local output_dir="$2"
  local format="$3"

  # A timestamp alone has one-second resolution, so several launches in the
  # same second would share an ID (and therefore a log file and state entry).
  DOWNLOAD_SEQ=$(( ${DOWNLOAD_SEQ:-0} + 1 ))
  local download_id="dl-$SESSION_ID-$(date +%s)-$DOWNLOAD_SEQ"

  # Wait for slot availability
  while [[ ${#ACTIVE_DOWNLOADS[@]} -ge $MAX_CONCURRENT ]]; do
    check_completed_downloads
    sleep 2
  done

  # Determine target directory (audio vs video)
  local target_dir
  if [[ "$format" == "audio" || "$format" == "bestaudio" ]]; then
    target_dir="$output_dir/audio"
  else
    target_dir="$output_dir/video"
  fi

  # Register the download before the job starts: the job itself reports
  # "complete"/"fail" for this ID and must not be able to do so before the
  # entry exists.
  update_state_file "add" "$download_id" "$url" "in_progress"

  # Launch download in background
  (
    download_with_retry "$url" "$target_dir" "$format" "$download_id"
    echo "COMPLETED:$download_id:$?" >> "$SESSION_DIR/completion.log"
  ) &
  local pid=$!
  ACTIVE_DOWNLOADS+=("$download_id:$pid")
}
#######################################
# Download one URL with yt-dlp, retrying up to three times with a growing
# delay (5s, then 10s) between attempts.
# Globals:   SESSION_DIR (read)
# Arguments: $1 - source URL
#            $2 - target directory for the downloaded file
#            $3 - value passed to `yt-dlp -f`
#            $4 - download ID (used for the log file name and state updates)
# Outputs:   progress on stdout; each attempt's yt-dlp output is written to
#            $SESSION_DIR/logs/<id>.log (overwritten per attempt)
# Returns:   0 on success, 1 after all attempts fail; exits with 2 on a
#            disk_full / permission_denied classification
# NOTE(review): launch_download forwards the user-facing labels
# ("audio", "video", "best") unchanged into `-f`; confirm they are mapped to
# real yt-dlp format selectors somewhere before reaching this function.
#######################################
download_with_retry() {
  local url="$1"
  local target_dir="$2"
  local format="$3"
  local download_id="$4"
  local log_file="$SESSION_DIR/logs/$download_id.log"
  local attempt=0
  local max_attempts=3
  local rc error_type wait_time

  while [[ $attempt -lt $max_attempts ]]; do
    attempt=$((attempt + 1))
    echo "[$download_id] Attempt $attempt/$max_attempts"

    yt-dlp -f "$format" \
      --output "$target_dir/%(title)s.%(ext)s" \
      --write-info-json \
      --write-thumbnail \
      --no-playlist \
      "$url" \
      2>&1 | tee "$log_file"
    # The pipeline's own status is tee's, which is almost always 0; read
    # yt-dlp's status explicitly or every failure looks like a success.
    rc=${PIPESTATUS[0]}

    if [[ $rc -eq 0 ]]; then
      update_state_file "complete" "$download_id" "" "completed"
      return 0
    fi

    # Check error type
    error_type=$(classify_error "$log_file")
    if [[ "$error_type" == "disk_full" || "$error_type" == "permission_denied" ]]; then
      echo "FATAL: $error_type - aborting all downloads"
      # $$ is the top-level script's PID even inside a background subshell,
      # so this targets every child of the main script.
      pkill -P $$ # Kill all child processes
      exit 2
    fi

    if [[ $attempt -lt $max_attempts ]]; then
      wait_time=$((5 * attempt))
      echo "Waiting ${wait_time}s before retry..."
      sleep "$wait_time"
    fi
  done

  update_state_file "fail" "$download_id" "" "failed"
  return 1
}
#######################################
# Drop finished jobs from ACTIVE_DOWNLOADS, keeping only entries whose PID
# still answers `kill -0`.
# Globals:   ACTIVE_DOWNLOADS (read and rewritten); entries are
#            "<download_id>:<pid>"
# Arguments: none
#######################################
check_completed_downloads() {
  local still_running=()
  local entry pid

  for entry in "${ACTIVE_DOWNLOADS[@]}"; do
    # The PID is everything after the last colon, so an ID that happens to
    # contain a colon cannot corrupt the lookup.
    pid="${entry##*:}"
    if kill -0 "$pid" 2>/dev/null; then
      still_running+=("$entry")
    fi
  done

  ACTIVE_DOWNLOADS=("${still_running[@]}")
}
# Real-time progress monitoring
#######################################
# Redraw a status screen from <session_dir>/state.json every 5 seconds until
# the completed and failed counts add up to the total number of downloads.
# Globals:   SESSION_ID (read)
# Arguments: $1 - session directory containing state.json
# Outputs:   status screen on stdout (the terminal is cleared each cycle)
# NOTE(review): with an empty downloads list the exit test is 0 == 0, so a
# monitor started before the first download is registered stops right away.
#######################################
monitor_progress() {
  local session_dir="$1"
  local snapshot n_total n_done n_active n_failed

  while :; do
    # Re-read the counters from the state file on every cycle
    snapshot="$session_dir/state.json"
    n_total=$(jq '.downloads | length' "$snapshot")
    n_done=$(jq '[.downloads[] | select(.status == "completed")] | length' "$snapshot")
    n_active=$(jq '[.downloads[] | select(.status == "in_progress")] | length' "$snapshot")
    n_failed=$(jq '[.downloads[] | select(.status == "failed")] | length' "$snapshot")

    # Wipe the terminal, then print the header block
    clear
    printf '%s\n' \
      "ACQUISITION PROGRESS" \
      "====================" \
      "Session: $SESSION_ID" \
      "Started: $(jq -r '.started_at' "$snapshot")" \
      "Status: $n_done/$n_total completed ($n_failed failed, $n_active in progress)" \
      "Active Downloads:"

    # One line per download that is still running
    jq -r '.downloads[] | select(.status == "in_progress") | " [\(.progress_percent)%] \(.filename) @ \(.speed_mbps)MB/s (ETA: \(.eta_seconds)s)"' "$snapshot"

    # Nothing pending any more: report and stop
    if [[ $((n_done + n_failed)) -eq $n_total ]]; then
      echo ""
      echo "All downloads completed."
      return 0
    fi

    sleep 5
  done
}
# Run in background
# Start the progress display for the current session as a background job.
# MONITOR_PID records that job's PID; $! must be read directly after the `&`
# line, before any other background job is started.
monitor_progress "$SESSION_DIR" &
MONITOR_PID=$!
# Optional post-step: for every "video" directory below OUTPUT_DIR, transcode
# each .mkv/.mp4/.webm file to Opus (128K) in the sibling "audio" directory.
# ffmpeg output is appended to $SESSION_DIR/logs/audio-extraction.log.
if [[ "$EXTRACT_AUDIO" == "true" ]]; then
  echo "Extracting audio from video files..."

  # This code runs at top level, so plain variables are used: `local` is
  # only valid inside a function and would leave them unset here.
  # NUL-delimited find output keeps paths with spaces or newlines intact.
  while IFS= read -r -d '' video_dir; do
    audio_dir="${video_dir%/video}/audio"
    # The audio directory may not exist yet for a video-only download.
    mkdir -p "$audio_dir"

    while IFS= read -r -d '' video_file; do
      base_name="${video_file##*/}"
      base_name="${base_name%.*}"
      audio_file="$audio_dir/${base_name}.opus"

      # An existing output would make ffmpeg ask whether to overwrite it.
      if [[ -s "$audio_file" ]]; then
        echo "SKIPPED (already exists): $audio_file"
        continue
      fi

      echo " $video_file -> $audio_file"
      # -nostdin: ffmpeg otherwise reads the loop's input stream and
      # swallows the remaining file names.
      ffmpeg -nostdin -i "$video_file" \
        -vn \
        -acodec libopus \
        -b:a 128K \
        "$audio_file" \
        2>&1 | tee -a "$SESSION_DIR/logs/audio-extraction.log"

      if [[ ${PIPESTATUS[0]} -eq 0 ]]; then
        echo "SUCCESS: $audio_file"
      else
        echo "FAILED: $video_file"
      fi
    done < <(find "$video_dir" -type f \
      \( -name "*.mkv" -o -name "*.mp4" -o -name "*.webm" \) -print0)
  done < <(find "$OUTPUT_DIR" -type d -name "video" -print0)
fi
# Optional post-step: check every media file below OUTPUT_DIR and record the
# outcome in $SESSION_DIR/verification.json as {"verified": [...],
# "failed": [...]}. A file fails when it is empty or when ffmpeg reports any
# problem while decoding it.
if [[ "$VERIFY_AFTER" == "true" ]]; then
  echo "Verifying downloaded files..."

  # Top-level code: plain variables, because `local` is only valid inside a
  # function and would leave them unset here.
  verification_results="$SESSION_DIR/verification.json"
  echo '{"verified": [], "failed": []}' > "$verification_results"

  # .webm is included so the list matches the formats the audio-extraction
  # step looks for.
  while IFS= read -r -d '' file; do
    echo "Verifying: $file"
    bucket="verified"

    # Check file size (GNU stat first, BSD stat as fallback)
    size=$(stat -c%s "$file" 2>/dev/null || stat -f%z "$file")

    if [[ "${size:-0}" -eq 0 ]]; then
      echo "FAILED: Zero-byte file"
      bucket="failed"
    else
      # Check media integrity with ffmpeg. With `-v error` anything printed
      # is an error report, whether or not the line contains the word
      # "error"; a non-zero exit also counts. -nostdin keeps ffmpeg from
      # consuming the file list feeding this loop.
      if ! ffmpeg_report=$(ffmpeg -nostdin -v error -i "$file" -f null - 2>&1) \
        || [[ -n "$ffmpeg_report" ]]; then
        echo "FAILED: Corrupted media file"
        bucket="failed"
      else
        echo "VERIFIED: $file ($size bytes)"
      fi
    fi

    # Pass the path as data (--arg) rather than splicing it into the jq
    # program, so quotes or backslashes in a file name cannot break it.
    if jq --arg file "$file" --arg bucket "$bucket" \
      '.[$bucket] += [$file]' "$verification_results" \
      > "$verification_results.tmp"; then
      mv "$verification_results.tmp" "$verification_results"
    else
      echo "WARNING: Could not record result for $file" >&2
      rm -f "$verification_results.tmp"
    fi
  done < <(find "$OUTPUT_DIR" -type f \
    \( -name "*.opus" -o -name "*.mkv" -o -name "*.mp4" \
    -o -name "*.webm" -o -name "*.flac" \) -print0)

  verified_count=$(jq '.verified | length' "$verification_results")
  failed_count=$(jq '.failed | length' "$verification_results")
  echo ""
  echo "Verification complete: $verified_count verified, $failed_count failed"
fi
# Final step when a local working directory was used: copy its contents to
# OUTPUT_DIR, confirm every copied file by SHA-256, and only then delete the
# local copy. Skipped when OUTPUT_DIR lies inside LOCAL_WORK.
if [[ -n "$LOCAL_WORK" && "$OUTPUT_DIR" != "$LOCAL_WORK"* ]]; then
  echo "Copying from local work directory to network mount..."

  # Batch copy with progress
  if ! rsync -av --progress \
    --exclude='.DS_Store' \
    --exclude='Thumbs.db' \
    "$LOCAL_WORK/" "$OUTPUT_DIR/"; then
    echo "WARNING: Copy failed - keeping local copy at $LOCAL_WORK"
  else
    # Verify checksums
    echo "Verifying network copy..."

    # mktemp instead of a fixed /tmp name: concurrent sessions must not
    # share (or be tricked into trusting) each other's checksum lists.
    checksum_list=$(mktemp) || exit 1

    # Hash only what was copied (same exclusions as rsync) and check those
    # entries at the destination. Comparing full listings of both trees
    # would report a mismatch whenever the destination already holds other
    # files.
    copy_verified=false
    if (cd "$LOCAL_WORK" && find . -type f \
      ! -name '.DS_Store' ! -name 'Thumbs.db' \
      -exec sha256sum {} +) > "$checksum_list"; then
      if [[ ! -s "$checksum_list" ]]; then
        # Nothing was copied, so there is nothing to compare.
        copy_verified=true
      elif (cd "$OUTPUT_DIR" && sha256sum --quiet -c "$checksum_list"); then
        copy_verified=true
      fi
    fi
    rm -f -- "$checksum_list"

    if [[ "$copy_verified" == "true" ]]; then
      echo "Verification SUCCESS - removing local working copy"
      # :? aborts instead of expanding to an empty path.
      rm -rf -- "${LOCAL_WORK:?}"
    else
      echo "WARNING: Checksum mismatch - keeping local copy at $LOCAL_WORK"
    fi
  fi
fi
Real-time console output during acquisition:
ACQUISITION PROGRESS
====================
Session: 20260214-143022
Started: 2026-02-14T14:30:22Z
Status: 5/12 completed (1 failed, 3 in progress)
Active Downloads:
[67%] concert-1.mkv @ 12.5MB/s (ETA: 245s)
[23%] album.flac @ 8.2MB/s (ETA: 680s)
[91%] podcast-ep5.opus @ 2.1MB/s (ETA: 45s)
Recent Completions:
[✓] documentary.mp4 (1.2GB, 8m 34s)
[✓] live-set.opus (85MB, 1m 12s)
Recent Failures:
[✗] unavailable-video.mkv (Video unavailable - marked for alternate source)
Total Downloaded: 4.8GB / ~12.5GB estimated
Average Speed: 8.9MB/s
Estimated Completion: 14:58 (28 minutes remaining)
| Error | Detection | Response | User Action Required |
|-------|-----------|----------|---------------------|
| URL inaccessible | Pre-download validation fails | Skip URL, mark for manual review | Check URL, update source plan |
| Network timeout | Download stalls for 60s | Retry with exponential backoff (3x) | Check network connection |
| Rate limited | HTTP 429 response | Wait 60s, retry (2x) | Reduce parallel count |
| Disk full | Write fails with ENOSPC | STOP all downloads, escalate | Free disk space |
| Format unavailable | yt-dlp format error | Try fallback formats | Accept lower quality or skip |
| Corrupted download | ffmpeg verification fails | Delete and retry (2x) | Report to source |
| Permission denied | Write fails with EACCES | STOP, escalate | Fix permissions |
| Mount failure | Network mount timeout | Fall back to local-only mode | Check mount health |
# Per-download error log: .curator/sessions/<session-id>/logs/<download-id>.log
[2026-02-14 14:35:22] Starting download: https://youtube.com/watch?v=abc123
[2026-02-14 14:35:23] Format selected: bestvideo[height<=1080]+bestaudio
[2026-02-14 14:37:45] ERROR: HTTP Error 429: Too Many Requests
[2026-02-14 14:37:45] Classified as: rate_limited
[2026-02-14 14:37:45] Applying retry strategy: wait 60s (attempt 1/2)
[2026-02-14 14:38:45] Retrying download...
[2026-02-14 14:42:10] Download completed: concert-1.mkv (1.2GB)
# Detect stale sessions and offer recovery
#######################################
# Scan .curator/sessions (relative to the current directory) for state.json
# files modified within the last 7 days whose top-level status is
# "in_progress", and print a resume hint for each.
# Arguments: none
# Outputs:   two lines per incomplete session on stdout
# Returns:   0, including when no sessions directory exists
# NOTE(review): the session setup shown in this document writes status
# "initializing"; confirm that something sets "in_progress" on the session.
#######################################
detect_incomplete_sessions() {
  local sessions_root=".curator/sessions"
  local state_file status session_id completed total

  # No sessions have been recorded yet: nothing to report, and no find
  # error for the missing directory.
  [[ -d "$sessions_root" ]] || return 0

  while IFS= read -r -d '' state_file; do
    # Skip state files that cannot be parsed (e.g. caught mid-write).
    status=$(jq -r '.status' "$state_file" 2>/dev/null) || continue
    [[ "$status" == "in_progress" ]] || continue

    session_id=$(jq -r '.session_id' "$state_file")
    completed=$(jq '[.downloads[] | select(.status == "completed")] | length' "$state_file")
    total=$(jq '.downloads | length' "$state_file")

    echo "Incomplete session detected: $session_id ($completed/$total completed)"
    echo "Resume with: /acquire --resume $session_id"
  done < <(find "$sessions_root" -name "state.json" -mtime -7 -print0)

  return 0
}
#######################################
# Write <session_dir>/report.md summarizing the session recorded in
# <session_dir>/state.json, then print the report.
# Arguments: $1 - session directory
# Outputs:   "Report generated: <path>" followed by the report on stdout
# Relies on calculate_duration, which is defined outside this block.
#######################################
generate_acquisition_report() {
  local session_dir="$1"
  local state_file="$session_dir/state.json"
  local report_file="$session_dir/report.md"

  # Take the completion time once so "Completed" and "Duration" agree.
  local finished_at
  finished_at=$(date -Iseconds)

  # Size handling below: a missing filesize_bytes counts as 0 and an empty
  # list sums to 0. Without those defaults jq fails on `null / N` and
  # `null | tonumber`, leaving the lines blank. The total is rounded to two
  # decimals.
  cat > "$report_file" <<EOF
# Acquisition Session Report
**Session ID**: $(jq -r '.session_id' "$state_file")
**Started**: $(jq -r '.started_at' "$state_file")
**Completed**: $finished_at
**Duration**: $(calculate_duration "$(jq -r '.started_at' "$state_file")" "$finished_at")
## Summary
- **Total Downloads**: $(jq '.downloads | length' "$state_file")
- **Completed**: $(jq '[.downloads[] | select(.status == "completed")] | length' "$state_file")
- **Failed**: $(jq '[.downloads[] | select(.status == "failed")] | length' "$state_file")
- **Total Size**: $(jq '[.downloads[] | select(.status == "completed") | ((.filesize_bytes // 0) | tonumber)] | (add // 0) | (. / 1073741824 * 100 | round) / 100' "$state_file") GB
## Successful Downloads
$(jq -r '.downloads[] | select(.status == "completed") | "- \(.filename) (\((.filesize_bytes // 0) | tonumber / 1048576 | floor)MB)"' "$state_file")
## Failed Downloads
$(jq -r '.downloads[] | select(.status == "failed") | "- \(.url)\n Error: \(.error // "unknown")"' "$state_file")
## Parameters
- **Source Plan**: $(jq -r '.parameters.plan // "N/A"' "$state_file")
- **Format Preference**: $(jq -r '.parameters.format' "$state_file")
- **Output Directory**: $(jq -r '.parameters.output' "$state_file")
- **Parallel Downloads**: $(jq -r '.parameters.parallel' "$state_file")
## File Locations
- **Session Directory**: $session_dir
- **Download Logs**: $session_dir/logs/
- **Metadata**: $session_dir/metadata/
- **Verification Results**: $session_dir/verification.json
EOF

  echo "Report generated: $report_file"
  cat "$report_file"
}
# Export metadata for external tools
#######################################
# Write <session_dir>/metadata-export.json: the session id, its start time,
# and a trimmed record for every download whose status is "completed".
# Arguments: $1 - session directory containing state.json
# Outputs:   "Metadata exported: <path>" on stdout
#######################################
export_metadata() {
  local session_dir="$1"
  local export_file="$session_dir/metadata-export.json"

  # {name} is jq shorthand for {name: .name}; field order is kept as listed.
  local projection='{
    session_id,
    started_at,
    downloads: [
      .downloads[]
      | select(.status == "completed")
      | {url, filename, format, filesize_bytes, duration_seconds, checksum_sha256}
    ]
  }'

  jq "$projection" "$session_dir/state.json" > "$export_file"
  echo "Metadata exported: $export_file"
}
# 1. Discover sources
/find-sources --artist "Pink Floyd" --era "1970s" --sources youtube,archive
# 2. Acquire media from discovered sources
/acquire --plan .curator/sources/plan-001.yaml \
--format video \
--extract-audio \
--verify-after \
--output /mnt/archive/pink-floyd
# 3. Extract metadata (runs automatically or manually)
/extract-metadata --source /mnt/archive/pink-floyd/The_Wall/audio
# 4. Organize final collection
/organize --source /mnt/archive/pink-floyd
--local-work when output is network mount
--parallel for network mounts (recommend 2)
--limit-rate to yt-dlp for shared networks
data-ai
Report which research-corpus radar sidecars are overdue for refresh. Computes staleness (days since last refresh vs the cadence window) for every radar, sorted most-overdue-first. Runs via `aiwg corpus radar-status`.
data-ai
Aggregate research-corpus radar sidecars into a corpus or per-cluster freshness report — totals, overdue count, per-cluster / per-GRADE / per-trajectory breakdowns, an overdue table, and per-radar rationale snippets. Runs via `aiwg corpus radar-report`.
testing
Scaffold radar/freshness sidecars for research-corpus REFs. Pulls title/authors from the citation sidecar and GRADE from the analysis doc, defaults the refresh cadence from GRADE and the cluster from a corpus-local map, and stamps documentation/radar/REF-XXX-radar.md. Runs via `aiwg corpus radar-init`.
data-ai
Compute an entity's publication trajectory — per-year paper counts, topic drift, hot-streak detection (≥3 consecutive A-grade years), and career phase. Runs via `aiwg corpus profile-temporal`.