library/specializations/data-engineering-analytics/skills/data-quality-profiler/SKILL.md
Profiles data assets to assess quality dimensions, detect anomalies, and generate comprehensive data quality reports with actionable recommendations.
npx skillsauth add a5c-ai/babysitter data-quality-profiler
Install this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
Profiles data assets to assess quality dimensions and detect anomalies across the six core data quality dimensions.
This skill performs comprehensive data profiling to assess completeness, accuracy, consistency, validity, timeliness, and uniqueness. It generates statistical profiles, detects anomalies, identifies PII, and provides actionable recommendations for data quality improvement.
{
"dataSource": {
"type": "object",
"required": true,
"properties": {
"type": {
"type": "string",
"enum": ["table", "file", "query"],
"description": "Type of data source"
},
"connection": {
"type": "object",
"description": "Connection details (platform, database, schema)"
},
"identifier": {
"type": "string",
"description": "Table name, file path, or query string"
}
}
},
"sampleSize": {
"type": "number",
"description": "Number of rows to sample (null for full scan)",
"default": 10000
},
"dimensions": {
"type": "array",
"items": {
"type": "string",
"enum": ["accuracy", "completeness", "consistency", "validity", "timeliness", "uniqueness"]
},
"default": ["completeness", "validity", "uniqueness"],
"description": "Quality dimensions to assess"
},
"previousProfile": {
"type": "object",
"description": "Previous profile for drift detection"
},
"businessRules": {
"type": "array",
"items": {
"column": "string",
"rule": "string",
"threshold": "number"
},
"description": "Custom business rules to validate"
},
"piiDetection": {
"type": "boolean",
"default": true,
"description": "Enable PII detection and classification"
}
}
{
"profile": {
"type": "object",
"properties": {
"tableName": "string",
"rowCount": "number",
"columnCount": "number",
"profileTimestamp": "string",
"columns": {
"type": "array",
"items": {
"name": "string",
"declaredType": "string",
"inferredType": "string",
"statistics": {
"nullCount": "number",
"nullPercent": "number",
"distinctCount": "number",
"distinctPercent": "number",
"min": "any",
"max": "any",
"mean": "number",
"median": "number",
"stddev": "number",
"histogram": "array"
},
"patterns": {
"mostCommon": "array",
"detectedFormat": "string",
"regexPattern": "string"
},
"qualityScores": {
"completeness": "number",
"validity": "number",
"uniqueness": "number"
}
}
}
}
},
"anomalies": {
"type": "array",
"items": {
"column": "string",
"type": "outlier|drift|unexpected_null|unexpected_value|format_violation",
"severity": "high|medium|low",
"description": "string",
"examples": "array",
"recommendation": "string"
}
},
"piiFindings": {
"type": "array",
"items": {
"column": "string",
"piiType": "email|phone|ssn|credit_card|name|address|ip|custom",
"confidence": "number",
"sampleCount": "number",
"recommendation": "string"
}
},
"overallScore": {
"type": "number",
"description": "Weighted quality score (0-100)"
},
"dimensionScores": {
"completeness": "number",
"accuracy": "number",
"consistency": "number",
"validity": "number",
"timeliness": "number",
"uniqueness": "number"
},
"recommendations": {
"type": "array",
"items": {
"priority": "high|medium|low",
"category": "string",
"description": "string",
"impact": "string"
}
},
"drift": {
"type": "object",
"description": "Changes compared to previous profile",
"properties": {
"schemaChanges": "array",
"statisticalDrift": "array",
"volumeChange": "object"
}
}
}
{
"dataSource": {
"type": "table",
"connection": {
"platform": "snowflake",
"database": "analytics",
"schema": "core"
},
"identifier": "dim_customers"
},
"dimensions": ["completeness", "validity", "uniqueness"]
}
{
"dataSource": {
"type": "file",
"identifier": "./data/customer_export.csv"
},
"sampleSize": 50000,
"piiDetection": true,
"dimensions": ["completeness", "validity", "accuracy"]
}
{
"dataSource": {
"type": "query",
"connection": {
"platform": "bigquery",
"project": "my-project"
},
"identifier": "SELECT * FROM orders WHERE order_date >= '2024-01-01'"
},
"businessRules": [
{"column": "order_total", "rule": "positive", "threshold": 0},
{"column": "status", "rule": "in_set", "values": ["pending", "completed", "cancelled"]},
{"column": "customer_id", "rule": "not_null", "threshold": 100}
]
}
{
"dataSource": {
"type": "table",
"identifier": "fact_sales"
},
"previousProfile": {
"profileTimestamp": "2024-01-01T00:00:00Z",
"rowCount": 1000000,
"columns": [...]
},
"dimensions": ["consistency", "timeliness"]
}
Measures the presence of required data:
| Metric | Calculation |
|--------|-------------|
| Column completeness | (total - nulls) / total * 100 |
| Row completeness | rows with all required fields / total rows * 100 |
| Overall | Weighted average across columns |
Measures conformance to business rules:
| Check Type | Example |
|------------|---------|
| Type conformance | String in INT column |
| Format conformance | Invalid email format |
| Range conformance | Age > 150 |
| Referential | FK without matching PK |
Measures duplicate and cardinality:
| Metric | Calculation |
|--------|-------------|
| Distinct ratio | distinct / total * 100 |
| Duplicate count | total - distinct |
| PK uniqueness | unique PKs / total * 100 |
Measures correctness against ground truth:
Measures uniformity across the dataset:
Measures data freshness:
| Metric | Threshold |
|--------|-----------|
| Data age | Hours since last update |
| Freshness SLA | % meeting freshness requirement |
| Lag detection | Processing delay measurement |
| Type | Pattern Examples |
|------|------------------|
| Email | user@example.com |
| Phone | (XXX) XXX-XXXX, +1-XXX-XXX-XXXX |
| SSN | XXX-XX-XXXX |
| Credit Card | XXXX-XXXX-XXXX-XXXX (with Luhn check) |
| Name | First/Last name patterns |
| Address | Street, city, state, zip patterns |
| IP Address | IPv4 and IPv6 |
- `data-quality-framework.js`
- `data-catalog.js`
- `etl-elt-pipeline.js`
- `ab-testing-pipeline.js`

development
Model documentation skill for generating model cards following Google's model card framework.
development
MLflow integration skill for experiment tracking, model registry, and artifact management. Enables LLMs to log experiments, compare runs, manage model lifecycle, and retrieve artifacts through the MLflow API.
data-ai
LIME-based local explanation skill for individual predictions across tabular, text, and image data.
devops
Kubeflow Pipelines skill for ML workflow orchestration, component management, and Kubernetes-native ML.