skills/scientific-writing/openalex-database/SKILL.md
Query OpenAlex REST API for 250M+ scholarly works, authors, institutions, journals, concepts. Search by keyword, author, DOI, ORCID, or ID; filter by year, OA, citations, field; retrieve citations, references, author disambiguation. Free, no auth. For PubMed use pubmed-database; preprints use biorxiv-database.
npx skillsauth add jaechang-hits/sciagent-skills openalex-database
Install this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
OpenAlex is a free, open-access index of 250M+ scholarly works, 90M+ authors, 110,000+ journals, and 10,000+ institutions. It succeeds Microsoft Academic Graph and provides rich metadata: abstracts, open-access URLs, citation counts, referenced works, author disambiguated IDs (ORCID), and concept tags. The REST API requires no authentication for up to 100,000 requests/day; a polite pool (email parameter) gives priority processing.
For PubMed use pubmed-database; for bioRxiv preprints use biorxiv-database.
Python packages: requests, pandas.
Add a mailto=you@example.com query param (use your own address) to join the polite pool (higher priority, same limit).
Install: pip install requests pandas
import requests
BASE = "https://api.openalex.org"

# Quick start: keyword search for CRISPR works published in 2023.
# The `mailto` parameter opts the request in to the polite pool.
r = requests.get(
    f"{BASE}/works",
    params={
        "search": "CRISPR gene editing",
        "filter": "publication_year:2023",
        "per_page": 5,
        "mailto": "[email protected]",
    },
)
r.raise_for_status()
data = r.json()
print(f"Total results: {data['meta']['count']}")
for work in data["results"][:3]:
    # OpenAlex can return a null title; slicing None would raise TypeError,
    # so substitute a placeholder before truncating.
    title = (work.get("title") or "(untitled)")[:80]
    print(f" {title} ({work['publication_year']}) cites={work['cited_by_count']}")
Search works by title/abstract keywords with filters.
import requests, pandas as pd
BASE = "https://api.openalex.org"
def search_works(query, filters=None, per_page=25, mailto="[email protected]"):
    """Run a keyword search against the ``/works`` endpoint.

    Args:
        query: Text sent as the ``search`` parameter.
        filters: Optional mapping of filter field to value. Each entry is
            serialized as ``field:value`` and the entries are comma-joined
            into the ``filter`` parameter.
        per_page: Page size sent as ``per_page``.
        mailto: Email address sent as ``mailto``.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the response status is an error.
    """
    query_params = {"search": query, "per_page": per_page, "mailto": mailto}
    if filters:
        filter_terms = [f"{field}:{value}" for field, value in filters.items()]
        query_params["filter"] = ",".join(filter_terms)
    response = requests.get(f"{BASE}/works", params=query_params)
    response.raise_for_status()
    return response.json()
# Search with filters: open-access works from 2020-2024
data = search_works(
    "single-cell RNA sequencing",
    filters={"publication_year": "2020-2024", "open_access.is_oa": "true"},
    per_page=10,
)
print(f"Open-access scRNA-seq papers 2020-2024: {data['meta']['count']}")

# Flatten every returned work into one table row.
rows = [
    {
        "title": w["title"],
        "year": w["publication_year"],
        "citations": w["cited_by_count"],
        "doi": w.get("doi"),
        "oa_url": w.get("open_access", {}).get("oa_url"),
    }
    for w in data["results"]
]
df = pd.DataFrame(rows)
print(df[["title", "year", "citations"]].head())
# Paginate through all results
def paginate_works(query, filters=None, max_results=200, mailto="[email protected]"):
    """Retrieve up to ``max_results`` works, following cursors automatically.

    Args:
        query: Text sent as the ``search`` parameter.
        filters: Optional mapping of filter field to value, serialized as
            comma-joined ``field:value`` pairs.
        max_results: Upper bound on the number of works returned.
        mailto: Email address sent as ``mailto``.

    Returns:
        A list of at most ``max_results`` work dicts.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    # The filter string does not change between pages, so build it once.
    filter_str = (
        ",".join(f"{k}:{v}" for k, v in filters.items()) if filters else None
    )
    all_results = []
    cursor = "*"  # "*" asks the API for the first page of a cursor walk
    while len(all_results) < max_results:
        params = {"search": query, "per_page": 200,
                  "cursor": cursor, "mailto": mailto}
        if filter_str:
            params["filter"] = filter_str
        r = requests.get(f"{BASE}/works", params=params)
        # Fail loudly on 4xx/5xx instead of hitting a KeyError on the error body.
        r.raise_for_status()
        data = r.json()
        page = data["results"]
        if not page:
            # An empty page means there is nothing further to collect.
            break
        all_results.extend(page)
        cursor = data["meta"].get("next_cursor")
        if not cursor:
            break
    return all_results[:max_results]


papers = paginate_works("transformer protein structure", max_results=100)
print(f"Retrieved {len(papers)} papers")
Retrieve a single work by DOI or OpenAlex ID.
import requests
BASE = "https://api.openalex.org"

# By DOI
doi = "10.1038/s41592-019-0458-z"  # example DOI (Nature Methods prefix, 2019)
r = requests.get(f"{BASE}/works/https://doi.org/{doi}",
                 params={"mailto": "[email protected]"})
r.raise_for_status()
work = r.json()
print(f"Title : {work['title']}")
print(f"Year : {work['publication_year']}")
print(f"Citations: {work['cited_by_count']}")
# `primary_location` and its `source` can be present but null; dict.get's
# default only covers missing keys, so coalesce explicitly with `or {}`.
source = (work.get("primary_location") or {}).get("source") or {}
print(f"Journal : {source.get('display_name')}")
abstract = work.get("abstract_inverted_index")
if abstract:
    # Reconstruct abstract from inverted index: map each position to its word,
    # then emit the words in position order.
    words = {pos: word for word, positions in abstract.items() for pos in positions}
    text = " ".join(words[i] for i in sorted(words))
    print(f"Abstract (first 200): {text[:200]}")
Find author records, resolve ORCID identifiers, retrieve publication lists.
import requests, pandas as pd
BASE = "https://api.openalex.org"

# Search for an author
r = requests.get(f"{BASE}/authors",
                 params={"search": "Jennifer Doudna",
                         "per_page": 5,
                         "mailto": "[email protected]"})
r.raise_for_status()
authors = r.json()["results"]
for a in authors[:3]:
    print(f"Author: {a['display_name']}")
    print(f" OpenAlex ID : {a['id']}")
    # `or` (not a .get default) so an explicit null ORCID also prints 'n/a'.
    print(f" ORCID : {a.get('orcid') or 'n/a'}")
    # 2024+: singular `last_known_institution` was replaced by plural list `last_known_institutions[0]`
    insts = a.get("last_known_institutions") or []
    print(f" Institution : {insts[0]['display_name'] if insts else 'n/a'}")
    print(f" Works count : {a['works_count']}")
    stats = a.get("summary_stats") or {}
    print(f" h-index : {stats.get('h_index', 'n/a')}")
    print()

# Get all papers by an author (by ORCID)
orcid = "0000-0001-9161-999X"  # Jennifer A. Doudna (correct ORCID; the 8742-3594 variant 404s)
r = requests.get(f"{BASE}/works",
                 params={"filter": f"author.orcid:{orcid}",
                         "sort": "cited_by_count:desc",
                         "per_page": 10,
                         "mailto": "[email protected]"})
r.raise_for_status()
papers = r.json()["results"]
for p in papers[:5]:
    # Titles can be null; use a placeholder rather than slicing None.
    title = (p.get("title") or "(untitled)")[:70]
    print(f" [{p['publication_year']}] {title} (cites: {p['cited_by_count']})")
Get referenced works and citing works for a paper.
import requests, pandas as pd
BASE = "https://api.openalex.org"
work_id = "W2018426904"  # CRISPR paper

# Get what this paper references
r = requests.get(f"{BASE}/works/{work_id}",
                 params={"select": "referenced_works,cited_by_count,title",
                         "mailto": "[email protected]"})
r.raise_for_status()
work = r.json()
ref_ids = work.get("referenced_works") or []
print(f"'{work['title']}' cites {len(ref_ids)} papers")
print(f"Total citations: {work['cited_by_count']}")

# Fetch metadata for references (batch)
if ref_ids:
    # Each reference is split on "/" and only the trailing ID segment is kept;
    # "|" joins the first ten into a single OR filter value.
    ids_str = "|".join(ref_url.split("/")[-1] for ref_url in ref_ids[:10])
    r2 = requests.get(f"{BASE}/works",
                      params={"filter": f"openalex_id:{ids_str}",
                              "per_page": 10,
                              "mailto": "[email protected]"})
    r2.raise_for_status()
    refs = r2.json()["results"]
    for ref in refs[:5]:
        # Titles can be null; use a placeholder rather than slicing None.
        title = (ref.get("title") or "(untitled)")[:70]
        print(f" [{ref['publication_year']}] {title}")
Filter by research concepts and analyze publication trends.
import requests, pandas as pd
BASE = "https://api.openalex.org"

# Get concept ID for "Machine Learning". OpenAlex concept search is brittle for
# multi-word phrases ("machine learning biology" returns 0); use the single core term.
concept_query = {"search": "machine learning",
                 "per_page": 3,
                 "mailto": "[email protected]"}
r = requests.get(f"{BASE}/concepts", params=concept_query)
concepts = r.json()["results"]
for c in concepts[:3]:
    print(f"Concept: {c['display_name']} (ID: {c['id']}, level: {c['level']})")

# Count papers per year for a concept
# NOTE(review): confirm this hard-coded ID matches the "Machine learning"
# concept printed by the lookup above before relying on the counts.
concept_id = "C154945302"  # Machine learning (OpenAlex ID)
trend_query = {"filter": f"concepts.id:{concept_id},publication_year:2015-2024",
               "group_by": "publication_year",
               "per_page": 200,
               "mailto": "[email protected]"}
r2 = requests.get(f"{BASE}/works", params=trend_query)
groups = r2.json()["group_by"]
df = (
    pd.DataFrame(groups)
    .rename(columns={"key": "year", "count": "papers"})
    .sort_values("year")
)
print(df.tail(5).to_string(index=False))
Retrieve papers from a specific institution, journal, or conference.
import requests, pandas as pd
BASE = "https://api.openalex.org"

# Papers from a specific journal (selected by ISSN) published in 2023,
# most-cited first
r = requests.get(f"{BASE}/works",
                 params={
                     "filter": "primary_location.source.issn:0028-0836,publication_year:2023",
                     "per_page": 10,
                     "sort": "cited_by_count:desc",
                     "mailto": "[email protected]",
                 })
r.raise_for_status()
data = r.json()
print(f"Nature papers 2023: {data['meta']['count']}")
for w in data["results"][:5]:
    # Titles can be null; use a placeholder rather than slicing None.
    title = (w.get("title") or "(untitled)")[:70]
    print(f" [{w['cited_by_count']} cites] {title}")
OpenAlex stores abstracts as inverted indexes (word → list of positions) rather than plain text due to copyright restrictions. Reconstruct in two steps: words = {pos: w for w, ps in inv.items() for pos in ps}, then text = " ".join(words[i] for i in sorted(words)).
OpenAlex supports cursor-based pagination (cursor parameter) for deep result sets. Start with cursor="*" and pass the next_cursor from each response into the next request. Maximum 200 results per page. Basic page-number paging is limited to the first 10,000 results, so use cursor pagination to go beyond that.
Goal: Download all papers matching a topic query with metadata for systematic review.
import requests, time, pandas as pd
BASE = "https://api.openalex.org"
MAILTO = "[email protected]"
def systematic_search(query, year_from, year_to, max_results=500):
    """Collect works matching ``query`` within a year range as a DataFrame.

    Pages are fetched with cursor pagination (200 per page) until
    ``max_results`` works are collected or no further cursor is returned,
    pausing 0.1 s between pages.

    Returns:
        A DataFrame with columns openalex_id, doi, title, year, citations,
        is_oa and oa_url, holding at most ``max_results`` rows.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    year_filter = f"publication_year:{year_from}-{year_to}"
    collected = []
    next_cursor = "*"
    while len(collected) < max_results:
        response = requests.get(
            f"{BASE}/works",
            params={
                "search": query,
                "filter": year_filter,
                "per_page": 200,
                "cursor": next_cursor,
                "mailto": MAILTO,
                "select": "id,doi,title,publication_year,cited_by_count,open_access",
            },
        )
        response.raise_for_status()
        payload = response.json()
        collected += payload["results"]
        next_cursor = payload["meta"].get("next_cursor")
        if not next_cursor:
            break
        time.sleep(0.1)

    def to_row(w):
        # One flat record per work.
        return {
            "openalex_id": w["id"],
            "doi": w.get("doi"),
            "title": w.get("title"),
            "year": w.get("publication_year"),
            "citations": w.get("cited_by_count"),
            "is_oa": w.get("open_access", {}).get("is_oa"),
            "oa_url": w.get("open_access", {}).get("oa_url"),
        }

    return pd.DataFrame([to_row(w) for w in collected[:max_results]])
# Example: papers on drug repurposing 2019-2024
df = systematic_search(
    "drug repurposing machine learning",
    2019,
    2024,
    max_results=200,
)
df.to_csv("drug_repurposing_literature.csv", index=False)
print(f"Retrieved {len(df)} papers")
# Show the first five rows of the key columns.
preview = df[["title", "year", "citations", "is_oa"]].head(5)
print(preview.to_string(index=False))
Goal: Map co-authors for a researcher to analyze their collaboration network.
import requests, time, pandas as pd
from collections import defaultdict
BASE = "https://api.openalex.org"
MAILTO = "[email protected]"
def get_author_works(orcid, max_papers=50):
    """Fetch one page of an author's works, most-cited first.

    The author is selected with an ``author.orcid`` filter. The page size is
    ``max_papers`` capped at 200, so a single request is made.

    Returns:
        The list of work dicts from the response's ``results`` field.

    Raises:
        requests.HTTPError: If the response status is an error.
    """
    page_size = min(max_papers, 200)
    query_params = {
        "filter": f"author.orcid:{orcid}",
        "sort": "cited_by_count:desc",
        "per_page": page_size,
        "mailto": MAILTO,
    }
    response = requests.get(f"{BASE}/works", params=query_params)
    response.raise_for_status()
    payload = response.json()
    return payload["results"]
def extract_collaborators(works):
    """Count how many of the given works each author name appears on.

    Args:
        works: Iterable of work dicts. Each may carry an ``authorships`` list
            whose entries hold an ``author`` dict with a ``display_name``.

    Returns:
        A ``defaultdict(int)`` mapping author display name to the number of
        authorship entries carrying that name. Authors are keyed by name
        only, so distinct people sharing a display name are merged.
    """
    collab_count = defaultdict(int)
    for work in works:
        # `or []` / `or {}` also cover keys that are present but null, which
        # a .get() default alone would pass through as None.
        for authorship in work.get("authorships") or []:
            author = authorship.get("author") or {}
            name = author.get("display_name")
            if name:
                collab_count[name] += 1
    return collab_count
# Map collaborators for a researcher
orcid = "0000-0001-9161-999X"  # Jennifer A. Doudna
works = get_author_works(orcid, max_papers=50)
collabs = extract_collaborators(works)

# Rank by shared-paper count, highest first (stable sort keeps tie order).
top_collabs = sorted(collabs.items(), key=lambda pair: pair[1], reverse=True)
df = pd.DataFrame(top_collabs, columns=["collaborator", "papers_together"])
not_self = df["collaborator"] != "Jennifer A. Doudna"  # exclude self
df = df[not_self]

print("Top collaborators:")
print(df.head(10).to_string(index=False))
df.to_csv("collaboration_network.csv", index=False)
| Parameter | Module | Default | Range / Options | Effect |
|-----------|--------|---------|-----------------|--------|
| search | All | — | text string | Full-text search across title+abstract |
| filter | All | — | field:value,field:value | Structured filters (AND logic) |
| per_page | All | 25 | 1–200 | Results per page |
| cursor | Pagination | "*" | cursor string | Cursor for pagination |
| sort | Works | relevance | cited_by_count:desc, publication_year:desc | Result ordering |
| select | All | all fields | comma-separated field names | Limit response fields (faster) |
| group_by | Works | — | field name | Aggregate counts by field |
| mailto | All | — | email address | Polite pool access (prioritized) |
Always include mailto: Add mailto=you@example.com (your own address) to all requests to join the polite pool and receive priority processing. The daily request limit is unchanged.
Use select for large paginations: When paginating through thousands of results, specify only needed fields (select=id,doi,title,cited_by_count) to reduce response size and speed up parsing.
Use cursor pagination, not offset: OpenAlex does not support offset pagination beyond 10,000 results. Use cursor-based pagination (cursor parameter) for deep traversals.
Reconstruct abstracts from inverted index: Not all works have abstracts; check that abstract_inverted_index is not None before reconstructing, otherwise calling .items() on None raises AttributeError.
Cache by work ID: OpenAlex Work IDs (W…) are stable identifiers. Cache retrieved work metadata to avoid re-fetching within a project.
When to use: Enrich a list of DOIs with citation counts, open-access URLs, and abstracts.
import requests, pandas as pd, time
BASE = "https://api.openalex.org"
dois = [
    "10.1038/s41592-019-0458-z",
    "10.1186/s13059-021-02519-4",
    "10.1038/s41587-019-0071-9",
]

# Look up each DOI in turn; DOIs whose request does not succeed are skipped.
rows = []
for doi in dois:
    lookup_params = {"select": "title,publication_year,cited_by_count,open_access",
                     "mailto": "[email protected]"}
    r = requests.get(f"{BASE}/works/https://doi.org/{doi}", params=lookup_params)
    if r.ok:
        w = r.json()
        rows.append(dict(
            doi=doi,
            title=w.get("title"),
            year=w.get("publication_year"),
            citations=w.get("cited_by_count"),
            is_oa=w.get("open_access", {}).get("is_oa"),
        ))
    time.sleep(0.1)  # brief pause between requests

df = pd.DataFrame(rows)
print(df.to_string(index=False))
When to use: Geographic analysis of research output on a topic.
import requests, pandas as pd
# Group matching 2023 works by the country code of the authors' institutions.
group_query = {"search": "CRISPR therapeutics",
               "filter": "publication_year:2023",
               "group_by": "authorships.institutions.country_code",
               "per_page": 200,
               "mailto": "[email protected]"}
r = requests.get("https://api.openalex.org/works", params=group_query)
groups = r.json()["group_by"]
df = pd.DataFrame(groups).rename(columns={"key": "country", "count": "papers"})
# Ten countries with the most papers.
ranked = df.sort_values("papers", ascending=False)
print(ranked.head(10).to_string(index=False))
When to use: Identify landmark papers on a topic for background reading.
import requests, pandas as pd
# Ten most-cited works matching the search phrase.
r = requests.get(
    "https://api.openalex.org/works",
    params={"search": "protein language model",
            "sort": "cited_by_count:desc",
            "per_page": 10,
            "mailto": "[email protected]"},
)
# Surface HTTP errors here instead of a KeyError on the error body.
r.raise_for_status()
for w in r.json()["results"]:
    # Titles can be null; use a placeholder rather than slicing None.
    title = (w.get("title") or "(untitled)")[:70]
    print(f"[{w['cited_by_count']:5d} cites] ({w['publication_year']}) {title}")
| Problem | Cause | Solution |
|---------|-------|----------|
| HTTP 429 Too Many Requests | Rate limit exceeded | Add time.sleep(0.15) between requests; use polite pool (mailto) |
| Empty abstract_inverted_index | No abstract available | Check for None before reconstructing; not all works have abstracts |
| Cursor pagination returns duplicates | Cursor expired | Re-start pagination with cursor="*" |
| DOI lookup returns 404 | DOI not indexed in OpenAlex | Try title search instead; OpenAlex indexes 250M+ but not 100% of literature |
| Filter returns 0 results | Field name wrong or filter syntax error | Check filter syntax: field:value with no spaces; verify field names in API docs |
| cited_by_count is stale | Citation counts update periodically | Counts are refreshed regularly but may lag by days; use for trends not exact figures |
pubmed-database — Biomedical literature with MeSH controlled vocabulary; better for clinical and life sciences
biorxiv-database — Biomedical preprints not yet indexed in OpenAlex
scientific-brainstorming — Hypothesis generation workflows using literature as input
literature-review — Guide for designing systematic literature reviews using OpenAlex
tools
Fast short-read DNA aligner for WGS/WES/ChIP-seq. 2× faster BWA-MEM successor; outputs SAM/BAM with read group headers for GATK. Primary plus supplementary records for chimeric reads. Use STAR for RNA-seq splice-aware alignment; Bowtie2 is a comparable alternative.
tools
smina molecular docking CLI. AutoDock Vina fork with customizable scoring functions, native SDF/MOL2/PDB ligand input, autoboxing, local energy minimization, and per-atom score breakdowns. Pipeline: receptor PDBQT prep -> ligand prep (RDKit/OpenBabel) -> dock via autobox or explicit grid -> rescore/minimize with custom scoring -> rank poses by affinity. Choose smina over Vina when you need custom scoring terms (--custom_scoring), local optimization of an existing pose (--local_only), per-atom contributions (--atom_term_data), or SDF/MOL2 ligands without manual PDBQT conversion. For unknown binding sites use diffdock-blind-docking; for the Python-bindings/Vinardo workflow use autodock-vina-docking.
development
mdtraj molecular dynamics trajectory analysis (Python). Reads DCD/XTC/TRR/NetCDF/H5/PDB topologies and trajectories; computes RMSD vs time, radius of gyration, per-residue RMSF, residue-residue contact frequency maps, phi/psi torsions for Ramachandran plots (general + Gly/Pro), and 8-state DSSP secondary structure. Modules: trajectory I/O, geometry (distances/angles/dihedrals), structural analysis (RMSD/Rg/RMSF/SASA), contacts, hydrogen bonds, secondary structure (DSSP), NMR observables. For broader atom-selection grammar use mdanalysis-trajectory; for running MD simulations use OpenMM/GROMACS.
development
Programmatic PubMed access via NCBI E-utilities REST API. Covers Boolean/MeSH queries, field-tagged search, endpoints (ESearch, EFetch, ESummary, EPost, ELink), history server for batches, citation matching, systematic review strategies. Use for biomedical literature search or automated pipelines.