coding/asm-patterns/SKILL.md
Assembly language patterns, calling conventions, and code structure for x86-64 and ARM64. Use when writing, reviewing, or generating .asm/.s/.S files; when implementing functions that interoperate with C/system code; or when establishing correct prologues, epilogues, stack management, SIMD loops, syscall stubs, or PIC data access.
npx skillsauth add aeondave/malskill asm-patternsInstall this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
Canonical patterns and guardrails for x86-64 and ARM64 assembly.
Apply to all .asm, .s, .S files and any assembly embedded in C/Rust/Go.
Declare the target at the top of every file:
; x86-64, NASM, Linux/macOS
; Target: x86-64 System V ABI
; Syntax: Intel (NASM)
%ifdef / .ifdef guards| Role | Registers |
|---|---|
| Integer args (1–6) | rdi, rsi, rdx, rcx, r8, r9 |
| Float args (1–8) | xmm0–xmm7 |
| Return (int) | rax |
| Return (float) | xmm0 |
| Caller-saved | rax, rcx, rdx, rsi, rdi, r8–r11 |
| Callee-saved | rbx, rbp, r12–r15 |
| Stack alignment | 16-byte aligned before call |
| Role | Registers |
|---|---|
| Integer args (1–4) | rcx, rdx, r8, r9 |
| Float args (1–4) | xmm0–xmm3 |
| Return (int) | rax |
| Return (float) | xmm0 |
| Caller-saved (volatile) | rax, rcx, rdx, r8–r11, xmm0–xmm5 |
| Callee-saved (nonvolatile) | rbx, rbp, rdi, rsi, r12–r15, xmm6–xmm15 |
| Stack alignment | 16-byte aligned before call |
| Shadow space | 32 bytes reserved by caller above return address — always, even for 0-arg functions |
Key differences from System V:
rdi, rsi are callee-saved (volatile on System V)xmm6–xmm15 are callee-saved (all volatile on System V)[rsp+8]–[rsp+0x28] after call; callee may use to spill register argsRUNTIME_FUNCTION + unwind codes (not DWARF)| Role | Registers |
|---|---|
| Integer args (1–8) | x0–x7 |
| Float args (1–8) | d0–d7 |
| Return (int) | x0 |
| Return (float) | d0 |
| Callee-saved | x19–x28, x29 (FP) |
| Stack alignment | 16-byte aligned at all times |
Key rule: in non-leaf functions, save x29 + x30 together with stp at entry; restore with ldp before ret.
See references/x86-64.md for full register table, sub-register views, and ABI edge cases.
; int64_t compute(int64_t x, int64_t y)
; Args: rdi=x, rsi=y Returns: rax
global compute
compute:
push rbp
mov rbp, rsp
push rbx ; callee-saved used below
sub rsp, 8 ; keep stack 16-byte aligned
mov rbx, rdi ; save x across call
call some_helper ; rdi..r11 may be clobbered
imul rax, rbx ; result * saved x
imul rax, rsi ; * y
add rsp, 8
pop rbx
pop rbp
ret
; int64_t add3(int64_t a, int64_t b, int64_t c)
; Args: rdi=a, rsi=b, rdx=c Returns: rax
global add3
add3:
lea rax, [rdi + rsi] ; rax = a + b
add rax, rdx ; rax += c
ret
; int64_t compute(int64_t x, int64_t y)
; Args: rcx=x, rdx=y Returns: rax
global compute
compute:
push rbx
sub rsp, 32 ; shadow space (always 32 bytes)
mov rbx, rcx ; save x across call (rbx callee-saved)
call some_helper ; rcx, rdx, r8-r11 clobbered
imul rax, rbx
add rsp, 32
pop rbx
ret
Win64 >4 args: place arg5+ at [rsp+32] above shadow space; allocate 32 + 8*extra_args (keep 16-byte aligned).
// int64_t compute(int64_t x, int64_t y)
// x0=x, x1=y → x0=result
.global compute
compute:
stp x29, x30, [sp, #-32]! // save fp+lr, alloc frame
mov x29, sp
stp x19, x20, [sp, #16] // save callee-saved
mov x19, x0 // save x across call
bl some_helper
mul x0, x0, x19 // result * saved x
mul x0, x0, x1 // * y
ldp x19, x20, [sp, #16]
ldp x29, x30, [sp], #32
ret
; void vadd_f32(float *dst, const float *a, const float *b, size_t n)
; n must be a multiple of 4; dst/a/b may be unaligned
global vadd_f32
vadd_f32:
test rcx, rcx
jz .done
shr rcx, 2 ; n /= 4
.loop:
movups xmm0, [rsi]
addps xmm0, [rdx]
movups [rdi], xmm0
add rsi, 16
add rdx, 16
add rdi, 16
dec rcx
jnz .loop
.done:
ret
// void vadd_f32(float *dst, const float *a, const float *b, size_t n)
.global vadd_f32
vadd_f32:
cbz x3, .done
lsr x3, x3, #2 // n /= 4
.loop:
ld1 {v0.4s}, [x1], #16
ld1 {v1.4s}, [x2], #16
fadd v0.4s, v0.4s, v1.4s
st1 {v0.4s}, [x0], #16
subs x3, x3, #1
b.ne .loop
.done:
ret
; syscall(number, arg1, arg2, arg3)
; Note: r10 replaces rcx (syscall clobbers rcx and r11)
SYS_WRITE equ 1
SYS_EXIT equ 60
section .data
msg db "hello", 10
mlen equ $ - msg
section .text
global _start
_start:
mov rax, SYS_WRITE
mov rdi, 1
lea rsi, [rel msg] ; RIP-relative — required for PIC/PIE
mov rdx, mlen
syscall
mov rax, SYS_EXIT
xor edi, edi
syscall
// ARM64 Linux: x8 = syscall number, x0-x5 = args, svc #0
.equ SYS_WRITE, 64
.equ SYS_EXIT, 93
.global _start
_start:
mov x8, #SYS_WRITE
mov x0, #1 // fd = stdout
adr x1, msg // PC-relative address
mov x2, #6 // length
svc #0
mov x8, #SYS_EXIT
mov x0, #0
svc #0
.section .rodata
msg: .ascii "hello\n"
// macOS: x16 = syscall number, svc #0x80; carry flag set on error
mov x16, #4 // SYS_write
mov x0, #1 // stdout
adr x1, msg
mov x2, #6
svc #0x80
b.cs .error // carry set = syscall failed
default rel ; make ALL memory refs RIP-relative (NASM)
section .data
counter dq 0
section .text
global get_counter
get_counter:
mov rax, [counter] ; compiles to: mov rax, [rip + offset]
ret
; Load address of data (not value) — needed for passing pointers
lea rdi, [rel my_string] ; NASM: RIP-relative address
lea rdi, my_string(%rip) ; GAS: same thing, AT&T syntax
call puts
call .here
.here:
pop ebx ; EBX = runtime address of .here
; access data: [ebx + (label - .here)]
xor rax, rax
push rax ; null terminator
mov rax, 0x6F6C6C6548 ; "Hello" in little-endian
push rax
mov rdi, rsp ; rdi -> "Hello\0"
x86-64 TSO guarantees acquire-release on regular MOV; explicit fences only for sequential consistency or NT stores.
; CAS: if [rdi] == rax then [rdi] = rcx; else rax = [rdi]
mov rax, expected
mov rcx, desired
lock cmpxchg [rdi], rcx
jnz .retry ; CAS failed — rax has current value
; Atomic fetch-and-add
mov rax, 1
lock xadd [rdi], rax ; old value returned in rax
| Fence | Use case |
|---|---|
| mfence | Full barrier — rarely needed with lock instructions |
| sfence | Required after non-temporal stores (movnti, vmovntps) |
| lfence | Serialize dispatch (e.g., before rdtsc for timing) |
For ARM64 atomics (exclusive pairs, LSE, barriers), see references/arm64.md.
; Software breakpoint
int3 ; x86-64: SIGTRAP
brk #0 ; ARM64: debug exception
; Unreachable crash (mark dead code)
ud2 ; x86-64: guaranteed #UD
.inst 0x00000000 ; ARM64: SIGILL
; Stack canary check (GCC-style, Linux x86-64)
mov rax, [fs:0x28] ; prologue: load canary
mov [rsp + FRAME_SIZE - 8], rax
; ... function body ...
cmp rax, [fs:0x28] ; epilogue: verify
jne __stack_chk_fail
; pointer alignment check, not ; compare)module_function_sublabel (e.g., crypto_sha256_loop)equ / .equ with descriptive names, never magic numberscall; never leave dirty in epiloguealign 16 before hot loops; mandatory for aligned SIMD loadsPlan register usage before writing code. Document the mapping in a comment block:
; Register plan:
; rbx = base pointer (callee-saved, survives calls)
; r12 = loop counter (callee-saved)
; rdi/rsi = scratch / pass to callees
; xmm0-xmm3 = SIMD accumulators
Rules:
rbx, r12–r15 / x19–x28)callpush/pop count — each saves 2 cycles + stack spacestp/ldp pairs (always two at a time)| Rule | Rationale |
|---|---|
| Profile first, optimize second | Don't write ASM unless profiling proves the bottleneck |
| lea over imul for small multiplies | lea rax, [rdi+rdi*2] = 1 cycle; imul = 3 cycles |
| vzeroupper after AVX code | Avoids SSE-AVX transition penalty (Haswell–Ice Lake) |
| Multiple accumulators in reductions | Breaks dependency chains; uses more execution ports |
| cmov/csel for unpredictable branches | Avoids ~15-cycle misprediction; skip if branch >95% predictable |
| Align hot loops to 16/32 bytes | Reduces I-cache splits and front-end stalls |
| Avoid div/idiv in loops | 20-40 cycles; constant divisors → multiply+shift |
| pause in spinloops | Reduces power; avoids memory-order pipeline flush (x86) |
| Match load/store widths | Prevents store-forwarding stalls |
| movzx eax, X over mov al, X | Avoids partial-register stalls and false dependencies |
| Break dependency chains | Split serial chains: add rax,rbx / add rcx,rdx > add rax,rbx / add rax,rdx |
| Unroll by 2–4× in tight loops | Reduces branch overhead; exposes ILP to out-of-order engine |
| Count down to zero | dec rcx / jnz saves a cmp; ZF set automatically |
For latency tables and alignment guidelines, see references/x86-64.md. For ARM64 tips (stp/ldp pairs, cbz/tbnz, NEON lane costs), see references/arm64.md.
| Mistake | Fix |
|---|---|
| Missing shadow space on Win64 | Always sub rsp, 32 before call on Windows |
| Stack misaligned before call | RSP must be 16-byte aligned at the call instruction |
| Forgetting syscall clobbers rcx, r11 | Use r10 for arg4; save rcx/r11 if needed |
| Using push/pop on ARM64 | Use stp/ldp with explicit offsets |
| Missing vzeroupper after AVX | Required before calling SSE/non-AVX code |
| movaps on unaligned address | Use movups unless alignment guaranteed |
| Modifying x18 on macOS ARM64 | Reserved platform register — never touch |
| mov al, X then reading rax | Partial-register stall; use movzx eax, byte [X] |
| Red zone on Win64 / signal handlers | No red zone on Win64; Linux leaf-only |
| Signed vs unsigned condition codes | jl/jg (signed) vs jb/ja (unsigned) — wrong choice = subtle bugs |
| shr clears OF but doesn't set SF usefully | Test result separately after logical shifts; don't rely on flags |
| imul single-operand vs two/three | 1-op imul rdi writes rdx:rax; 2-op imul rax, rdi keeps only low 64 |
| Forgetting lock on shared atomics | cmpxchg/xadd without lock are not atomic on SMP |
| Redundant cmp before conditional jump | test rax, rax is shorter than cmp rax, 0; dec/sub already sets ZF |
Load on demand during development:
| File | When to load | |---|---| | references/x86-64.md | Register table, instruction selection, SSE/AVX/AVX-512 patterns, Win64 details, atomics, ABI edge cases, BMI1/BMI2, branchless patterns, NASM macros, toolchain commands | | references/arm64.md | Register table, NEON/SVE patterns, atomics, AAPCS edge cases, Apple Silicon specifics, branchless patterns, GAS macros | | assets/function-template-x64.asm | NASM function template with System V + Win64 conditional assembly | | assets/function-template-arm64.s | GAS ARM64 function template with prologue/epilogue |
development
White-box auditing methodology for AI-generated ('vibe-coded') applications. Focuses on modern stack misconfigurations (Supabase, Next.js, Vercel).
development
Hybrid AI/Deterministic SAST methodology for discovering zero-day vulnerabilities in source code. Orchestrates structural search with AI-driven data flow and sink validation.
development
Auth assessment: hardware/embedded methodology; UART/JTAG/SWD/SPI/I2C, firmware extraction, boot/debug paths, embedded OS evidence.
devops
Container methodology: Identifying containerization limits, Docker/K8s misconfigurations, and executing escapes to the host node.