.remote-cache/kreuzberg-shared-rules/.ai-rulez/skills/ffi-and-language-interop-standards/SKILL.md
______________________________________________________________________ ## priority: critical # FFI and Language Interop Standards ## C API Design Principles FFI code is a **contract** between languages. Breaking this contract causes crashes. Take extra care. ### 1. Explicit Ownership & Lifetime Every pointer must have clear ownership semantics. Use SAFETY comments. ```rust /// Allocate string owned by caller /// # Safety /// Caller must call `htm2md_free_string()` to release memory #[no_m
npx skillsauth add kreuzberg-dev/html-to-markdown .remote-cache/kreuzberg-shared-rules/.ai-rulez/skills/ffi-and-language-interop-standards
Install this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
FFI code is a contract between languages. Breaking this contract causes crashes. Take extra care.
Every pointer must have clear ownership semantics. Use SAFETY comments.
/// Copies a NUL-terminated UTF-8 C string into a new Rust-allocated C string.
///
/// Returns NULL when `text` is NULL, is not valid UTF-8, or cannot be
/// re-encoded as a C string.
///
/// # Safety
/// - `text` must be NULL or point to a NUL-terminated string that stays valid
///   for the duration of this call.
/// - The caller owns the returned pointer and must release it with
///   `htm2md_free_string()`.
#[no_mangle]
pub unsafe extern "C" fn htm2md_new_string(text: *const c_char) -> *mut c_char {
    if text.is_null() {
        return std::ptr::null_mut();
    }
    // SAFETY: `text` is non-NULL (checked above) and NUL-terminated per the
    // caller contract documented under `# Safety`.
    let borrowed = CStr::from_ptr(text);
    borrowed
        .to_str()
        .ok()
        .and_then(|utf8| CString::new(utf8).ok())
        // `into_raw` transfers ownership of the allocation to the caller.
        .map_or(std::ptr::null_mut(), CString::into_raw)
}
/// Releases a string that was handed out by Rust.
///
/// Passing NULL is a no-op.
///
/// # Safety
/// - `ptr` must be NULL or a pointer allocated by `htm2md_new_string()`.
/// - `ptr` must not be used after this call.
/// - `ptr` must not be freed by the caller after this call.
#[no_mangle]
pub unsafe extern "C" fn htm2md_free_string(ptr: *mut c_char) {
    if ptr.is_null() {
        return;
    }
    // SAFETY: `ptr` is non-NULL (checked above) and, per the caller contract,
    // came from `CString::into_raw`; rebuilding the `CString` and dropping it
    // returns the allocation to Rust's allocator.
    drop(CString::from_raw(ptr));
}
Never expose raw Rust types across FFI. Use opaque handles.
use std::ffi::CStr;
use std::os::raw::c_char;
// Opaque handle = void* to callers, real type internally
/// Pointer-sized handle to a heap-allocated `html_to_markdown::HtmlConverter`.
///
/// The handle is `Copy` because it is passed by value across the FFI boundary
/// and the same handle value is handed to several calls (convert, then free);
/// copying the handle copies only the pointer, never the converter.
#[repr(transparent)]
#[derive(Clone, Copy)]
pub struct HtmlConverterHandle(*mut html_to_markdown::HtmlConverter);

impl HtmlConverterHandle {
    /// Wraps a raw converter pointer without taking any action on it.
    fn from_ptr(ptr: *mut html_to_markdown::HtmlConverter) -> Self {
        HtmlConverterHandle(ptr)
    }

    /// Unwraps the handle back into the raw converter pointer.
    fn into_ptr(self) -> *mut html_to_markdown::HtmlConverter {
        self.0
    }

    /// Borrows the converter behind the handle, or `None` for a NULL handle.
    fn as_ref(&self) -> Option<&html_to_markdown::HtmlConverter> {
        // SAFETY: `as_ref` on a raw pointer only rules out NULL. A non-NULL
        // pointer must still point to a live, properly aligned converter;
        // that is the contract callers of the FFI functions must uphold.
        unsafe { self.0.as_ref() }
    }
}
/// Allocates a new converter on the heap and returns a handle to it.
///
/// # Safety
/// The caller owns the returned handle and must release it with
/// `htm2md_converter_free()`.
#[no_mangle]
pub unsafe extern "C" fn htm2md_converter_new() -> HtmlConverterHandle {
    // `Box::into_raw` leaks the box on purpose: ownership moves to the caller.
    let raw = Box::into_raw(Box::new(html_to_markdown::HtmlConverter::new()));
    HtmlConverterHandle::from_ptr(raw)
}
/// Get version string
///
/// NOTE(review): the inline comments below mark this body as a cautionary
/// example; as written it does not appear to uphold the lifetime promised in
/// the `# Safety` section.
/// # Safety
/// - `handle` must be valid converter from `htm2md_converter_new()`
/// - Returned string is valid until `htm2md_converter_free(handle)` called
#[no_mangle]
pub unsafe extern "C" fn htm2md_converter_version(
handle: HtmlConverterHandle,
) -> *const c_char {
// A NULL handle yields a NULL result instead of being dereferenced.
match handle.as_ref() {
Some(conv) => {
let version = conv.version();
// Danger: String on stack! Must use thread-local or static
// BAD: Will be freed when function returns
// NOTE(review): if `version()` returns `String` or `&str`, the bytes are
// presumably not NUL-terminated either — confirm against its signature.
version.as_ptr() as *const c_char
}
None => std::ptr::null(),
}
}
Always check for null pointers:
/// Converts an HTML document to Markdown.
///
/// Returns a newly allocated C string on success. Returns NULL when the
/// handle is NULL, `html` is NULL, `html` is not valid UTF-8, the conversion
/// fails, or the output cannot be represented as a C string.
///
/// # Safety
/// - `handle` must be a valid converter (a NULL handle is rejected).
/// - `html` must be a NUL-terminated UTF-8 string, or NULL.
/// - A non-NULL result was produced by `CString::into_raw`; elsewhere in this
///   document it is released with `htm2md_free_string()`.
#[no_mangle]
pub unsafe extern "C" fn htm2md_convert(
    handle: HtmlConverterHandle,
    html: *const c_char,
) -> *mut c_char {
    // Guard 1: NULL handle.
    let converter = match handle.as_ref() {
        None => return std::ptr::null_mut(),
        Some(converter) => converter,
    };

    // Guard 2: NULL input.
    if html.is_null() {
        return std::ptr::null_mut();
    }

    // Guard 3: input must be valid UTF-8.
    let Ok(html_str) = CStr::from_ptr(html).to_str() else {
        return std::ptr::null_mut();
    };

    // Convert, then hand the result to the caller as an owned C string.
    converter
        .convert(html_str)
        .ok()
        .and_then(|markdown| CString::new(markdown).ok())
        .map_or(std::ptr::null_mut(), CString::into_raw)
}
/// Destroys a converter and releases its memory.
///
/// A NULL handle is ignored.
///
/// # Safety
/// - `handle` must be NULL or wrap a pointer produced by `Box::into_raw`
///   (as `htm2md_converter_new()` does) that has not been freed yet.
/// - The handle must not be used after this call.
#[no_mangle]
pub unsafe extern "C" fn htm2md_converter_free(handle: HtmlConverterHandle) {
    if handle.0.is_null() {
        return;
    }
    // SAFETY: non-NULL (checked above); per the caller contract the pointer
    // came from `Box::into_raw` and has not been freed, so rebuilding the box
    // and dropping it frees the converter exactly once.
    drop(Box::from_raw(handle.0));
}
cbindgen automatically generates C headers from Rust FFI code.
cbindgen.toml:
# Emit a C header.
language = "C"
header = "/* Auto-generated FFI header */"
# Matches the #ifndef/#define guard shown in the example generated header.
include_guard = "HTML_TO_MARKDOWN_FFI_H"
# NOTE(review): `namespace` looks like a C++-oriented option — confirm it has
# any effect when language = "C".
namespace = "htm2md"
autogen_warning = "// This is auto-generated from Rust source"
[defines]
# Each key is a Rust cfg condition; each value is the C macro it maps to.
"target_pointer_width = 32" = "HTM2MD_32BIT"
"target_pointer_width = 64" = "HTM2MD_64BIT"
[export]
# NOTE(review): confirm cbindgen's `exclude` accepts glob patterns; it may
# expect exact item names.
exclude = ["internal_*"]
# NOTE(review): confirm `prefix` is a valid key inside `mangle`; verify against
# the cbindgen configuration reference.
mangle = { "prefix" = "" }
Generate header:
# Install cbindgen
cargo install cbindgen
# Generate C header
cbindgen --crate html-to-markdown-ffi -o include/htm2md.h
# Verify in CI: regenerate the bindings and fail if the committed header differs.
# `--verify` compares against an existing file, so the output path must be given.
cargo-binstall cbindgen
cbindgen --verify --crate html-to-markdown-ffi -o include/htm2md.h
Example generated header:
#ifndef HTML_TO_MARKDOWN_FFI_H
#define HTML_TO_MARKDOWN_FFI_H
#include <stdint.h>
#include <stddef.h>

/* Opaque converter handle; its layout is private to the Rust library. */
typedef struct HtmlConverterHandle HtmlConverterHandle;

/* Creates a converter. Release it with htm2md_converter_free(). */
HtmlConverterHandle* htm2md_converter_new(void);

/*
 * Converts `html` to Markdown. Returns NULL on error.
 * The result is owned by the caller and must be released with
 * htm2md_free_string(), hence the non-const return type.
 */
char* htm2md_convert(HtmlConverterHandle* handle, const char* html);

/* Destroys a converter created by htm2md_converter_new(). */
void htm2md_converter_free(HtmlConverterHandle* handle);

/* Frees a string allocated by the Rust library. */
void htm2md_free_string(char* ptr);

#endif
Each pointer has exactly one owner responsible for freeing:
// BAD: Caller allocates, Rust frees, C uses? Disaster!
// Transfer of ownership unclear
// GOOD: Clear ownership
/// Example signature that states the ownership of every pointer (body elided).
/// # Safety
/// - `html` is only borrowed for the call; the caller keeps ownership of it.
/// - The returned string is allocated on the Rust side and the caller is
///   responsible for having it freed.
pub unsafe extern "C" fn html_to_markdown(
html: *const c_char, // Caller owns, Rust borrows (reads)
) -> *mut c_char { // Rust owns returned string, caller must free
// ...
}
Document when pointers become invalid:
/// Get error message from last operation
///
/// The body is elided in this example; it illustrates only how to document
/// the lifetime of a borrowed pointer.
/// # Safety
/// - `handle` must be valid
/// - Pointer is valid only until next `htm2md_*()` call on same handle
/// - Caller must NOT free returned string
#[no_mangle]
pub unsafe extern "C" fn htm2md_last_error(
handle: HtmlConverterHandle,
) -> *const c_char {
// Return borrowed reference to internal error string
// SAFE: Lives in Handle until next operation
}
// BAD: Returned string is static, don't free!
// The code is fine; what is missing is a doc comment telling the caller that
// the pointer refers to a NUL-terminated literal and must not be freed.
#[no_mangle]
pub unsafe extern "C" fn htm2md_get_version() -> *const c_char {
b"1.0.0\0".as_ptr() as *const c_char // DO NOT FREE
}
// GOOD: Clear documentation
/// Returns the version as a NUL-terminated C string.
///
/// # Safety
/// The returned pointer refers to static data and must NOT be freed by the
/// caller.
#[no_mangle]
pub unsafe extern "C" fn htm2md_get_version() -> *const c_char {
    // A C-string literal already carries its trailing NUL.
    const VERSION: &std::ffi::CStr = c"1.0.0";
    VERSION.as_ptr()
}
Every unsafe block needs a SAFETY comment explaining why it's safe:
// BAD: No explanation
unsafe {
Box::from_raw(ptr)
}
// GOOD: Explains safety invariant
// SAFETY: `ptr` is guaranteed to be:
// - non-NULL (checked above)
// - previously allocated by htm2md_converter_new()
// - not yet freed (lifetime contract enforced by handle)
unsafe {
Box::from_raw(ptr)
}
// GOOD: Complex case with multiple conditions
// SAFETY: Safe because:
// 1. `handle.as_ref()` returns `None` for a NULL pointer; validity and
//    alignment of a non-NULL pointer are guaranteed by the caller's contract
// 2. &self borrow prevents concurrent mutation
// 3. Returned slice references only data inside converter
// 4. Pointer lifetime is limited to handle's existence
unsafe {
handle.as_ref().unwrap().get_data()
}
Consistent null checking across all functions:
/// Convert HTML to Markdown (NULL-safe)
/// Returns NULL on error (including NULL input)
#[no_mangle]
pub unsafe extern "C" fn htm2md_convert(
handle: HtmlConverterHandle,
html: *const c_char,
) -> *mut c_char {
// All NULL cases return NULL
let converter = handle.as_ref()
.map(|h| h)
.unwrap_or_else(|| return std::ptr::null_mut())?;
if html.is_null() {
return std::ptr::null_mut();
}
// Process...
}
package htm2md
/*
#include <stdlib.h>
#include "htm2md.h"
*/
import "C"
import "errors"
import "unsafe"
// Converter wraps a native converter handle obtained from the C library.
type Converter struct {
	handle *C.struct_HtmlConverterHandle
}

// NewConverter creates a native converter and wraps it in a Converter.
func NewConverter() *Converter {
	return &Converter{handle: C.htm2md_converter_new()}
}
// Convert passes html to the native converter and returns its output.
// It returns an error when the native call yields a NULL result.
func (c *Converter) Convert(html string) (string, error) {
	// Copy the Go string into C memory; it is released when Convert returns.
	input := C.CString(html)
	defer C.free(unsafe.Pointer(input))

	output := C.htm2md_convert(c.handle, input)
	if output == nil {
		return "", errors.New("conversion failed")
	}
	// Hand the native string back to the library once it has been copied.
	defer C.htm2md_free_string(output)

	return C.GoString(output), nil
}
// Close frees the native converter. Calling it again is a no-op because the
// handle is cleared after the first call.
func (c *Converter) Close() {
	if c.handle == nil {
		return
	}
	C.htm2md_converter_free(c.handle)
	c.handle = nil
}
import java.lang.foreign.*;
import java.lang.invoke.MethodHandle;
/**
 * Thin Java binding over the native {@code htm2md_*} C API, built on the
 * Foreign Function &amp; Memory API ({@code java.lang.foreign}).
 */
public class HtmlToMarkdown {
    private static final Linker LINKER = Linker.nativeLinker();
    private static final SymbolLookup SYMBOLS;

    private static final MethodHandle HTM2MD_CONVERTER_NEW;
    private static final MethodHandle HTM2MD_CONVERT;
    private static final MethodHandle HTM2MD_CONVERTER_FREE;
    private static final MethodHandle HTM2MD_FREE_STRING;

    static {
        // Load the library first, then resolve symbols through the loader
        // lookup: the linker's default lookup does not include libraries
        // loaded with System.loadLibrary.
        System.loadLibrary("html_to_markdown");
        SYMBOLS = SymbolLookup.loaderLookup();

        HTM2MD_CONVERTER_NEW = LINKER.downcallHandle(
            SYMBOLS.find("htm2md_converter_new").orElseThrow(),
            FunctionDescriptor.of(ValueLayout.ADDRESS)
        );
        HTM2MD_CONVERT = LINKER.downcallHandle(
            SYMBOLS.find("htm2md_convert").orElseThrow(),
            FunctionDescriptor.of(
                ValueLayout.ADDRESS, // return: char*
                ValueLayout.ADDRESS, // handle
                ValueLayout.ADDRESS  // html
            )
        );
        HTM2MD_CONVERTER_FREE = LINKER.downcallHandle(
            SYMBOLS.find("htm2md_converter_free").orElseThrow(),
            FunctionDescriptor.ofVoid(ValueLayout.ADDRESS)
        );
        HTM2MD_FREE_STRING = LINKER.downcallHandle(
            SYMBOLS.find("htm2md_free_string").orElseThrow(),
            FunctionDescriptor.ofVoid(ValueLayout.ADDRESS)
        );
    }

    /**
     * Creates a native converter.
     *
     * @return the native handle; release it with {@link #freeConverter}
     * @throws Throwable if the native call fails
     */
    public static MemorySegment newConverter() throws Throwable {
        return (MemorySegment) HTM2MD_CONVERTER_NEW.invoke();
    }

    /**
     * Converts HTML to Markdown using the given native converter.
     *
     * @param handle handle returned by {@link #newConverter}
     * @param html   HTML input
     * @return the converted text
     * @throws RuntimeException if the native call returns NULL
     * @throws Throwable        if the native call itself fails
     */
    public static String convert(MemorySegment handle, String html) throws Throwable {
        // A confined arena frees the input copy as soon as the call is done.
        try (Arena arena = Arena.ofConfined()) {
            MemorySegment htmlSegment = arena.allocateUtf8String(html);
            MemorySegment result = (MemorySegment) HTM2MD_CONVERT.invoke(handle, htmlSegment);
            if (result.address() == 0) {
                throw new RuntimeException("Conversion failed");
            }
            try {
                // A pointer returned by native code is a zero-length segment;
                // it has to be widened before the string can be read.
                return result.reinterpret(Long.MAX_VALUE).getUtf8String(0);
            } finally {
                // The native side allocated the string; give it back to avoid a leak.
                HTM2MD_FREE_STRING.invoke(result);
            }
        }
    }

    /**
     * Destroys a native converter.
     *
     * @param handle handle returned by {@link #newConverter}; must not be used afterwards
     * @throws Throwable if the native call fails
     */
    public static void freeConverter(MemorySegment handle) throws Throwable {
        HTM2MD_CONVERTER_FREE.invoke(handle);
    }
}
#[cfg(test)]
mod ffi_tests {
    use super::*;

    /// A NULL handle must be rejected with a NULL result.
    #[test]
    fn test_null_handle() {
        let null_handle = HtmlConverterHandle(std::ptr::null_mut());
        let input = b"test\0".as_ptr() as *const c_char;
        let output = unsafe { htm2md_convert(null_handle, input) };
        assert!(output.is_null());
    }

    /// A NULL `html` pointer must be rejected with a NULL result.
    #[test]
    fn test_null_input() {
        unsafe {
            let converter = htm2md_converter_new();
            let output = htm2md_convert(converter, std::ptr::null());
            assert!(output.is_null());
            htm2md_converter_free(converter);
        }
    }

    /// Exercises the full allocate/convert/free cycle for leak checkers.
    #[test]
    fn test_memory_leak() {
        let html = CString::new("<h1>Test</h1>").unwrap();
        unsafe {
            let converter = htm2md_converter_new();
            let markdown = htm2md_convert(converter, html.as_ptr());
            htm2md_free_string(markdown); // Must free result
            htm2md_converter_free(converter); // Must free handle
            // Valgrind should show 0 leaks
        }
    }
}
Exposing Rust types directly:
// BAD: Callers can't safely use Rust internals
// (a Rust struct plus a `&self` method returning a Rust reference)
pub struct HtmlConverter { ... } // Opaque to C
pub fn get_config(&self) -> &ConversionConfig { ... }
// GOOD: Opaque handles only
// (both the argument and the return value are handle types)
pub fn get_config(handle: HtmlConverterHandle) -> ConfigHandle { ... }
Forgetting null checks:
// BAD: Assumes valid pointer
// `html` is never checked for NULL before `CStr::from_ptr`, and `unwrap()`
// panics when the bytes are not valid UTF-8.
pub unsafe extern "C" fn convert(html: *const c_char) -> *mut c_char {
let s = CStr::from_ptr(html).to_str().unwrap();
}
// GOOD: Check all inputs
// (the remainder of the function is elided in this example)
pub unsafe extern "C" fn convert(html: *const c_char) -> *mut c_char {
// NULL input is reported as NULL instead of being dereferenced.
if html.is_null() { return std::ptr::null_mut(); }
// Invalid UTF-8 is reported as NULL instead of panicking.
let s = match CStr::from_ptr(html).to_str() {
Ok(s) => s,
Err(_) => return std::ptr::null_mut(),
};
}
Returning pointers to local data (freed when the function returns):
// BAD: Buffer is freed after function returns
// `status` is a local `String`; it is dropped at the end of the function, so
// the pointer taken from it no longer refers to live memory. The bytes of a
// `String` are also not NUL-terminated.
pub unsafe extern "C" fn get_status() -> *const c_char {
let status = String::from("OK");
status.as_ptr() as *const c_char // DANGLING POINTER
}
// GOOD: Return static or heap-allocated data
/// Returns a pointer to the NUL-terminated byte-string literal `OK`.
///
/// # Safety
/// The pointer refers to a literal, not to an allocation made by this call.
pub unsafe extern "C" fn get_status() -> *const c_char {
    // The explicit `\0` makes the literal usable as a C string.
    const STATUS: &[u8] = b"OK\0";
    STATUS.as_ptr().cast()
}
tools
Convert HTML to Markdown, Djot, or plain text with structured extraction. Use when writing code that calls html-to-markdown APIs in Rust, Python, TypeScript, Go, Ruby, PHP, Java, C#, Elixir, R, C, or WASM. Covers installation, conversion, configuration, metadata extraction, document structure, and CLI usage.
development
Developer quick start guide with prerequisites, setup, and workflow commands
development
Common task runner commands for build, test, lint, and format workflows
tools
______________________________________________________________________ ## priority: high # Workspace Structure & Project Organization **Rust workspace** (Cargo.toml): crates/{kreuzberg,kreuzberg-py,kreuzberg-node,kreuzberg-ffi,kreuzberg-cli}, packages/ruby/ext/kreuzberg_rb/native, tools/{benchmark-harness,e2e-generator}, e2e/{rust,go}. **Language packages**: packages/{python,typescript,ruby,java,go} - thin wrappers around Rust core. **E2E tests**: Auto-generated from fixtures/ via tools/e2e