Back to Hermes Agent

Optimization Reference

skills/creative/ascii-video/references/optimization.md

2026.6.5 · 23.7 KB
Original Source

Optimization Reference

See also: architecture.md · composition.md · scenes.md · shaders.md · inputs.md · troubleshooting.md

Hardware Detection

Detect the user's hardware at script startup and adapt rendering parameters automatically. Never hardcode worker counts or resolution.

CPU and Memory Detection

python
import multiprocessing
import platform
import shutil
import os

def detect_hardware():
    """Detect hardware capabilities and return render config.

    Returns:
        dict with keys "cpu_count", "workers", "mem_gb", "platform", "arch"
        and "has_ffmpeg". "workers" is always at least 1.
    """
    cpu_count = multiprocessing.cpu_count()

    # Leave 1-2 cores free for OS + ffmpeg encoding
    if cpu_count >= 16:
        workers = cpu_count - 2
    elif cpu_count >= 4:
        workers = cpu_count - 1
    else:
        workers = max(1, cpu_count)

    # Memory detection (platform-specific). mem_bytes stays None whenever the
    # platform is unknown, the probe fails, or /proc/meminfo has no MemTotal
    # line, so the fallback below always applies.
    fallback_mem_bytes = 8 * 1024**3  # assume 8GB on unknown
    mem_bytes = None
    try:
        system = platform.system()
        if system == "Darwin":
            import subprocess
            mem_bytes = int(subprocess.check_output(["sysctl", "-n", "hw.memsize"]).strip())
        elif system == "Linux":
            with open("/proc/meminfo") as f:
                for line in f:
                    if line.startswith("MemTotal"):
                        mem_bytes = int(line.split()[1]) * 1024  # value is in kB
                        break
    except Exception:
        # Detection is best-effort: any failure falls back to the default.
        mem_bytes = None
    if mem_bytes is None:
        mem_bytes = fallback_mem_bytes

    mem_gb = mem_bytes / (1024**3)

    # Each worker uses ~50-150MB depending on grid sizes
    # Cap workers if memory is tight
    mem_per_worker_mb = 150
    max_workers_by_mem = int(mem_gb * 1024 * 0.6 / mem_per_worker_mb)  # use 60% of RAM
    # Never drop below one worker, even when the memory budget rounds to zero
    workers = max(1, min(workers, max_workers_by_mem))

    # ffmpeg availability and codec support
    has_ffmpeg = shutil.which("ffmpeg") is not None

    return {
        "cpu_count": cpu_count,
        "workers": workers,
        "mem_gb": mem_gb,
        "platform": platform.system(),
        "arch": platform.machine(),
        "has_ffmpeg": has_ffmpeg,
    }

Adaptive Quality Profiles

Scale resolution, FPS, CRF, and grid density based on hardware:

python
def quality_profile(hw, target_duration_s, user_preference="auto"):
    """
    Choose render settings for the given hardware and quality preference.

    hw: hardware dict; reads "workers", plus "mem_gb" and "cpu_count" on the
        automatic path.
    target_duration_s: video length in seconds, used only to estimate render
        time on the automatic path.
    user_preference: "auto", "draft", "preview", "production", "max".
        Anything other than "draft", "preview" or "max" takes the automatic path.

    Returns a new dict with keys vw, vh, fps, crf, workers, grid_scale,
    shaders, particles_max.
    """
    def build(vw, vh, fps, crf, grid_scale, shaders, particles_max, workers=None):
        # workers defaults to everything the hardware dict offers
        return {
            "vw": vw, "vh": vh, "fps": fps, "crf": crf,
            "workers": hw["workers"] if workers is None else workers,
            "grid_scale": grid_scale, "shaders": shaders,
            "particles_max": particles_max,
        }

    # Explicit presets ignore the render-time estimate entirely
    if user_preference == "draft":
        return build(960, 540, 12, 28, 0.5, "minimal", 200,
                     workers=min(4, hw["workers"]))
    if user_preference == "preview":
        return build(1280, 720, 15, 25, 0.75, "standard", 500)
    if user_preference == "max":
        return build(3840, 2160, 30, 15, 2.0, "full", 3000)

    # "production" or "auto": estimate total render time at 24fps,
    # ~180ms per 1080p frame, spread across the available workers
    frame_count = int(target_duration_s * 24)
    seconds_per_frame = 0.18
    estimated_total_s = frame_count * seconds_per_frame / max(1, hw["workers"])

    low_end = hw["mem_gb"] < 4 or hw["cpu_count"] <= 2
    if low_end:
        # 720p, 15fps
        return build(1280, 720, 15, 23, 0.75, "standard", 500)

    if estimated_total_s > 3600:
        # Over an hour at 1080p: drop to 720p to speed up
        return build(1280, 720, 24, 20, 0.75, "standard", 800)

    # Standard production: 1080p 24fps
    return build(1920, 1080, 24, 20, 1.0, "full", 1200)


def apply_quality_profile(profile):
    """Copy resolution, frame rate and worker count from the profile into module globals."""
    module_globals = globals()
    # (global name, profile key), assigned in this order
    bindings = (("VW", "vw"), ("VH", "vh"), ("FPS", "fps"), ("N_WORKERS", "workers"))
    for global_name, profile_key in bindings:
        module_globals[global_name] = profile[profile_key]
    # Grid sizes scale with resolution
    # CRF passed to ffmpeg encoder
    # Shader set determines which post-processing is active

CLI Integration

python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--quality", choices=["draft", "preview", "production", "max", "auto"],
                    default="auto", help="Render quality preset")
parser.add_argument("--aspect", choices=["landscape", "portrait", "square"],
                    default="landscape", help="Aspect ratio preset")
parser.add_argument("--workers", type=int, default=0, help="Override worker count (0=auto)")
parser.add_argument("--resolution", type=str, default="", help="Override resolution e.g. 1280x720")
args = parser.parse_args()

hw = detect_hardware()
if args.workers > 0:
    hw["workers"] = args.workers
profile = quality_profile(hw, target_duration, args.quality)

# Apply aspect ratio preset (before manual resolution override)
ASPECT_PRESETS = {
    "landscape": (1920, 1080),
    "portrait":  (1080, 1920),
    "square":    (1080, 1080),
}
if args.aspect != "landscape" and not args.resolution:
    profile["vw"], profile["vh"] = ASPECT_PRESETS[args.aspect]

if args.resolution:
    # Accept "1280x720" or "1280X720"; report bad input as a normal CLI usage
    # error instead of an uncaught ValueError traceback.
    try:
        w, h = (int(part) for part in args.resolution.lower().split("x"))
    except ValueError:
        parser.error(f"--resolution must look like WIDTHxHEIGHT (e.g. 1280x720), got {args.resolution!r}")
    if w <= 0 or h <= 0:
        parser.error(f"--resolution dimensions must be positive, got {args.resolution!r}")
    profile["vw"], profile["vh"] = w, h
apply_quality_profile(profile)

log(f"Hardware: {hw['cpu_count']} cores, {hw['mem_gb']:.1f}GB RAM, {hw['platform']}")
log(f"Render:   {profile['vw']}x{profile['vh']} @{profile['fps']}fps, "
    f"CRF {profile['crf']}, {profile['workers']} workers")

Portrait Mode Considerations

Portrait (1080x1920) has the same pixel count as landscape 1080p, so performance is equivalent. But composition patterns differ:

| Concern | Landscape | Portrait |
|---|---|---|
| Grid cols at lg | 160 | 90 |
| Grid rows at lg | 45 | 80 |
| Max text line chars | ~50 centered | ~25-30 centered |
| Vertical rain | Short travel | Long, dramatic travel |
| Horizontal spectrum | Full width | Needs rotation or compression |
| Radial effects | Natural circles | Tall ellipses (aspect correction handles this) |
| Particle explosions | Wide spread | Tall spread |
| Text stacking | 3-4 lines comfortable | 8-10 lines comfortable |
| Quote layout | 2-3 wide lines | 5-6 short lines |

Portrait-optimized patterns:

  • Vertical rain/matrix effects are naturally enhanced — longer column travel
  • Fire columns rise through more screen space
  • Rising embers/particles have more vertical runway
  • Text can be stacked more aggressively with more lines
  • Radial effects work if aspect correction is applied (GridLayer handles this automatically)
  • Spectrum bars can be rotated 90 degrees (vertical bars from bottom)

Portrait text layout:

python
def layout_text_portrait(text, max_chars_per_line=25, grid=None):
    """Break text into short lines for portrait display.

    Words are packed greedily; a line may be exactly max_chars_per_line long.
    A single word longer than the limit is kept whole on its own line.

    Args:
        text: string to wrap; runs of whitespace are treated as one separator.
        max_chars_per_line: maximum line length in characters.
        grid: unused; accepted for interface compatibility.

    Returns:
        List of lines with no leading or trailing spaces and no empty entries.
    """
    lines = []
    current_words = []
    current_len = 0
    for word in text.split():
        # Length of the line if this word is appended (one joining space)
        needed = current_len + 1 + len(word) if current_words else len(word)
        if current_words and needed > max_chars_per_line:
            lines.append(" ".join(current_words))
            current_words = [word]
            current_len = len(word)
        else:
            # Also taken for an over-long first word, so no empty line is emitted
            current_words.append(word)
            current_len = needed
    if current_words:
        lines.append(" ".join(current_words))
    return lines

Performance Budget

Target: 100-200ms per frame (5-10 fps single-threaded, 40-80 fps across 8 workers).

| Component | Time | Notes |
|---|---|---|
| Feature extraction | 1-5ms | Pre-computed for all frames before render |
| Effect function | 2-15ms | Vectorized numpy, avoid Python loops |
| Character render | 80-150ms | Bottleneck -- per-cell Python loop |
| Shader pipeline | 5-25ms | Depends on active shaders |
| ffmpeg encode | ~5ms | Amortized by pipe buffering |

Bitmap Pre-Rasterization

Rasterize every character at init, not per-frame:

python
# At init time -- done once
# Draw each character into its own cell-sized grayscale image and cache the
# result, scaled to 0..1, keyed by the character.
for c in all_characters:
    img = Image.new("L", (cell_w, cell_h), 0)
    ImageDraw.Draw(img).text((0, 0), c, fill=255, font=font)
    bitmaps[c] = np.array(img, dtype=np.float32) / 255.0  # float32 for fast multiply

# At render time -- fast lookup
bitmap = bitmaps[char]
# Tint the glyph mask with the cell color, then max-blend into the canvas so
# pixels that are already brighter are kept.
canvas[y:y+ch, x:x+cw] = np.maximum(canvas[y:y+ch, x:x+cw],
                                      (bitmap[:,:,None] * color).astype(np.uint8))

Collect all characters from all palettes + overlay text into the init set. Lazy-init for any missed characters.

Pre-Rendered Background Textures

Alternative to _render_vf() for backgrounds where characters don't need to change every frame. Pre-bake a static ASCII texture once at init, then multiply by a per-cell color field each frame. One matrix multiply vs thousands of bitmap blits.

Use when: background layer uses a fixed character palette and only color/brightness varies per frame. NOT suitable for layers where character selection depends on a changing value field.

Init: Bake the Texture

python
# In GridLayer.__init__:
# Lookup tables from pixel position to grid cell: entry i is the grid row
# (or column) that pixel row (or column) i falls in, clamped to the grid so
# pixels outside it reuse the nearest edge cell.
self._bg_row_idx = np.clip(
    (np.arange(VH) - self.oy) // self.ch, 0, self.rows - 1
)
self._bg_col_idx = np.clip(
    (np.arange(VW) - self.ox) // self.cw, 0, self.cols - 1
)
# Cache of baked textures, keyed by palette
self._bg_textures = {}

def make_bg_texture(self, palette):
    """Return the static ASCII texture (grayscale float32, VH x VW) for a palette.

    The texture is baked on first request and cached in self._bg_textures.
    Characters are drawn from a fixed-seed generator, so a given palette
    always yields the same texture.
    """
    if palette in self._bg_textures:
        return self._bg_textures[palette]

    baked = np.zeros((VH, VW), dtype=np.float32)
    picker = random.Random(12345)
    # Usable glyphs: non-space palette characters that have a bitmap
    glyphs = [c for c in palette if c != " " and c in self.bm]
    if not glyphs:
        glyphs = list(self.bm.keys())[:5]

    cell_h, cell_w = self.ch, self.cw
    for row in range(self.rows):
        top = self.oy + row * cell_h
        if top + cell_h > VH:
            break  # this row and all below it would overflow the frame
        for col in range(self.cols):
            left = self.ox + col * cell_w
            if left + cell_w > VW:
                break  # rest of this row would overflow the frame
            baked[top:top + cell_h, left:left + cell_w] = self.bm[picker.choice(glyphs)]

    self._bg_textures[palette] = baked
    return baked

Render: Color Field x Cached Texture

python
def render_bg(self, color_field, palette=PAL_CIRCUIT):
    """Fast background: cached ASCII texture multiplied by a per-cell color field.

    color_field: (rows, cols, 3) uint8. Returns (VH, VW, 3) uint8.
    """
    glyph_mask = self.make_bg_texture(palette)
    # Pre-computed index maps give, for every pixel, the grid cell it belongs to
    pixel_rows = self._bg_row_idx[:, None]
    pixel_cols = self._bg_col_idx[None, :]
    per_pixel_color = color_field[pixel_rows, pixel_cols].astype(np.float32)
    shaded = glyph_mask[:, :, None] * per_pixel_color
    return shaded.astype(np.uint8)

Usage in a Scene

python
# Build per-cell color from effect fields (cheap — rows*cols, not VH*VW)
# NOTE(review): val, t, hsv2rgb and mkc are defined by the surrounding scene
# code; val presumably holds per-cell brightness in 0..1. Confirm.
hue = ((t * 0.05 + val * 0.2) % 1.0).astype(np.float32)
R, G, B = hsv2rgb(hue, np.full_like(val, 0.5), val)
color_field = mkc(R, G, B, g.rows, g.cols)  # (rows, cols, 3) uint8

# Render background — single matrix multiply, no per-cell loop
canvas_bg = g.render_bg(color_field, PAL_DENSE)

The texture init loop runs once and is cached per palette. Per-frame cost is one fancy-index lookup + one broadcast multiply — orders of magnitude faster than the per-cell bitmap blit loop in render() for dense backgrounds.

Coordinate Array Caching

Pre-compute all grid-relative coordinate arrays at init, not per-frame:

python
# These are O(rows*cols) and used in every effect
# rr is a column vector and cc a row vector, so expressions that combine
# them broadcast to a full (rows, cols) field.
self.rr = np.arange(rows)[:, None]    # row indices
self.cc = np.arange(cols)[None, :]    # col indices
# NOTE(review): dx and dy are not defined in this snippet; presumably
# per-cell offsets from the grid center. Confirm.
self.dist = np.sqrt(dx**2 + dy**2)   # distance from center
self.angle = np.arctan2(dy, dx)       # angle from center
self.dist_n = ...                      # normalized distance

Vectorized Effect Patterns

Avoid Per-Cell Python Loops in Effects

The render loop (compositing bitmaps) is unavoidably per-cell. But effect functions must be fully vectorized numpy -- never iterate over rows/cols in Python.

Bad (O(rows*cols) Python loop):

python
# Anti-pattern: one Python-level math call per grid cell
for r in range(rows):
    for c in range(cols):
        val[r, c] = math.sin(c * 0.1 + t) * math.cos(r * 0.1 - t)

Good (vectorized):

python
# One broadcast expression over the grid's cached column (cc) and row (rr) indices
val = np.sin(g.cc * 0.1 + t) * np.cos(g.rr * 0.1 - t)

Vectorized Matrix Rain

The naive per-column per-trail-pixel loop is the second biggest bottleneck after the render loop. Use numpy fancy indexing:

python
# Instead of nested Python loops over columns and trail pixels:
# build index arrays for all active trail pixels at once with broadcasting.
heads = np.asarray(S["ry"])[:cols].astype(np.intp)        # head row per column (truncated, like int())
trail_lens = np.asarray(S["rln"])[:cols].astype(np.intp)  # trail length per column
max_len = int(trail_lens.max()) if trail_lens.size else 0

offsets = np.arange(max_len)[:, None]     # (max_len, 1): distance behind the head
trail_rows = heads[None, :] - offsets     # (max_len, cols): row of each candidate pixel
# A candidate is active if it lies within its column's trail and on screen
active = (offsets < trail_lens[None, :]) & (trail_rows >= 0) & (trail_rows < rows)

# np.nonzero returns integer index arrays, so the fancy indexing below also
# works when no trail pixel is visible (a list-built empty array would be
# float64 and be rejected as an index).
oi, ac = np.nonzero(active)
ar = trail_rows[oi, ac]
af = (1.0 - oi / trail_lens[ac]).astype(np.float32)  # 1.0 at the head, fading along the trail

# Assign chars and colors in bulk using fancy indexing
ch[ar, ac] = ...  # vectorized char assignment
co[ar, ac, 1] = (af * bri * 255).astype(np.uint8)  # green channel

Vectorized Fire Columns

Same pattern -- accumulate index arrays, assign in bulk:

python
fire_val = np.zeros((rows, cols), dtype=np.float32)

# Flame height and vertical falloff do not depend on the column, so build
# them once instead of once per fire column.
height = int(energy * rows * 0.7)
dy = np.arange(min(height, rows))      # distance above the bottom row
fr = rows - 1 - dy                     # grid rows covered by the flame
frac = dy / max(height, 1)             # 0 at the base, approaching 1 at the tip
intensity = (1 - frac * 0.6) * (0.5 + rms * 0.5)

for fi in range(n_cols):
    # Evenly spaced columns with a small sinusoidal sway, wrapped to the grid
    fx_c = int((fi * cols / n_cols + np.sin(t * 2 + fi * 0.7) * 3) % cols)
    # Width spread: base columns wider at bottom
    for dx in range(-1, 2):  # 3-wide columns
        c = fx_c + dx
        if 0 <= c < cols:
            # Max-blend so overlapping columns keep the brighter value
            fire_val[fr, c] = np.maximum(fire_val[fr, c], intensity)
# Now map fire_val to chars and colors in one vectorized pass

PIL String Rendering for Text-Heavy Scenes

Alternative to per-cell bitmap blitting when rendering many long text strings (scrolling tickers, typewriter sequences, idea floods). Uses PIL's native ImageDraw.text() which renders an entire string in one C call, vs one Python-loop bitmap blit per character.

Typical win: a scene with 56 ticker rows renders 56 PIL text() calls instead of ~10K individual bitmap blits.

Use when: scene renders many rows of readable text strings. NOT suitable for sparse or spatially-scattered single characters (use normal render() for those).

python
from PIL import Image, ImageDraw

def render_text_layer(grid, rows_data, font):
    """Render dense text rows via PIL instead of per-cell bitmap blitting.

    Rows that would extend past the bottom of the frame are skipped one by
    one, so rows_data does not have to be sorted by row index.

    Args:
        grid: GridLayer instance (for oy, ch, ox, font metrics)
        rows_data: list of (row_index, text_string, rgb_tuple) — one per row
        font: PIL ImageFont instance (grid.font)

    Returns:
        uint8 array (VH, VW, 3) — canvas with rendered text
    """
    img = Image.new("RGB", (VW, VH), (0, 0, 0))
    draw = ImageDraw.Draw(img)
    for row_idx, text, color in rows_data:
        y = grid.oy + row_idx * grid.ch
        if y + grid.ch > VH:
            # Skip only this row; later entries may still be on screen
            continue
        draw.text((grid.ox, y), text, fill=color, font=font)
    return np.array(img)

Usage in a Ticker Scene

python
# Build ticker data (text + color per row)
# NOTE(review): build_ticker_text, hsv2rgb_scalar, hue and bri are defined by
# the surrounding scene code, not in this snippet.
rows_data = []
for row in range(n_tickers):
    text = build_ticker_text(row, t)       # scrolling substring
    color = hsv2rgb_scalar(hue, 0.85, bri) # (R, G, B) tuple
    rows_data.append((row, text, color))

# One PIL pass instead of thousands of bitmap blits
canvas_tickers = render_text_layer(g_md, rows_data, g_md.font)

# Blend with other layers normally
result = blend_canvas(canvas_bg, canvas_tickers, "screen", 0.9)

This is purely a rendering optimization — same visual output, fewer draw calls. The grid's render() method is still needed for sparse character fields where characters are placed individually based on value fields.

Bloom Optimization

Do NOT use scipy.ndimage.uniform_filter -- measured at 424ms/frame.

Use 4x downsample + manual box blur instead -- 84ms/frame (5x faster):

python
# NOTE(review): threshold, H and W are not defined in this snippet; H and W
# are presumably the canvas height and width. Confirm.
sm = canvas[::4, ::4].astype(np.float32)  # 4x downsample
br = np.where(sm > threshold, sm, 0)        # keep only pixels above the bloom threshold
for _ in range(3):                          # 3-pass manual box blur
    # Edge-pad by one pixel so every pixel has a full 3x3 neighborhood
    p = np.pad(br, ((1,1),(1,1),(0,0)), mode='edge')
    # Mean of the nine shifted views = 3x3 box filter
    br = (p[:-2,:-2] + p[:-2,1:-1] + p[:-2,2:] +
          p[1:-1,:-2] + p[1:-1,1:-1] + p[1:-1,2:] +
          p[2:,:-2] + p[2:,1:-1] + p[2:,2:]) / 9.0
# Upsample 4x by pixel repetition, then crop to the frame size
bl = np.repeat(np.repeat(br, 4, axis=0), 4, axis=1)[:H, :W]

Vignette Caching

Distance field is resolution- and strength-dependent, never changes per frame:

python
_vig_cache = {}
def sh_vignette(canvas, strength):
    """Darken the canvas toward its corners with a cached radial mask.

    The mask depends only on the canvas height and width and on strength
    rounded to two decimals, so it is built once per such combination.

    Args:
        canvas: (height, width, channels) image array.
        strength: falloff factor; larger values darken the edges more.

    Returns:
        uint8 array with the same shape as canvas.
    """
    height, width = canvas.shape[:2]
    # Build the mask from the same rounded value used in the key, so the
    # result does not depend on which nearby strength was requested first.
    rounded_strength = round(strength, 2)
    key = (height, width, rounded_strength)
    mask = _vig_cache.get(key)
    if mask is None:
        # Size the distance field from the canvas itself, so it always
        # matches the shape recorded in the cache key.
        Y = np.linspace(-1, 1, height)[:, None]
        X = np.linspace(-1, 1, width)[None, :]
        mask = np.clip(1.0 - np.sqrt(X**2 + Y**2) * rounded_strength, 0.15, 1).astype(np.float32)
        _vig_cache[key] = mask
    return np.clip(canvas * mask[:, :, None], 0, 255).astype(np.uint8)

Same pattern for CRT barrel distortion (cache remap coordinates).

Film Grain Optimization

Generate noise at half resolution, tile up:

python
# Round the half-resolution shape UP: with floor division an odd H or W gives
# an upsampled array one row/column short of the frame. The crop below
# removes any excess.
noise = np.random.randint(-amt, amt+1, ((H + 1)//2, (W + 1)//2, 1), dtype=np.int16)
noise = np.repeat(np.repeat(noise, 2, axis=0), 2, axis=1)[:H, :W]

2x blocky grain looks like film grain and costs 1/4 the random generation.

Parallel Rendering

Worker Architecture

python
hw = detect_hardware()
N_WORKERS = hw["workers"]

# Batch splitting (for non-clip architectures)
# Ceiling division: every frame lands in a batch, the last one may be shorter.
# NOTE(review): when n_frames is small relative to N_WORKERS, trailing batches
# can come out empty (start >= n_frames); filter them out when building the list.
batch_size = (n_frames + N_WORKERS - 1) // N_WORKERS
# Pseudo-code: one (batch_id, start_frame, end_frame, features, seg_path) tuple per worker
batches = [(i, i*batch_size, min((i+1)*batch_size, n_frames), features, seg_path) ...]

with multiprocessing.Pool(N_WORKERS) as pool:
    segments = pool.starmap(render_batch, batches)

Per-Clip Parallelism (Preferred for Segmented Videos)

python
from concurrent.futures import ProcessPoolExecutor, as_completed

with ProcessPoolExecutor(max_workers=N_WORKERS) as pool:
    # Submit one render job per clip and remember which clip each future belongs to
    futures = {}
    for seg, path in clip_args:
        job = pool.submit(render_clip, seg, features, path)
        futures[job] = seg["id"]

    # Report clips in completion order; a failed clip is logged, not re-raised
    for fut in as_completed(futures):
        clip_id = futures[fut]
        try:
            fut.result()
            log(f"  {clip_id} done")
        except Exception as e:
            log(f"  {clip_id} FAILED: {e}")

Worker Isolation

Each worker:

  • Creates its own Renderer instance (with full grid + bitmap init)
  • Opens its own ffmpeg subprocess
  • Has independent random seed (random.seed(batch_id * 10000))
  • Writes to its own segment file and stderr log

ffmpeg Pipe Safety

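CRITICAL: Never use stderr=subprocess.PIPE with a long-running ffmpeg process. Nothing drains the pipe while frames are being written, so once the stderr buffer fills (~64KB) ffmpeg blocks and the render deadlocks: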

python
# WRONG -- will deadlock
# stderr is a pipe that is never read while frames are being written to stdin
pipe = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE)

# RIGHT -- stderr to file
# The with-block and try/finally make sure stdin is closed, ffmpeg is waited
# on and the log file is closed even if writing a frame raises.
with open(err_path, "w") as stderr_fh:
    pipe = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=stderr_fh)
    try:
        ...  # write all frames to pipe.stdin
    finally:
        pipe.stdin.close()
        pipe.wait()

Concatenation

python
import os

with open(concat_file, "w") as cf:
    for seg in segments:
        # Write absolute paths: the concat demuxer resolves relative entries
        # against the list file's directory, not the working directory.
        # A single quote inside a quoted path must be written as '\''.
        seg_entry = os.path.abspath(seg).replace("'", "'\\''")
        cf.write(f"file '{seg_entry}'\n")

# -safe 0 lets the demuxer accept the absolute paths written above
cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", concat_file]
if audio_path:
    # Video is stream-copied; audio is encoded to AAC and the output ends with the shorter input
    cmd += ["-i", audio_path, "-c:v", "copy", "-c:a", "aac", "-b:a", "192k", "-shortest"]
else:
    cmd += ["-c:v", "copy"]
cmd.append(output_path)
# check=True raises CalledProcessError if ffmpeg exits non-zero
subprocess.run(cmd, capture_output=True, check=True)

Particle System Performance

Cap particle counts based on quality profile:

| System | Low | Standard | High |
|---|---|---|---|
| Explosion | 300 | 1000 | 2500 |
| Embers | 500 | 1500 | 3000 |
| Starfield | 300 | 800 | 1500 |
| Dissolve | 200 | 600 | 1200 |

Cull by truncating lists:

python
MAX_PARTICLES = profile.get("particles_max", 1200)
if len(S["px"]) > MAX_PARTICLES:
    for k in ("px", "py", "vx", "vy", "life", "char"):
        # Slice from an explicit start index: S[k][-MAX_PARTICLES:] would keep
        # everything when MAX_PARTICLES is 0, because [-0:] is the whole list.
        S[k] = S[k][max(len(S[k]) - MAX_PARTICLES, 0):]  # keep newest

Memory Management

  • Feature arrays: pre-computed for all frames, shared across workers via fork semantics (COW)
  • Canvas: allocated once per worker, reused (np.zeros(...))
  • Character arrays: allocated per frame (cheap -- rows*cols U1 strings)
  • Bitmap cache: ~500KB per grid size, initialized once per worker

Total memory per worker: ~50-150MB. Total: ~400-800MB for 8 workers.

For low-memory systems (< 4GB), reduce worker count and use smaller grids.

Brightness Verification

After render, spot-check brightness at sample timestamps:

python
for t in [2, 30, 60, 120, 180]:
    # Seek to t and decode exactly one frame as raw RGB bytes on stdout
    cmd = ["ffmpeg", "-ss", str(t), "-i", output_path,
           "-frames:v", "1", "-f", "rawvideo", "-pix_fmt", "rgb24", "-"]
    r = subprocess.run(cmd, capture_output=True)
    arr = np.frombuffer(r.stdout, dtype=np.uint8)
    if arr.size == 0:
        # No frame came back (e.g. timestamp past the end of the video, or
        # ffmpeg failed); max() of an empty array would raise.
        print(f"t={t}s  no frame decoded (ffmpeg exit code {r.returncode})")
        continue
    print(f"t={t}s  mean={arr.mean():.1f}  max={arr.max()}")

Target: mean > 5 for quiet sections, mean > 15 for active sections. If consistently below, increase brightness floor in effects and/or global boost multiplier.

Render Time Estimates

Scale with hardware. Baseline: 1080p, 24fps, ~180ms/frame/worker.

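| Duration | Frames | 4 workers | 8 workers | 16 workers |
|---|---|---|---|---|
| 30s | 720 | ~3 min | ~2 min | ~1 min |
| 2 min | 2,880 | ~13 min | ~7 min | ~4 min |
| 3.5 min | 5,040 | ~23 min | ~12 min | ~6 min |
| 5 min | 7,200 | ~33 min | ~17 min | ~9 min |
| 10 min | 14,400 | ~65 min | ~33 min | ~17 min |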

At 720p: multiply times by ~0.5. At 4K: multiply by ~4.

Heavier effects (many particles, dense grids, extra shader passes) add ~20-50%.


Temp File Cleanup

Rendering generates intermediate files that accumulate across runs. Clean up after the final concat/mux step.

Files to Clean

| File type | Source | Location |
|---|---|---|
| WAV extracts | `ffmpeg -i input.mp3 ... tmp.wav` | `tempfile.mktemp()` or project dir |
| Segment clips | `render_clip()` output | `segments/seg_00.mp4` etc. |
| Concat list | ffmpeg concat demuxer input | `segments/concat.txt` |
| ffmpeg stderr logs | piped to file for debugging | `*.log` in project dir |
| Feature cache | pickled numpy arrays | `*.pkl` or `*.npz` |

Cleanup Function

python
import glob
import tempfile
import shutil

def cleanup_render_artifacts(segments_dir="segments", keep_final=True):
    """Remove intermediate files after successful render.

    Call this AFTER verifying the final output exists and plays correctly.
    WAV and log files are matched in the current working directory.

    Setting the KEEP_INTERMEDIATES environment variable to anything other
    than "" or "0" skips cleanup entirely (useful for debugging).

    Args:
        segments_dir: directory containing segment clips and concat list;
            removed recursively if it exists
        keep_final: currently unused; accepted for interface compatibility

    Returns:
        List of descriptions of what was removed (empty when cleanup is skipped).
    """
    if os.environ.get("KEEP_INTERMEDIATES", "") not in ("", "0"):
        print("KEEP_INTERMEDIATES is set -- skipping cleanup")
        return []

    removed = []

    # 1. Segment clips
    if os.path.isdir(segments_dir):
        shutil.rmtree(segments_dir)
        removed.append(f"directory: {segments_dir}")

    # 2. Temporary WAV files (only names this pipeline generates)
    for wav in glob.glob("*.wav"):
        if wav.startswith(("tmp", "extracted_")):
            os.remove(wav)
            removed.append(wav)

    # 3. ffmpeg stderr logs
    # Named log_path so it cannot be confused with a log() helper
    for log_path in glob.glob("ffmpeg_*.log"):
        os.remove(log_path)
        removed.append(log_path)

    # 4. Feature cache (optional — useful to keep for re-renders)
    # for cache in glob.glob("features_*.npz"):
    #     os.remove(cache)
    #     removed.append(cache)

    print(f"Cleaned {len(removed)} artifacts: {removed}")
    return removed

Integration with Render Pipeline

Call cleanup at the end of the main render script, after the final output is verified:

python
# At end of main()
# Treat the render as successful only if the output file exists and is larger
# than 1000 bytes; otherwise keep the intermediates for inspection.
output_ok = os.path.exists(output_path) and os.path.getsize(output_path) > 1000
if not output_ok:
    print("WARNING: final output missing or empty — skipping cleanup")
else:
    cleanup_render_artifacts(segments_dir="segments")
    print(f"Done. Output: {output_path}")

Temp File Best Practices

  • Use tempfile.mkdtemp() for segment directories — avoids polluting the project dir
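  • Create WAV extracts with tempfile.mkstemp(suffix=".wav") (or tempfile.NamedTemporaryFile) so they live in the OS temp dir — avoid tempfile.mktemp(), which is deprecated and race-prone because it returns a name without creating the file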
  • For debugging, set KEEP_INTERMEDIATES=1 env var to skip cleanup
  • Feature caches (.npz) are cheap to store and expensive to recompute — default to keeping them