boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

lint.sh (2815B)


      1 #!/bin/sh
      2 ## lint.sh — catch undefined P1 tokens before they reach M1.
      3 ##
      4 ## M1 silently passes undefined tokens through as literal text, so a
      5 ## misspelled ld_r0,r4,0 (when only ld_r0,r4,8 is defined) produces
      6 ## a runnable-but-SIGILL-ing binary with no diagnostic. This script
      7 ## extracts every op-shaped token referenced in a .M1 program and
      8 ## asserts each has a matching `DEFINE` in the per-arch defs file.
      9 ##
     10 ## After the 2026 rename, op tokens are lowercase identifiers that may
     11 ## contain ',' for multi-operand forms (li_r0, mov_r4,r0, add_r1,r1,r2,
     12 ## prologue_n3, sys_write). The tokenizer must skip `"…"` / `'…'`
     13 ## quoted literals (can span lines and carry prose like "usage: lisp")
     14 ## and `#`/`;` line comments, so the pass is written in Python.
     15 ##
     16 ## Usage: lint.sh <p1_arch.M1> <prog.M1> [<prog.M1> ...]
     17 ## Exit:  0 on success; 1 + diagnostic on any missing token; 2 on misuse.
     18 
     19 set -eu
     20 
     21 if [ "$#" -lt 2 ]; then
     22     echo "usage: $0 <p1_arch.M1> <prog.M1> [<prog.M1> ...]" >&2
     23     exit 2
     24 fi
     25 
     26 exec python3 - "$@" <<'PYEOF'
import re
import sys

# Invoked as `python3 - <args>`: argv[0] is '-', real arguments follow.
defs_path = sys.argv[1]    # per-arch defs file containing DEFINE lines
prog_paths = sys.argv[2:]  # one or more M1 program sources to lint
     33 def defined_names(path):
     34     names = set()
     35     with open(path) as f:
     36         for line in f:
     37             parts = line.split(None, 2)
     38             if len(parts) >= 2 and parts[0] == 'DEFINE':
     39                 names.add(parts[1])
     40     return names
     41 
     42 def tokenize_source(text):
     43     """Emit op-shaped lowercase tokens from M0/M1 source, skipping
     44     `"…"` / `'…'` quoted literals and `#`/`;` line comments. Quotes may
     45     span newlines (strings are line-unaware in M0), comments end at LF."""
     46     i, n = 0, len(text)
     47     tokens = []
     48     cur = []
     49     def flush():
     50         if cur:
     51             tokens.append(''.join(cur))
     52             cur.clear()
     53     while i < n:
     54         c = text[i]
     55         if c == '"' or c == "'":
     56             flush()
     57             q = c
     58             i += 1
     59             while i < n and text[i] != q:
     60                 i += 1
     61             i += 1  # consume closing quote (or end)
     62         elif c == '#' or c == ';':
     63             flush()
     64             while i < n and text[i] != '\n':
     65                 i += 1
     66         elif c.isspace():
     67             flush()
     68             i += 1
     69         else:
     70             cur.append(c)
     71             i += 1
     72     flush()
     73     return tokens
     74 
     75 TOKEN_RE = re.compile(r'^[a-z][a-z0-9_,]*$')
     76 
     77 defs = defined_names(defs_path)
     78 used = set()
     79 for p in prog_paths:
     80     with open(p) as f:
     81         for tok in tokenize_source(f.read()):
     82             if TOKEN_RE.match(tok):
     83                 used.add(tok)
     84 
     85 missing = sorted(t for t in used if t not in defs)
     86 if missing:
     87     sys.stderr.write(
     88         f'error: P1 lint: undefined token(s) referenced in M1 source\n'
     89         f'       (defs file: {defs_path})\n'
     90     )
     91     for m in missing:
     92         sys.stderr.write(f'    {m}\n')
     93     sys.exit(1)
     94 PYEOF