boot2

Playing with the bootstrap
git clone https://git.ryansepassi.com/git/boot2.git
Log | Files | Refs | README

lint.sh (2884B)


      1 #!/bin/sh
      2 ## lint.sh — catch undefined P1 tokens before they reach M1.
      3 ##
      4 ## M1 silently passes undefined tokens through as literal text, so a
      5 ## misspelled ld_r0,r4,0 (when only ld_r0,r4,8 is defined) produces
      6 ## a runnable-but-SIGILL-ing binary with no diagnostic. This script
      7 ## extracts every op-shaped token referenced in a .M1 program and
      8 ## asserts each has a matching `DEFINE` in the per-arch defs file.
      9 ##
     10 ## After the 2026 rename, op tokens are lowercase identifiers that may
     11 ## contain ',' for multi-operand forms (li_r0, mov_r4,r0, add_r1,r1,r2,
     12 ## prologue_n3, sys_write). The tokenizer must skip `"…"` / `'…'`
     13 ## quoted literals (can span lines and carry prose like "usage: lisp")
     14 ## and `#`/`;` line comments, so the pass is written in Python.
     15 ##
     16 ## Env: ARCH=aarch64|amd64|riscv64
     17 ## Usage: lint.sh <prog.M1> [<prog.M1> ...]
     18 ## Exit:  0 on success; 1 + diagnostic on any missing token; 2 on misuse.
     19 
     20 set -eu
     21 
     22 : "${ARCH:?ARCH must be set}"
     23 
     24 if [ "$#" -lt 1 ]; then
     25     echo "usage: ARCH=<arch> $0 <prog.M1> [<prog.M1> ...]" >&2
     26     exit 2
     27 fi
     28 
     29 exec python3 - "P1/P1-$ARCH.M1" "$@" <<'PYEOF'
     30 import re
     31 import sys
     32 
     33 defs_path = sys.argv[1]
     34 prog_paths = sys.argv[2:]
     35 
     36 def defined_names(path):
     37     names = set()
     38     with open(path) as f:
     39         for line in f:
     40             parts = line.split(None, 2)
     41             if len(parts) >= 2 and parts[0] == 'DEFINE':
     42                 names.add(parts[1])
     43     return names
     44 
     45 def tokenize_source(text):
     46     """Emit op-shaped lowercase tokens from M0/M1 source, skipping
     47     `"…"` / `'…'` quoted literals and `#`/`;` line comments. Quotes may
     48     span newlines (strings are line-unaware in M0), comments end at LF."""
     49     i, n = 0, len(text)
     50     tokens = []
     51     cur = []
     52     def flush():
     53         if cur:
     54             tokens.append(''.join(cur))
     55             cur.clear()
     56     while i < n:
     57         c = text[i]
     58         if c == '"' or c == "'":
     59             flush()
     60             q = c
     61             i += 1
     62             while i < n and text[i] != q:
     63                 i += 1
     64             i += 1  # consume closing quote (or end)
     65         elif c == '#' or c == ';':
     66             flush()
     67             while i < n and text[i] != '\n':
     68                 i += 1
     69         elif c.isspace():
     70             flush()
     71             i += 1
     72         else:
     73             cur.append(c)
     74             i += 1
     75     flush()
     76     return tokens
     77 
     78 TOKEN_RE = re.compile(r'^[a-z][a-z0-9_,]*$')
     79 
     80 defs = defined_names(defs_path)
     81 used = set()
     82 for p in prog_paths:
     83     with open(p) as f:
     84         for tok in tokenize_source(f.read()):
     85             if TOKEN_RE.match(tok):
     86                 used.add(tok)
     87 
     88 missing = sorted(t for t in used if t not in defs)
     89 if missing:
     90     sys.stderr.write(
     91         f'error: P1 lint: undefined token(s) referenced in M1 source\n'
     92         f'       (defs file: {defs_path})\n'
     93     )
     94     for m in missing:
     95         sys.stderr.write(f'    {m}\n')
     96     sys.exit(1)
     97 PYEOF