[Odin] Token iterator and prefix/suffix matching
✂️
LLM Tokeniser
Week 13, 2026
package main
import "core:fmt"
import "core:slice"
import "core:strings"
// Text to tokenise: the contents of "input.txt" loaded via the #load directive,
// or the literal fallback string when that load is not available (or_else).
TEXT := #load("input.txt", string) or_else "I'll be back - T800"
// Entry point: tokenises TEXT and prints one piece per line.
//
// Non-alphabetic tokens are printed unchanged. Each alphabetic token is split
// into an optional known prefix, a middle part and an optional known suffix
// (matched case-insensitively); every non-empty piece is printed on its own
// line using the token's original casing.
main :: proc () {
	prefixes := [?] string { "un", "re", "in", "dis", "pre", "mis", "ex", "anti", "inter", "sub", "over", "trans", "auto", "semi", "hyper", "multi", "co", "under", "il", "im" }
	suffixes := [?] string { "ed", "ing", "ly", "ful", "less", "ness", "able", "ible", "ment", "ion", "er", "or", "ist", "ism", "ity", "ize", "ous", "ic", "al", "hood", "ship", "age" }

	// Longest affixes first (ties broken alphabetically), so that the first
	// match found below is also the longest one (e.g. "inter" before "in").
	slice.sort_by(prefixes[:], cmp_len_abc)
	slice.sort_by(suffixes[:], cmp_len_abc)

	// Lower-cased copy of TEXT, used only for prefix/suffix matching.
	// It is built byte-for-byte (ASCII letters only) so that it always has the
	// same length as TEXT: the start/end offsets reported by token_next are byte
	// offsets into TEXT and must stay valid for this copy as well. A rune-based
	// lowering can change the byte length of non-ASCII or invalid UTF-8 input,
	// which would shift every later offset. All affixes are lower-case ASCII,
	// so lowering ASCII letters is sufficient for matching.
	lc_buf := make([]byte, len(TEXT))
	defer delete(lc_buf)
	for i in 0..<len(TEXT) {
		c := TEXT[i]
		lc_buf[i] = (c>='A' && c<='Z') ? c + ('a' - 'A') : c
	}
	text_lc := string(lc_buf)

	it := token_iterate(TEXT)
	for type, token, start, end in token_next(&it) {
		if type != .abc {
			fmt.println(token)
			continue
		}

		prefix    := ""
		middle_oc := token               // original casing, this is what gets printed
		middle_lc := text_lc[start:end]  // lower-cased view of the same bytes, used for matching
		suffix    := ""

		for p in prefixes {
			if strings.starts_with(middle_lc, p) {
				prefix    = middle_oc[:len(p)]
				middle_oc = middle_oc[len(p):]
				middle_lc = middle_lc[len(p):]
				break
			}
		}

		// The suffix is looked up in what remains after the prefix was removed,
		// so prefix and suffix never overlap.
		for s in suffixes {
			if strings.ends_with(middle_lc, s) {
				suffix    = middle_oc[len(middle_oc)-len(s):]
				middle_oc = middle_oc[:len(middle_oc)-len(s)]
				break
			}
		}

		if prefix    != "" do fmt.println(prefix)
		if middle_oc != "" do fmt.println(middle_oc)
		if suffix    != "" do fmt.println(suffix)
	}
}
// Kind of token produced by token_next.
Token_Type :: enum {
	none, // zero value: no token has been started
	pun,  // a single punctuation byte (see is_pun)
	abc,  // a run started by an ASCII letter (see is_abc)
	num,  // a run started by an ASCII digit (see is_num)
}
// Iteration state for token_next.
Token_Iterator :: struct {
// The text being tokenised.
text: string,
// Byte offset into text where the next call to token_next resumes scanning.
index: int,
}
// Creates an iterator positioned at the start of `text`, ready to be passed
// (by pointer) to token_next.
token_iterate :: proc (text: string) -> Token_Iterator {
	it: Token_Iterator // zero-initialised, so index starts at 0
	it.text = text
	return it
}
// Returns the next token of the iterator's text and advances the iterator.
//
// Results:
//   type       - kind of the token (.pun, .abc or .num)
//   token      - the token itself, a slice of it.text
//   start, end - byte offsets of the token in it.text (end is exclusive)
//   ok         - false once no further token can be found; all other results
//                are then zero values
//
// Separator bytes (is_sep) end a running token and are otherwise skipped.
// A punctuation byte (is_pun) is always a token of its own.
// A run of letters ends where a digit begins and vice versa.
// Bytes that belong to none of these classes neither start nor end a token.
token_next :: proc (it: ^Token_Iterator) -> (type: Token_Type, token: string, start, end: int, ok: bool) {
	kind  := Token_Type.none
	first := 0
	last  := 0 // 0 means "end not determined yet"; a real end is always >= 1

	scan: for it.index < len(it.text) {
		i := it.index
		c := it.text[i]

		if is_sep(c) {
			// A separator only matters when a token is in progress.
			if kind != .none {
				last = i
				break scan
			}
		} else if is_pun(c) {
			if kind == .none {
				// The punctuation byte itself is the token.
				kind  = .pun
				first = i
				last  = i+1
			} else {
				// Punctuation terminates the running token; it is picked up
				// as its own token by the next call.
				last = i
			}
			break scan
		} else if is_abc(c) || is_num(c) {
			c_kind: Token_Type = is_abc(c) ? .abc : .num
			if kind == .none {
				kind  = c_kind
				first = i
			} else if kind != c_kind {
				// Switch between letters and digits: the running token ends here.
				last = i
				break scan
			}
		}

		it.index += 1
	}

	if kind == .none do return

	if last == 0 {
		// The text ran out while a token was still in progress.
		assert(it.index == len(it.text))
		last = it.index
	}

	it.index = last
	return kind, it.text[first:last], first, last, true
}
// Ordering predicate: reports whether `a` sorts before `b`.
// Longer strings come first; strings of equal length are ordered by `<`.
cmp_len_abc :: proc (a, b: string) -> bool {
	if len(a) != len(b) {
		return len(a) > len(b)
	}
	return a < b
}
// Reports whether `c` is an ASCII decimal digit ('0' through '9').
is_num :: proc (c: byte) -> bool {
	switch c {
	case '0'..='9':
		return true
	}
	return false
}
// Reports whether `c` is an ASCII letter, upper or lower case.
is_abc :: proc (c: byte) -> bool {
	switch c {
	case 'A'..='Z', 'a'..='z':
		return true
	}
	return false
}
// Reports whether `c` separates tokens: space, horizontal tab, line feed,
// carriage return, vertical tab or form feed.
// The bell character ('\a') is also accepted, as it always has been here.
is_sep :: proc (c: byte) -> bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\v', '\f', '\a':
		return true
	}
	return false
}
// Reports whether `c` is an ASCII punctuation or symbol character, i.e. any
// printable ASCII character that is not a letter, a digit or the space.
// https://web.alfredstate.edu/faculty/weimandn/miscellaneous/ascii/ASCII%20Conversion%20Chart.gif
is_pun :: proc (c: byte) -> bool {
	switch c {
	case 33..=47,   // ! " # $ % & ' ( ) * + , - . /
	     58..=64,   // : ; < = > ? @
	     91..=96,   // [ \ ] ^ _ `
	     123..=126: // { | } ~
		return true
	}
	return false
}