[Odin] Token iterator and prefix/suffix matching

✂️

LLM Tokeniser

Week 13, 2026

// A tiny whitespace/punctuation tokeniser with affix splitting:
// reads input.txt (falling back to a built-in string), walks it token by
// token, and prints each token — alphabetic tokens are further split into
// a known prefix, the remaining stem, and a known suffix, each printed on
// its own line.
package main

import "core:fmt"
import "core:slice"
import "core:strings"

// Input corpus; #load embeds input.txt at compile time, with a literal
// fallback when the file is absent.
TEXT := #load("input.txt", string) or_else "I'll be back - T800"

main :: proc () {
	// Candidate affixes. Sorted below by length (longest first) so the
	// greedy first-match loops prefer the longest affix, e.g. "inter"
	// before "in".
	prefixes := [?] string { "un", "re", "in", "dis", "pre", "mis", "ex", "anti", "inter", "sub", "over", "trans", "auto", "semi", "hyper", "multi", "co", "under", "il", "im" }
	suffixes := [?] string { "ed", "ing", "ly", "ful", "less", "ness", "able", "ible", "ment", "ion", "er", "or", "ist", "ism", "ity", "ize", "ous", "ic", "al", "hood", "ship", "age" }

	slice.sort_by(prefixes[:], cmp_len_abc)
	slice.sort_by(suffixes[:], cmp_len_abc)

	// Lowercased copy used only for case-insensitive affix matching; the
	// original-case bytes are what get printed.
	// NOTE(review): indexing text_lc with byte offsets from TEXT assumes
	// to_lower preserves byte positions — true for ASCII input; confirm if
	// non-ASCII text is expected.
	text_lc := strings.to_lower(TEXT) // for prefix/suffix matching

	it := token_iterate(TEXT)
	for type, token, start, end in token_next(&it) {
		// Punctuation and number tokens are printed verbatim; only
		// alphabetic tokens get affix splitting.
		if type != .abc {
			fmt.println(token)
			continue
		}

		// middle_oc: original-case remainder (printed);
		// middle_lc: lowercase shadow (matched against affixes).
		prefix := ""
		middle_oc := token
		middle_lc := text_lc[start:end]
		suffix := ""

		// Strip at most one prefix (longest candidates tried first).
		for p in prefixes do if strings.starts_with(middle_lc, p) {
			prefix = middle_oc[:len(p)]
			middle_oc = middle_oc[len(p):]
			middle_lc = middle_lc[len(p):]
			break
		}

		// Strip at most one suffix from whatever remains after the prefix.
		for s in suffixes do if strings.ends_with(middle_lc, s) {
			suffix = middle_oc[len(middle_oc)-len(s):]
			middle_oc = middle_oc[:len(middle_oc)-len(s)]
			break
		}

		// A short word may be consumed entirely by its affixes, so each
		// part is printed only when non-empty.
		if prefix != "" do fmt.println(prefix)
		if middle_oc != "" do fmt.println(middle_oc)
		if suffix != "" do fmt.println(suffix)
	}
}

// Token classes: .pun = single punctuation byte, .abc = ASCII-letter run,
// .num = ASCII-digit run. .none marks "no token in progress".
Token_Type :: enum { none, pun, abc, num }

// Cursor over `text`; `index` is the byte offset where the next scan resumes.
Token_Iterator :: struct {
	text: string,
	index: int,
}

token_iterate :: proc (text: string) -> Token_Iterator {
	return { text=text }
}

// Scans forward from it.index and yields the next token: its type, the
// token slice, its [start, end) byte range, and ok=false when the text is
// exhausted. Designed for use as a custom `for ... in` iterator.
token_next :: proc (it: ^Token_Iterator) -> (type: Token_Type, token: string, start, end: int, ok: bool) {
	// In-progress token. seq.end stays 0 while the token is still open;
	// it is only assigned when a boundary byte closes the token.
	seq: struct {
		type: Token_Type,
		start, end: int,
	}

	loop: for ; it.index < len(it.text); it.index += 1 {
		i := it.index
		c := it.text[i]
		switch {
		case is_sep(c):
			// Separators are skipped; one also terminates an open token.
			if seq.type != .none {
				seq.end = i
				break loop
			}
		case is_pun(c):
			// Punctuation is its own one-byte token, or it closes the
			// token already in progress; either way scanning stops here.
			if seq.type == .none {
				seq = { type=.pun, start=i, end=i+1 }
			} else {
				seq.end = i
			}
			break loop
		case is_abc(c), is_num(c):
			// Letters extend an .abc run, digits a .num run; switching
			// between the two classes closes the current token.
			c_type: Token_Type = is_abc(c) ? .abc : .num
			if seq.type != c_type {
				if seq.type == .none {
					seq = { type=c_type, start=i }
				} else {
					seq.end = i
					break loop
				}
			}
			// NOTE(review): bytes outside all four classes (e.g. >= 128)
			// match no case and are silently absorbed into the current
			// span — confirm this is intended for non-ASCII input.
		}
	}

	if seq.type != .none {
		// seq.end == 0 here can only mean the loop ran off the end of the
		// text with a token still open (no in-loop close ever assigns 0:
		// seq.type is .none at i == 0, and the pun case assigns i+1), so
		// the token extends to len(it.text).
		if seq.end == 0 {
			assert(it.index == len(it.text))
			seq.end = it.index
		}
		if seq.end != 0 {
			// Resume the next scan at the byte that closed this token
			// (the labelled breaks above skip the loop's increment).
			it.index = seq.end
			return seq.type, it.text[seq.start:seq.end], seq.start, seq.end, true
		}
	}
	return
}

// Sort key for affix tables: longer strings first; ties broken alphabetically.
cmp_len_abc :: proc (a, b: string) -> bool {
	return len(a) == len(b) ? a < b : len(a) > len(b)
}

// ASCII byte classifiers.
is_num :: proc (c: byte) -> bool { return c>='0' && c<='9' }
is_abc :: proc (c: byte) -> bool { return (c>='A' && c<='Z') || (c>='a' && c<='z') }
// NOTE(review): '\a' is the bell character — unusual as a separator; was
// '\v' or '\f' intended?
is_sep :: proc (c: byte) -> bool { return c==' ' || c=='\t' || c=='\n' || c=='\a' || c=='\r' }
// ASCII punctuation: the four printable ranges between the letter/digit runs.
is_pun :: proc (c: byte) -> bool {
	// https://web.alfredstate.edu/faculty/weimandn/miscellaneous/ascii/ASCII%20Conversion%20Chart.gif
	return (c>=33 && c<=47) || (c>=58 && c<=64) || (c>=91 && c<=96) || (c>=123 && c<=126)
}