[Python] Python Step by Step

✂️

LLM Tokeniser

Week 13, 2026

from itertools import groupby

# Affix tables are scanned in order and are sorted longest-first, so a longer
# affix such as "inter" is tried before a shorter one such as "in".
_PREFIXES = (
    "inter", "under", "multi", "hyper", "trans",
    "over", "auto", "semi", "anti",
    "dis", "pre", "mis", "sub",
    "un", "re", "in", "ex", "co", "il", "im",
)
_SUFFIXES = (
    "less", "ness", "able", "ible", "ment", "hood", "ship",
    "ing", "ful", "ion", "ist", "ism", "ity", "ize", "ous", "age",
    "ed", "ly", "er", "or", "ic", "al",
)
# Every character in this set is emitted as a token of its own.
_PUNCTUATION = frozenset('`~!@#$%^&*()-_+=[]{}\\|/"\':;><,.?')


def _split_punctuation(word: str) -> list[str]:
    """Split *word* so that each punctuation character is its own piece."""
    pieces: list[str] = []
    start = 0
    # Scan the word itself (not a lower-cased copy) so that the offsets used
    # for slicing always refer to the string being sliced.
    for pos, char in enumerate(word):
        if char in _PUNCTUATION:
            if pos > start:
                pieces.append(word[start:pos])
            pieces.append(char)
            start = pos + 1
    if start < len(word):
        pieces.append(word[start:])
    return pieces


def _split_digits(piece: str) -> list[str]:
    """Split *piece* into alternating runs of decimal digits and other text."""
    # groupby handles every run, including a single trailing digit and
    # several digit runs inside one piece ("a1b2" -> "a", "1", "b", "2").
    return ["".join(run) for _, run in groupby(piece, key=str.isdecimal)]


def _split_affixes(piece: str) -> list[str]:
    """Peel at most one known prefix and one known suffix off *piece*.

    Pieces shorter than two characters, all-digit pieces, and pieces that are
    exactly a known prefix are returned unchanged. The suffix search runs on
    whatever remains after the prefix (if any) was removed; a remainder that
    is exactly a known suffix is left whole.
    """
    if len(piece) < 2 or piece.isdecimal():
        return [piece]

    lowered = piece.lower()
    parts: list[str] = []
    stem = piece
    for prefix in _PREFIXES:
        if lowered == prefix:
            # The piece *is* a prefix: keep it whole and do not look for a
            # suffix. Only this piece is skipped, not the rest of the word.
            return [piece]
        if piece[:len(prefix)].lower() == prefix:
            parts.append(piece[:len(prefix)])
            stem = piece[len(prefix):]
            break

    stem_lowered = stem.lower()
    for suffix in _SUFFIXES:
        if stem_lowered == suffix:
            break
        if stem[-len(suffix):].lower() == suffix:
            parts.append(stem[:-len(suffix)])
            parts.append(stem[-len(suffix):])
            return parts

    parts.append(stem)
    return parts


def _tokenize(text: str) -> list[str]:
    """Return the tokens of *text* in order of appearance."""
    tokens: list[str] = []
    # split() with no argument separates on any whitespace and drops empty
    # words, so tabs and newlines can never end up inside a printed token.
    for word in text.split():
        for piece in _split_punctuation(word):
            for run in _split_digits(piece):
                tokens.extend(_split_affixes(run))
    return tokens


def main(text: str = "Example Text 123") -> None:
    """Tokenise *text* and print one token per line.

    The text is split on whitespace, then on punctuation (each punctuation
    character is its own token), then into digit and non-digit runs, and
    finally one leading prefix and one trailing suffix from the built-in
    tables are separated from each remaining piece. The original casing of
    the input is preserved in the output.

    Args:
        text: The text to tokenise. Defaults to the sample sentence.
    """
    for token in _tokenize(text):
        print(token)


if __name__ == "__main__":
    main()