forth

A WIP implementation of Forth targeting UEFI x86-64.
git clone git://git.christianermann.dev/forth
Log | Files | Refs

commit 3ee9d8312c2784a36f0f51cda63ba987597e3fef
parent a3094c0c26956e84c96ed93bfa85c5e5ff49edc0
Author: Christian Ermann <christianermann@gmail.com>
Date:   Sat,  6 Apr 2024 16:21:12 -0400

Add prose to Section 4

Diffstat:
M forth.asm | 389 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
1 file changed, 243 insertions(+), 146 deletions(-)

diff --git a/forth.asm b/forth.asm @@ -186,7 +186,6 @@ name_#label_: label_: dq code_#label_ ; 7. Set the codeword. section '.text' code readable executable - align 1 code_#label_: ; 8. This is where our assembly code will go. } @@ -335,248 +334,346 @@ set_program initialize_stack_s3, program_s3 ; ; Section 4 - Reading Words ; +; In this section, we'll define words for processing user input and even build +; a simple interpreter. At this point, the interpreter will only be able to +; execute words that we've already defined. We'll need to do a bit more work +; before we can start defining new words in the interpreter. +; defcode "KEY", 3, 0, KEY +; Read a character. +; call sys_read_char push rax NEXT defcode "WORD", 4, 0, WORD_ +; Read a word. +; +; A word is considered to be a group of ASCII characters surrounded by white +; space. +; +; WORD is implemented using a pattern that we'll use a lot: We'll define +; a label '_WORD' that holds most of the implementation, aside from stack +; manipulation. This lets us easily re-use the implementation in other +; primitive words, we just have to make sure we're not overwriting important +; registers when we do this. +; call _WORD - push rcx - push rdx + push rdi ; word name address + push rcx ; word name length NEXT _WORD: -.skip_whitespace_and_comments: +; Read a word. +; +; Returns: +; RDI: word name address +; RCX: word name length +; + +.skip_whitespace: +; Read characters until we reach one that isn't white space. +; +; After a non-whitespace character is found, RDI is used to hold the address +; of the next character to be stored. +; call sys_read_char - cmp al, 0x20 - je .skip_whitespace_and_comments + cmp al, ' ' + je .skip_whitespace cmp al, 0xA - je .skip_whitespace_and_comments + je .skip_whitespace cmp al, 0xD - je .skip_whitespace_and_comments + je .skip_whitespace - mov rdx, .buffer + mov rdi, .buffer .store_char: - mov byte [rdx], al - inc rdx +; Store a character into the buffer. 
+; +; There are some assembly instructions such as 'stos', 'lods', etc. that +; perform string operations for us. These instructions require us to use +; registers in specific ways, so it's always a good idea to double check what +; an instruction does. +; +; In this instance, 'stosb' stores the byte in AL at address RDI, then +; increments RDI to point at the next byte in the buffer. +; + stosb .next_char: +; Keep processing characters until we find the end of the word. +; call sys_read_char - cmp al, 0x20 + cmp al, ' ' je .end cmp al, 0xA + je .end + cmp al, 0xD jne .store_char .end: +; Return buffer address in RDI and length in RCX. +; mov rcx, .buffer - sub rdx, rcx + sub rdi, rcx + xchg rdi, rcx ret defcode "FIND", 4, 0, FIND - pop rcx ; string length - pop rdi ; string address - push rsi +; Find a word in the dictionary. +; +; We search the dictionary starting from the end (the newest words) and move +; towards the beginning (the oldest words). +; + pop rcx ; word name length + pop rdi ; word name address call _FIND - pop rsi - push rdx + push rdx ; word address NEXT -postpone { _FIND: +; Find a word in the dictionary. +; +; Args: +; RDI: word name address +; RCX: word name length +; +; Returns: +; RDX: word address +; mov rdx, [latest] + jmp .check_out_of_words + +.next_word: +; Move to the next word in the dictionary. +; + mov rdx, [rdx] -.match_word: - test rdx, rdx ; check if null (start of dictionary / out of words) +.check_out_of_words: +; Are we at the beginning of the dictionary? +; + test rdx, rdx je .end - movzx rax, byte [rdx + 8] ; al = flags +.check_hidden: +; Is the word we're looking at hidden? 
+; + movzx rax, byte [rdx + 8] ; al = word flags test al, FLAG_HIDDEN - jnz .next_word - - movzx rax, byte [rdx + 9] ; al = name length - - ; TEMP: print current word for debugging - ;push rax - ;push rcx - ;push rdx - ;push rdi - ;lea rcx, [rdx + 10] - ;mov rdx, rax - ;call sys_print_string - ;pop rdi - ;pop rdx - ;pop rcx - ;pop rax - - cmp cl, al ; check if word lengths match jne .next_word - push rdi ; Save word address - push rcx ; Save word length - lea rsi, [rdx + 10] ; Get word name address +.check_length: +; Is the word we're looking at the right length? +; + movzx rax, byte [rdx + 9] ; al = word name length + cmp cl, al + jne .next_word + +.check_names: +; Do the words actually match? +; +; We use the 'cmpsb' instruction to compare the names of the words which is +; another of the special string instructions mentioned earlier. 'cmpsb' +; compares the strings referenced by RDI and RSI for up to RCX characters. +; +; The 'repe' prefix causes 'cmpsb' to be called repeatedly until RCX is 0 or a +; difference between the strings is found. +; +; This means that RSI, RDI, and RCX will all be modified. Since we need the +; original values of these registers after the loop, we have to make sure to +; save them to the stack, and then restore them afterwards. +; + push rsi + push rdi + push rcx - ; RSI = src string - ; RDI = dst string - ; RCX = # of characters to compare + lea rsi, [rdx + 10] ; rsi = word name address of current entry repe cmpsb + pop rcx pop rdi + pop rsi jne .next_word .end: ret -.next_word: - mov rdx, [rdx] - jmp .match_word -} - defcode ">CFA", 4, 0, TO_CFA - pop rdi +; Convert the address of a word into its code field address. +; +; The code field address, or CFA, is the memory address of the code that +; actually implements this word. 
+; + pop rdx call _TO_CFA - push rdi + push rdx NEXT _TO_CFA: - xor rax, rax - movzx rax, byte [rdi + 9] ; al = name length - - ; skip past flags, length, and name, then align to 8-byte boundary - lea rdi, [rdi + 9 + rax + 7] - and rdi, 0xFFFFFFFFFFFFFFF8 - +; Convert the address of a word into its code field address. +; +; The implementation of this word uses a common method for aligning an address +; to the next 8-byte boundary. First, we add 8-1=7 to our address. This +; ensures we're within the 8-byte region we want to end up in. Next we flip all +; the bits for 7, then do a bit-by-bit AND with the current address to drop +; down to the actual 8-byte boundary. It's weird the first time you see it, so +; it can be useful to work it out on paper. +; +; Args: +; RDX: word address +; +; Returns: +; RDX: word CFA +; + movzx rax, byte [rdx + 9] ; al = name length + lea rdx, [rdx + 10 + rax + 7] ; rdx > link, flags, length, and name + and rdx, 0xFFFFFFFFFFFFFFF8 ; rdx = code field address ret defcode ">NUMBER", 7, 0, TO_NUMBER - pop rcx ; string length - pop rdi ; string address +; Convert a string into a number. +; +; The digits permitted in the string depend on the value of the `base` variable +; defined at the bottom of this file. +; + pop rcx ; string length + pop rdi ; string address call _TO_NUMBER - push rax ; number - push rcx ; # of unparsed characters (0 => no error) + push rbx ; parsed number + push rcx ; # of unparsed characters (0 => no error) NEXT _TO_NUMBER: - ; Convert a string into a number. - ; Args: - ; RDI: string address - ; RCX: string length - ; - ; Returns: - ; RAX: parsed number - ; RCX: number of unparsed characters - - ; initialize parsed number to 0 +; Convert a string into a number. +; +; The number is initialized as 0 and is parsed incrementally. As the string is +; parsed from left to right, the existing number must be multiplied by the +; value of `base` before adding each new digit. 
We use the R8 register as a +; sign flag, where 0 means positive and 1 means negative. +; +; Args: +; RDI: string address +; RCX: string length +; +; Returns: +; RBX: parsed number +; RCX: # of unparsed characters (0 => no error) +; xor rax, rax + xor rbx, rbx + xor r8, r8 + push rsi + mov rsi, rdi - ; if string is empty, handle error +.check_empty_string: +; Is the string empty? +; +; If so, we can't do any parsing and should give up. Otherwise, we initialize +; RDX to hold the value of `base` and we start reading characters into RAX. +; test rcx, rcx - jz .empty_string + jz .handle_empty_string - ; RDX: base - ; RBX: current character movzx rdx, byte [BASE] - movzx rbx, byte [rdi] ; read first character - inc rdi - - ; if positive - ; - push 0 on stack - ; - jump to '.to_ascii' - push rax - cmp bl, 0x2D - jnz .to_ascii + lodsb - ; if negative - ; - push '-' on stack - pop rax - push rbx +.check_sign: +; Does the string begin with '-'? +; + cmp al, '-' + jnz .to_numeric_value + inc r8 dec rcx + lodsb - ; if remaining string is non-empty - ; - jump to '.read_char' - jnz .read_char +.check_negative_empty_string: +; Is '-' the only character in the string? +; +; If so, we return the number 0 with 1 unparsed character. +; + test rcx, rcx + jnz .to_numeric_value - ; if remaining string is empty - ; - return 1 as number of unparsed characters - pop rbx mov rcx, 1 - ret - -.empty_string: - mov rcx, -1 + pop rsi ret .next_char: - imul rax, rdx ; multiply value by base - -.read_char: - movzx rbx, byte [rdi] - inc rdi +; Start parsing the next character. +; +; We also have to multiply the number by 'base' to make room for the next +; digit. +; + imul rbx, rdx + lodsb -.to_ascii: - ; if value below "0" - ; - stop parsing and return - sub rbx, 0x30 +.to_numeric_value: +; Convert an ASCII character to its numeric value. +; +; The characters '0'-'9' are represented by the ASCII codes 48-57, so we can +; convert them to the right value by subtracting 48. 
If the value is below 0 +; at that point, an invalid character was passed, and we should stop parsing. +; If it's below 10, we're good to start comparing to the value of 'base'. For +; values of 10 or greater, the only valid characters are 'A'-'F' which are +; represented by the ASCII codes 65-70. Since we've already subtracted 48, +; we need to subtract 7 to line each character up with the value they +; represent (A=10, ..., F=15). If the value is below 10, then we should stop +; parsing as an invalid character was passed. Otherwise, we're good to start +; comparing to the value of 'base'. +; + sub rax, 48 jb .handle_sign - - ; if value is below "10" - ; - start parsing the next character - cmp rbx, 10 + cmp rax, 10 jb .check_base - - ; if value is below "A" - ; - stop parsing and return - sub rbx, 17 + sub rax, 7 + cmp rax, 10 jb .handle_sign - ; if value is above "A" - ; - convert to numeric value - ; - start parsing the next character - add rbx, 10 - .check_base: - ; if value is greater than or equal to base - ; - stop parsing and return - cmp rbx, rdx +; Check if the value is less than the value of 'base'. +; +; If the value is greater than or equal to 'base', we should stop parsing. Otherwise, we +; can add the value to our number and start parsing the next character. +; + cmp rax, rdx jge .handle_sign - ; if value is less than the base - ; - start parsing the next character - add rbx, rax - dec rcx - jnz .next_char + add rbx, rax + loop .next_char .handle_sign: - pop rbx - test rbx, rbx +; Check if the sign flag was set, then negate the number. +; + test r8, r8 jz .end - neg rax + neg rbx .end: + pop rsi + ret + +.handle_empty_string: +; If an empty string was detected, return -1 as an error code. 
+; + mov rcx, -1 + pop rsi ret defcode "INTERPRET", 9, 0, INTERPRET call _WORD - - mov rdi, rcx - mov rcx, rdx - - push rsi call _FIND - pop rsi test rdx, rdx jz .try_number - mov rdi, rdx - call _TO_CFA + mov rbx, rdx ; rbx = word address + call _TO_CFA ; rdx = word CFA .check_state: mov rax, [state] @@ -584,28 +681,28 @@ defcode "INTERPRET", 9, 0, INTERPRET jz .execute_word .check_immediate: - movzx rax, byte [rdx + 8] + movzx rax, byte [rbx + 8] test al, FLAG_IMMEDIATE jnz .execute_word - mov rax, rdi + mov rax, rdx call _COMMA NEXT .execute_word: - mov rax, rdi + mov rax, rdx jmp qword [rax] .try_number: call _TO_NUMBER test rcx, rcx jnz .not_found - mov rbx, rax .check_state_number: mov rax, [state] test rax, rax jz .execute_number +.compile_number: mov rax, LIT call _COMMA mov rax, rbx