commit 3ee9d8312c2784a36f0f51cda63ba987597e3fef
parent a3094c0c26956e84c96ed93bfa85c5e5ff49edc0
Author: Christian Ermann <christianermann@gmail.com>
Date: Sat, 6 Apr 2024 16:21:12 -0400
Add prose to Section 4
Diffstat:
M | forth.asm | | | 389 | +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------ |
1 file changed, 243 insertions(+), 146 deletions(-)
diff --git a/forth.asm b/forth.asm
@@ -186,7 +186,6 @@ name_#label_:
label_:
dq code_#label_ ; 7. Set the codeword.
section '.text' code readable executable
-
align 1
code_#label_: ; 8. This is where our assembly code will go.
}
@@ -335,248 +334,346 @@ set_program initialize_stack_s3, program_s3
;
; Section 4 - Reading Words
;
+; In this section, we'll define words for processing user input and even build
+; a simple interpreter. At this point, the interpreter will only be able to
+; execute words that we've already defined. We'll need to do a bit more work
+; before we can start defining new words in the interpreter.
+;
defcode "KEY", 3, 0, KEY
+; Read a character.
+;
call sys_read_char
push rax
NEXT
defcode "WORD", 4, 0, WORD_
+; Read a word.
+;
+; A word is considered to be a group of ASCII characters surrounded by white
+; space.
+;
+; WORD is implemented using a pattern that we'll use a lot: We'll define
+; a label '_WORD' that holds most of the implementation, aside from stack
+; manipulation. This lets us easily re-use the implementation in other
+; primitive words; we just have to make sure we're not overwriting important
+; registers when we do this.
+;
call _WORD
- push rcx
- push rdx
+ push rdi ; word name address
+ push rcx ; word name length
NEXT
_WORD:
-.skip_whitespace_and_comments:
+; Read a word.
+;
+; Returns:
+; RDI: word name address
+; RCX: word name length
+;
+
+.skip_whitespace:
+; Read characters until we reach one that isn't white space.
+;
+; After a non-whitespace character is found, RDI is used to hold the address
+; of the next character to be stored.
+;
call sys_read_char
- cmp al, 0x20
- je .skip_whitespace_and_comments
+ cmp al, ' '
+ je .skip_whitespace
cmp al, 0xA
- je .skip_whitespace_and_comments
+ je .skip_whitespace
cmp al, 0xD
- je .skip_whitespace_and_comments
+ je .skip_whitespace
- mov rdx, .buffer
+ mov rdi, .buffer
.store_char:
- mov byte [rdx], al
- inc rdx
+; Store a character into the buffer.
+;
+; There are some assembly instructions such as 'stos', 'lods', etc. that
+; perform string operations for us. These instructions require us to use
+; registers in specific ways, so it's always a good idea to double check what
+; an instruction does.
+;
+; In this instance, 'stosb' stores the byte in AL at address RDI, then
+; increments RDI to point at the next byte in the buffer.
+;
+ stosb
.next_char:
+; Keep processing characters until we find the end of the word.
+;
call sys_read_char
- cmp al, 0x20
+ cmp al, ' '
je .end
cmp al, 0xA
+ je .end
+ cmp al, 0xD
jne .store_char
.end:
+; Return buffer address in RDI and length in RCX.
+;
mov rcx, .buffer
- sub rdx, rcx
+ sub rdi, rcx
+ xchg rdi, rcx
ret
defcode "FIND", 4, 0, FIND
- pop rcx ; string length
- pop rdi ; string address
- push rsi
+; Find a word in the dictionary.
+;
+; We search the dictionary starting from the end (the newest words) and move
+; towards the beginning (the oldest words).
+;
+ pop rcx ; word name length
+ pop rdi ; word name address
call _FIND
- pop rsi
- push rdx
+ push rdx ; word address
NEXT
-postpone {
_FIND:
+; Find a word in the dictionary.
+;
+; Args:
+; RDI: word name address
+; RCX: word name length
+;
+; Returns:
+; RDX: word address
+;
mov rdx, [latest]
+ jmp .check_out_of_words
+
+.next_word:
+; Move to the next word in the dictionary.
+;
+ mov rdx, [rdx]
-.match_word:
- test rdx, rdx ; check if null (start of dictionary / out of words)
+.check_out_of_words:
+; Are we at the beginning of the dictionary?
+;
+ test rdx, rdx
je .end
- movzx rax, byte [rdx + 8] ; al = flags
+.check_hidden:
+; Is the word we're looking at hidden?
+;
+ movzx rax, byte [rdx + 8] ; al = word flags
test al, FLAG_HIDDEN
- jnz .next_word
-
- movzx rax, byte [rdx + 9] ; al = name length
-
- ; TEMP: print current word for debugging
- ;push rax
- ;push rcx
- ;push rdx
- ;push rdi
- ;lea rcx, [rdx + 10]
- ;mov rdx, rax
- ;call sys_print_string
- ;pop rdi
- ;pop rdx
- ;pop rcx
- ;pop rax
-
- cmp cl, al ; check if word lengths match
jne .next_word
- push rdi ; Save word address
- push rcx ; Save word length
- lea rsi, [rdx + 10] ; Get word name address
+.check_length:
+; Is the word we're looking at the right length?
+;
+ movzx rax, byte [rdx + 9] ; al = word name length
+ cmp cl, al
+ jne .next_word
+
+.check_names:
+; Do the words actually match?
+;
+; We compare the names with 'cmpsb', another of the special string
+; instructions mentioned earlier. A single 'cmpsb' compares the byte at RSI
+; with the byte at RDI, then steps both registers to the next byte.
+;
+; The 'repe' prefix repeats 'cmpsb', decrementing RCX each time, until RCX is
+; 0 or a difference between the strings is found.
+;
+; This means that RSI, RDI, and RCX will all be modified. Since we need the
+; original values of these registers after the loop, we have to make sure to
+; save them to the stack, and then restore them afterwards.
+;
+ push rsi
+ push rdi
+ push rcx
- ; RSI = src string
- ; RDI = dst string
- ; RCX = # of characters to compare
+ lea rsi, [rdx + 10] ; rsi = word name address of current entry
repe cmpsb
+
pop rcx
pop rdi
+ pop rsi
jne .next_word
.end:
ret
-.next_word:
- mov rdx, [rdx]
- jmp .match_word
-}
-
defcode ">CFA", 4, 0, TO_CFA
- pop rdi
+; Convert the address of a word into its code field address.
+;
+; The code field address, or CFA, is the memory address of the code that
+; actually implements this word.
+;
+ pop rdx
call _TO_CFA
- push rdi
+ push rdx
NEXT
_TO_CFA:
- xor rax, rax
- movzx rax, byte [rdi + 9] ; al = name length
-
- ; skip past flags, length, and name, then align to 8-byte boundary
- lea rdi, [rdi + 9 + rax + 7]
- and rdi, 0xFFFFFFFFFFFFFFF8
-
+; Convert the address of a word into its code field address.
+;
+; The implementation of this word uses a common method for aligning an address
+; to the nearest 8-byte boundary. First, we add 8-1=7 to our address. This
+; ensures we're within the 8-byte region we want to end up in. Next we flip all
+; the bits for 7, then do a bit-by-bit AND with the current address to drop
+; down to the actual 8-byte boundary. It's weird the first time you see it, so
+; it can be useful to work it out on paper.
+;
+; Args:
+; RDX: word address
+;
+; Returns:
+; RDX: word CFA
+;
+ movzx rax, byte [rdx + 9] ; al = name length
+ lea rdx, [rdx + 10 + rax + 7] ; rdx > link, flags, length, and name
+ and rdx, 0xFFFFFFFFFFFFFFF8 ; rdx = code field address
ret
defcode ">NUMBER", 7, 0, TO_NUMBER
- pop rcx ; string length
- pop rdi ; string address
+; Convert a string into a number.
+;
+; The digits permitted in the string depend on the value of the `base` variable
+; defined at the bottom of this file.
+;
+ pop rcx ; string length
+ pop rdi ; string address
call _TO_NUMBER
- push rax ; number
- push rcx ; # of unparsed characters (0 => no error)
+ push rbx ; parsed number
+ push rcx ; # of unparsed characters (0 => no error)
NEXT
_TO_NUMBER:
- ; Convert a string into a number.
- ; Args:
- ; RDI: string address
- ; RCX: string length
- ;
- ; Returns:
- ; RAX: parsed number
- ; RCX: number of unparsed characters
-
- ; initialize parsed number to 0
+; Convert a string into a number.
+;
+; The number is initialized as 0 and is parsed incrementally. As the string is
+; parsed from left to right, the existing number must be multiplied by the
+; value of `base` before adding each new digit. We use the R8 register as a
+; sign flag, where 0 means positive and 1 means negative.
+;
+; Args:
+; RDI: string address
+; RCX: string length
+;
+; Returns:
+; RBX: parsed number
+; RCX: # of unparsed characters (0 => no error)
+;
xor rax, rax
+ xor rbx, rbx
+ xor r8, r8
+ push rsi
+ mov rsi, rdi
- ; if string is empty, handle error
+.check_empty_string:
+; Is the string empty?
+;
+; If so, we can't do any parsing and should give up. Otherwise, we initialize
+; RDX to hold the value of `base` and we start reading characters into RAX.
+;
test rcx, rcx
- jz .empty_string
+ jz .handle_empty_string
- ; RDX: base
- ; RBX: current character
movzx rdx, byte [BASE]
- movzx rbx, byte [rdi] ; read first character
- inc rdi
-
- ; if positive
- ; - push 0 on stack
- ; - jump to '.to_ascii'
- push rax
- cmp bl, 0x2D
- jnz .to_ascii
+ lodsb
- ; if negative
- ; - push '-' on stack
- pop rax
- push rbx
+.check_sign:
+; Does the string begin with '-'?
+;
+ cmp al, '-'
+ jnz .to_numeric_value
+ inc r8
dec rcx
+ lodsb
- ; if remaining string is non-empty
- ; - jump to '.read_char'
- jnz .read_char
+.check_negative_empty_string:
+; Is '-' the only character in the string?
+;
+; If so, we return the number 0 with 1 unparsed character.
+;
+ test rcx, rcx
+ jnz .to_numeric_value
- ; if remaining string is empty
- ; - return 1 as number of unparsed characters
- pop rbx
mov rcx, 1
- ret
-
-.empty_string:
- mov rcx, -1
+ pop rsi
ret
.next_char:
- imul rax, rdx ; multiply value by base
-
-.read_char:
- movzx rbx, byte [rdi]
- inc rdi
+; Start parsing the next character.
+;
+; We also have to multiply the number by 'base' to make room for the next
+; digit.
+;
+ imul rbx, rdx
+ lodsb
-.to_ascii:
- ; if value below "0"
- ; - stop parsing and return
- sub rbx, 0x30
+.to_numeric_value:
+; Convert an ASCII character to its numeric value.
+;
+; The characters '0'-'9' are represented by the ASCII codes 48-57, so we can
+; convert them to the right value by subtracting 48. If the value is below 0
+; at that point, an invalid character was passed, and we should stop parsing.
+; If it's below 10, we're good to start comparing to the value of 'base'. For
+; values of 10 or more, the only valid characters are 'A'-'F' which are
+; represented by the ASCII codes 65-70. Since we've already subtracted 48,
+; we need to subtract 7 to line each character up with the value they
+; represent (A=10, ..., F=15). If the value is now below 10, we should stop
+; parsing as an invalid character was passed. Otherwise, we're good to start
+; comparing to the value of 'base'.
+;
+ sub rax, 48
jb .handle_sign
-
- ; if value is below "10"
- ; - start parsing the next character
- cmp rbx, 10
+ cmp rax, 10
jb .check_base
-
- ; if value is below "A"
- ; - stop parsing and return
- sub rbx, 17
+ sub rax, 7
+ cmp rax, 10
jb .handle_sign
- ; if value is above "A"
- ; - convert to numeric value
- ; - start parsing the next character
- add rbx, 10
-
.check_base:
- ; if value is greater than or equal to base
- ; - stop parsing and return
- cmp rbx, rdx
+; Check if the value is less than the value of 'base'.
+;
+; If the value is greater than or equal to 'base', we should stop parsing.
+; Otherwise, we add the value to our number and parse the next character.
+;
+ cmp rax, rdx
jge .handle_sign
- ; if value is less than the base
- ; - start parsing the next character
- add rax, rbx
- dec rcx
- jnz .next_char
+ add rbx, rax
+ loop .next_char
.handle_sign:
- pop rbx
- test rbx, rbx
+; If the sign flag was set, negate the number.
+;
+ test r8, r8
jz .end
- neg rax
+ neg rbx
.end:
+ pop rsi
+ ret
+
+.handle_empty_string:
+; If an empty string was detected, return -1 as an error code.
+;
+ mov rcx, -1
+ pop rsi
ret
defcode "INTERPRET", 9, 0, INTERPRET
call _WORD
-
- mov rdi, rcx
- mov rcx, rdx
-
- push rsi
call _FIND
- pop rsi
test rdx, rdx
jz .try_number
- mov rdi, rdx
- call _TO_CFA
+ mov rbx, rdx ; rbx = word address
+ call _TO_CFA ; rdx = word CFA
.check_state:
mov rax, [state]
@@ -584,28 +681,28 @@ defcode "INTERPRET", 9, 0, INTERPRET
jz .execute_word
.check_immediate:
- movzx rax, byte [rdx + 8]
+ movzx rax, byte [rbx + 8]
test al, FLAG_IMMEDIATE
jnz .execute_word
- mov rax, rdi
+ mov rax, rdx
call _COMMA
NEXT
.execute_word:
- mov rax, rdi
+ mov rax, rdx
jmp qword [rax]
.try_number:
call _TO_NUMBER
test rcx, rcx
jnz .not_found
- mov rbx, rax
.check_state_number:
mov rax, [state]
test rax, rax
jz .execute_number
+.compile_number:
mov rax, LIT
call _COMMA
mov rax, rbx