Add EMIT and TYPE - forth - A WIP implementation of Forth targeting UEFI x86-64.

commit cbee5318a1ccf055bfd099a58bf7ab69c79937c2
parent 56302df696f373955d894acabd53ba29d235cf43
Author: Christian Ermann <christianermann@gmail.com>
Date:   Sat,  2 Dec 2023 11:27:21 -0500

Add EMIT and TYPE

Diffstat:
M forth.asm  | 289 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--

1 file changed, 283 insertions(+), 6 deletions(-)
diff --git a/forth.asm b/forth.asm
@@ -10,20 +10,297 @@ section '.text' code executable readable
 
 include 'efi.asm'
 
-main:
-; Initialize the system and print a version string.
+;------------------------------------------------------------------------------
+;
+; Foreword
+;
+; I am writing this as an alternative to the "jonesforth" implementation of
+; the Forth language for learning how to write a Forth. I'm attempting to
+; design my Forth in such a way that a user can interact with the system as
+; soon as possible, and further progress is broken into incremental steps. My
+; hope is that this also makes it easier to port to different architectures by
+; providing natural checkpoints to verify functionality as more features are
+; added.
+;
+;------------------------------------------------------------------------------
+;
+; Table of Contents
+; 1. The Basic Structure of Forth
+; 2. Executing this Program
+; 3. Hello, World!
+;
+;------------------------------------------------------------------------------
+;
+; Section 1: Basic Structure
+;
+; In this section, we'll learn how to represent words in the Forth language in
+; memory, and how to execute those words to do meaningful work.
+;
+; The Dictionary
+;
+; All words in Forth are stored in a dictionary, just like in English or in
+; French. Each entry in a Forth dictionary stores a reference to the previous
+; word (known as the link pointer), some flags to denote special properties of
+; certain words, the length of the word's name, the name of the word, and
+; finally the definition of the word. For memory alignment, there may be some
+; padding placed in between the name of the word and its definition. The
+; memory layout of an entry for soupforth can be seen below:
+;
+;     Dictionary Entry
+;     +-------------------------------+ <
+;     | link pointer | 8 bytes        | | Header
+;     +-------------------------------+ |
+;     | flags        | 1 byte         | |
+;     +-------------------------------+ |
+;     | length       | 1 byte         | |
+;     +-------------------------------+ |
+;     | name         | upto 255 bytes | |
+;     +-------------------------------+ |
+;     | padding      | upto 7 bytes   | |
+;     +-------------------------------+ <
+;     | definition   |                |
+;     +-------------------------------+
+;
+; All the words we define can be split into two main categories: primitive
+; words and Forth words. The terminology can be a bit confusing, as all the
+; words we define are part of our Forth, but what these terms actually compare
+; is the implementation of the words. Primitive words are implemented directly
+; in assembly, while Forth words are implemented in Forth itself.
+;
+; This distinction is mostly hidden from users of the language, however, the
+; choices you make affect the speed and portability of the implementation. As
+; the number of primitive words increases, so does the speed of the Forth
+; implementation. However, the fewer primitive words there are, the easier the
+; Forth is to port to new architectures.
+;
+; As implementers, we do need to be aware of the differences. For example, the
+; "definition" field, left blank in the diagram above, differs slightly between
+; primitive words and Forth words. Definitions always start with a codeword and
+; end with some sort of "return" statement, but the details of these codewords
+; and "return" statements differ between the two types of words.
+;
+; For primitive words, the codeword is the address of the assembly code that
+; implements the word. The return statement is called `NEXT` and takes care of
+; loading and jumping to the address of the **next** word.
+;
+; As we execute words, we'll use the RAX register to store the address of the
+; word that we're currently executing and we'll use the RSI register to store
+; the address of the next word we need to execute.
+;
+; This leads us to a fairly straightforward definition of a macro for NEXT:
+
+macro NEXT
+; Fetch the next word address from the thread and jump to that word's code.
+;
+; Expects RSI to point at a cell that holds the address of a codeword. LODSQ
+; also advances RSI by 8 (the direction flag must be clear), so RSI is left
+; pointing at the following cell.
+;
+{
+    lodsq           ; 1. Load value at address RSI into RAX
+    jmp qword [rax] ; 2. Jump to the address stored in the cell RAX points to
+}
+
+; For Forth words, the codeword is called DOCOL, short for "DO COLON", as Forth
+; definitions are started with a ":". DOCOL takes care of storing the address
+; of the next word we need to execute, and starting execution of the current
+; word. The return statement is called EXIT and loads the address of the next
+; word that DOCOL stored earlier and calls NEXT.
+;
+; In order to store and load addresses of words, Forth uses what's known as the
+; return stack. It's called the return stack as it stores the addresses we
+; **return** to at the end of a definition. We'll use the RBP register as our
+; return stack pointer.
+
+macro push_rs reg
+; Push the value of `reg` onto the return stack.
+;
+; RBP is the return stack pointer and the stack grows towards lower
+; addresses. LEA is used to move RBP so that the CPU flags are left untouched.
+;
+{
+    lea rbp, [rbp - 8] ; 1. Move the return stack pointer down 1 address.
+    mov [rbp], reg     ; 2. Push the word address onto the stack.
+}
+
+macro pop_rs reg
+; Pop the top value of the return stack into `reg`.
+;
+; Mirror image of push_rs: read the cell RBP points at, then move RBP back up
+; by one 8-byte cell. LEA is used so that the CPU flags are left untouched.
+;
+{
+    mov reg, [rbp]     ; 1. Pop a word address off of the stack.
+    lea rbp, [rbp + 8] ; 2. Move the return stack pointer up 1 address.
+}
+
+DOCOL:
+; Start execution of a Forth word.
+;
+; Reached through NEXT, so RAX holds the address of the codeword of the word
+; being entered. The cell right after that codeword is the first entry of the
+; word's body.
+;
+    push_rs rsi        ; 1. Save the next word's address onto the return stack.
+    lea rsi, [rax + 8] ; 2. Load the address of the first data word into RSI.
+    NEXT               ; 3. Execute the word pointed to by RSI.
+
+; As EXIT is included in the definition of every Forth word, it has to have an
+; entry in the dictionary. EXIT will be the first primitive word we define.
 ;
-; Later on, this will start the interpreter as well.
+; Let's revisit what the memory layout of our entry will look like before we
+; start defining anything:
 ;
+;   Memory layout of EXIT
+;
+;   section '.data'
+;   +-------------------------------+ <- name_EXIT
+;   | link pointer | 8 bytes        |
+;   +-------------------------------+
+;   | flags        | 1 byte         |
+;   +-------------------------------+
+;   | length       | 1 byte         |
+;   +-------------------------------+
+;   | name         | upto 255 bytes |
+;   +-------------------------------+
+;   | padding      | upto 7 bytes   |
+;   +-------------------------------+ <- EXIT
+;   | code_EXIT    | 8 bytes        |
+;   +-------------------------------+
+;
+;   section '.text'
+;   +-------------+ <- code_EXIT
+;   | pop_rs rsi  |
+;   | NEXT        |
+;   +-------------+
+;
+; Some notes about the labels:
+; 1. name_EXIT is the label for the start of the dictionary entry.
+; 2. EXIT is the label for the location of the codeword.
+; 3. code_EXIT is the label of the assembly code implementing EXIT.
+;
+; We'll need to re-create this memory structure for every primitive word we
+; define so it's worthwhile to write a `defcode` macro that does it for you:
+
+macro defcode name, name_length, flags, label_
+; Define a primitive word.
+;
+; Emits the dictionary header and the codeword for the word into a data
+; section, then switches back to a code section so that the instructions
+; written after the macro invocation become the word's implementation.
+;
+; Arguments:
+;   name        - the word's name, as a quoted string
+;   name_length - the length of `name`, stored as a single byte
+;   flags       - the flags byte stored in the header
+;   label_      - identifier used to build the three labels of the entry:
+;                 name_<label_> (start of the header), <label_> (the codeword)
+;                 and code_<label_> (the assembly implementation)
+;
+; Relies on `link` naming the previous dictionary entry when the macro is
+; invoked, and redefines `link` to name the entry that was just created.
+;
+{
+    section '.data' data readable
+    align 8
+name_#label_:
+    dq link               ; 1. Set the link pointer.
+    link equ name_#label_ ; 2. Update `link`.
+    db flags              ; 3. Set the flags.
+    db name_length        ; 4. Set the name length.
+    db name               ; 5. Set the name.
+
+    align 8               ; 6. Add any padding we may need.
+label_:
+    dq code_#label_       ; 7. Set the codeword.
+    section '.text' code readable executable
+
+    align 1
+code_#label_:             ; 8. This is where our assembly code will go.
+}
+
+; You'll have noticed that the macro expects the `link` variable to hold the
+; address of the previous word. As EXIT is our first word, we'll need to
+; initialize `link` to 0. The `defcode` macro will take care of updating `link`
+; for us as we define new words.
+;
+; After we set `link`, we can finally define our first primitive word:
+
+; `link` is an assembly-time symbolic constant, not a variable in memory: the
+; `defcode` macro reads it with `dq link` and redefines it with `link equ ...`.
+; Starting it as the constant 0 makes the first word's link pointer a real
+; null. Defining it with `dq` would instead emit 8 stray bytes into the code
+; section and make the first link pointer the address of those bytes.
+link equ 0
+
+defcode "EXIT", 4, 0, EXIT
+; Return from a Forth word.
+;
+; Undoes the push_rs performed by DOCOL: the address that was saved on the
+; return stack becomes the next word to execute.
+;
+    pop_rs rsi ; 1. Load the next word's address back into RSI.
+    NEXT       ; 2. Execute word pointed to by RSI.
+
+; It will be a while before we get a chance to actually use EXIT, but it gives
+; us a taste of where we're heading.
+;
+;------------------------------------------------------------------------------
+;
+; Section 2 - Executing this Program
+;
+; In this section, we'll build some infrastructure to enable us to run small
+; demos at the end of each section without having to re-write any code.
+;
+; The Entry Point
+;
+; At the beginning of this file, we defined the entry point to be `main`. That
+; means we need to define what `main` is before we can run anything. I decided
+; that `main` should do 4 things:
+;
+; 1. Initialize the UEFI interface.
+; 2. Clear the screen, so we have a blank canvas.
+; 3. Store the version string for this Forth onto the stack.
+; 4. Start the execution of Forth code.
+;
+; These first two steps are handled by function calls provided by the UEFI
+; interface.
+;
+; The version string, and its length, are defined in the `data` section at the
+; bottom of this file.
+;
+; If you remember from the previous section, we can use the macro NEXT to start
+; the execution of a primitive word. The only thing NEXT expects is for the RSI
+; register to contain the address of a primitive word.
+;
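+; If we instead point RSI at a sequence of word addresses, NEXT will start
+; executing the first word in that sequence, and the NEXT at the end of each
+; primitive will carry on to the word after it.
+;
+; We'll use the variable `program` to store the address of the word sequence
+; that `main` should start executing.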
+;
+
+main:
+; Entry point: set up the system and hand control to the Forth program.
+;
+; The version string is left on the stack as an (address, length) pair, with
+; the length on top, ready to be consumed by TYPE.
+;
+    cld                         ; LODSQ in NEXT must walk forwards through memory.
     call sys_initialize
     call sys_clear_screen
-    mov rcx, version_string
-    mov rdx, version_string.length
+    push version_string         ; String address (below the length).
+    push version_string.length  ; String length (top of stack).
+    ; `program` is a variable that holds the address of the word sequence, so
+    ; we load its contents. Loading the address of `program` itself would add
+    ; one level of indirection too many, and NEXT would jump to a codeword
+    ; cell in the data section instead of to the word's machine code.
+    mov rsi, [program]
+    NEXT
+
+;------------------------------------------------------------------------------
+;
+; Section 3 - Hello, World!
+;
+; In this section, we'll define a few more primitive words and print a message
+; to screen using them.
+;
+; Basic Output
+;
+; There are two main words for printing to the screen in Forth: EMIT and TYPE.
+; EMIT **emits** a single character to the screen, while TYPE outputs an entire
+; string to the screen. Unlike many other languages at the time, Forth does not
+; use null-terminated strings. Instead, the address and length of a string are
+; expected to travel as a pair.
+;
+; The implementations of these words are fairly simple, as I've defined the
+; `sys_print_string` function as part of the EFI abstraction and it does most
+; of the work for us.
+;
+
+defcode "EMIT", 4, 0, EMIT
+; Print a character.
+;
+; Stack effect: ( char -- )
+;
+; The character is temporarily stored in the `.char_buffer` local variable.
+; `.char_buffer` is defined as a 1-byte variable in the data section at the
+; bottom of this file.
+;
+; NOTE(review): NEXT relies on RSI still holding the thread position after the
+; call - confirm that sys_print_string (in efi.asm) preserves RSI.
+;
+    pop rax                     ; Character to print; only the low byte is used.
+    mov [.char_buffer], al      ; Turn the character into a 1-byte string.
+    mov rcx, .char_buffer       ; String address.
+    mov rdx, 1                  ; String length.
+    call sys_print_string
+    NEXT
+
+defcode "TYPE", 4, 0, TYPE
+; Print an ASCII string.
+;
+; Stack effect: ( address length -- )
+; The length is on top of the stack, so it is popped first.
+;
+; NOTE(review): NEXT relies on RSI still holding the thread position after the
+; call - confirm that sys_print_string (in efi.asm) preserves RSI.
+;
+    pop rdx                     ; String length.
+    pop rcx                     ; String address.
     call sys_print_string
-    call sys_read_char
+    NEXT
+
+section '.rodata' readable
+
+; The word sequence that is run at start-up. Each cell holds the address of a
+; word's codeword. Here the only word is TYPE, which prints the string that
+; `main` left on the stack.
+;
+; NOTE(review): nothing terminates this sequence. After TYPE finishes, its
+; NEXT fetches the cell that follows, which is the `program` variable below -
+; confirm whether a terminating word is meant to be added.
+program_s1:
+    dq TYPE
+
+; Holds the address of the word sequence to run.
+program dq program_s1
 
 section '.data' readable writable
 
+; Banner text, ending with a line feed (0xA). `version_string.length` is
+; computed at assembly time as the number of bytes emitted since the
+; `version_string` label.
 version_string db 'soup forth v0.1', 0xA
 .length = $ - version_string
 
+; One-byte scratch buffer used by EMIT. The full name is spelled out so that
+; it matches the local label `.char_buffer` referenced after `code_EMIT`.
+code_EMIT.char_buffer db ?

	forth A WIP implementation of Forth targeting UEFI x86-64.
	git clone git://git.christianermann.dev/forth
	Log \| Files \| Refs