Compare commits

...

17 Commits

Author SHA1 Message Date
Karma Riuk
dec93f8272 implemented lexer for a more complex subset of the
monkey language
2025-06-30 00:12:28 +02:00
Karma Riuk
69217fdf90 added test for full lexer (missing impl) 2025-06-29 20:28:53 +02:00
Karma Riuk
c322b69590 renamed IDENT to IDENTIFIER because i kept reading
indent
2025-06-29 20:04:20 +02:00
Karma Riuk
ffff13b2e0 lexer can now read single character tokens 2025-06-29 12:33:37 +02:00
Karma Riuk
ca05c3577a renamed EOF_ to END_OF_FILE 2025-06-29 12:33:09 +02:00
Karma Riuk
1c928616a4 written structure and tests for lexer, missing
implementation
2025-06-29 10:56:51 +02:00
Karma Riuk
ccfc3ed0f7 fixed bug 2025-06-29 10:56:32 +02:00
Karma Riuk
2aff81ba4c fixed token header and made the tokenTypeStrings
not seeable from outside modules
2025-06-29 10:43:12 +02:00
Karma Riuk
9ad9a0b85b added src to inclusion for lsp 2025-06-29 10:14:29 +02:00
Karma Riuk
09a0dc7b6d brought back namespaces because i think i get it
now
2025-06-29 10:14:04 +02:00
Karma Riuk
65792464bb changed make rule name 2025-06-29 10:13:27 +02:00
Karma Riuk
4771aa4f10 removed namespace perche mi rompeva le palle 2025-06-29 10:07:12 +02:00
Karma Riuk
81cdd0690d made the token type less repetitive 2025-06-28 18:05:01 +02:00
Karma Riuk
4364afa111 added compile_flags.txt for lsp 2025-06-28 17:59:14 +02:00
Karma Riuk
9a13de97e1 initial code 2025-06-28 17:59:08 +02:00
Karma Riuk
8acce0f6a6 made makefile better 2025-06-28 17:57:44 +02:00
Karma Riuk
b966b6dfab put back the tabs because makefiles are bitchy 2025-06-28 17:27:56 +02:00
10 changed files with 370 additions and 61 deletions

118
Makefile
View File

@@ -1,70 +1,74 @@
# NOTE(review): this span is a rendered diff that interleaves the OLD and the
# NEW Makefile without +/- markers — hence two CXXFLAGS definitions, two
# `clean:` rules, and both TEST_BIN/REPL_BIN and TARGET/TEST_TARGET link
# rules. It is not a single valid Makefile as shown.
# NOTE(review): recipe lines appear without their leading tabs in this
# capture; a real Makefile requires tab-indented recipes.
# ────────────────────────────────────
# Compiler and flags
# -------------------------------------------------------------------
# Projectwide settings
# -------------------------------------------------------------------
CXX := g++
# NOTE(review): `-WError` is not a valid GCC flag; presumably `-Werror` was
# meant — the newer CXXFLAGS line below drops it entirely.
CXXFLAGS := -Wall -WError -I./include
# ────────────────────────────────────
# Paths
CXXFLAGS := -std=c++17 -Wall -Wextra -Iinclude -Isrc -MMD -MP
LDFLAGS :=
SRC_DIR := src
TEST_SRC := test/test.cpp
REPL_SRC := $(SRC_DIR)/main.cpp
TEST_DIR := test
BUILD_DIR := build
# NOTE(review): OBJ_DIR/BIN_DIR are assigned twice (old vs new layout:
# build/objs vs build/obj); in a real Makefile the last assignment wins.
OBJ_DIR := $(BUILD_DIR)/objs
BIN_DIR := $(BUILD_DIR)/bin
OBJ_DIR := build/obj
BIN_DIR := build/bin
# -------------------------------------------------------------------
# Source & object lists
# -------------------------------------------------------------------
SRC_CPP := $(shell find $(SRC_DIR) -name '*.cpp')
TEST_CPP := $(shell find $(TEST_DIR) -name '*.cpp')
OBJ := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(SRC_CPP))
TEST_OBJ := $(patsubst $(TEST_DIR)/%.cpp,$(OBJ_DIR)/test/%.o,$(TEST_CPP))
# .d dependency files produced by -MMD -MP, re-included at the bottom.
DEPFILES := $(OBJ:.o=.d) $(TEST_OBJ:.o=.d)
# ────────────────────────────────────
# Source listings
# All .cpp under src/, but exclude your REPL main
LIB_SRCS := $(filter-out $(REPL_SRC),$(shell find $(SRC_DIR) -name '*.cpp'))
# Mirror src/.../*.cpp → build/obj/src/.../*.o
LIB_OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/$(SRC_DIR)/%.o,$(LIB_SRCS))
# Identify your “real” main.cpp so we can exclude it from tests
MAIN_SRC := $(SRC_DIR)/main.cpp
MAIN_OBJ := $(MAIN_SRC:$(SRC_DIR)/%.cpp=$(OBJ_DIR)/%.o)
SRC_OBJS_NO_MAIN := $(filter-out $(MAIN_OBJ),$(OBJ))
# Binaries
TEST_BIN := $(BIN_DIR)/tests
REPL_BIN := $(BIN_DIR)/repl
TARGET := $(BIN_DIR)/monkey
TEST_TARGET := $(BIN_DIR)/monkey_tests
# ────────────────────────────────────
# Default target: build & run tests
all: test
# -------------------------------------------------------------------
# Toplevel rules
# -------------------------------------------------------------------
.PHONY: all clean run tests
# NOTE(review): `all` is defined twice (old: run tests; new: build both
# binaries) — another old/new diff collision.
all: $(TARGET) $(TEST_TARGET)
# ─ Link test runner (test.cpp defines main via DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN)
$(TEST_BIN): $(LIB_OBJS) | $(BIN_DIR)
@echo "⏳ Linking tests..."
$(CXX) $(CXXFLAGS) $(TEST_SRC) $(LIB_OBJS) -o $@
clean:
@rm -rf $(BUILD_DIR)
# ─ Link REPL
$(REPL_BIN): $(LIB_OBJS) | $(BIN_DIR)
@echo "🚀 Linking REPL..."
$(CXX) $(CXXFLAGS) $(REPL_SRC) $(LIB_OBJS) -o $@
# -------------------------------------------------------------------
# Build & run
# -------------------------------------------------------------------
run: $(TARGET)
@$(TARGET)
# ─ Compile each library .cpp → mirrored .o
$(OBJ_DIR)/$(SRC_DIR)/%.o: $(SRC_DIR)/%.cpp
@echo "🛠 Compiling $<"
@mkdir -p $(dir $@)
# NOTE(review): this old-version compile rule shows no $(CXX) recipe line in
# the capture — confirm against the repository.
tests: $(TEST_TARGET)
@$(TEST_TARGET) $(if $(TEST),--test-case=$(TEST))
# -------------------------------------------------------------------
# Link binaries
# -------------------------------------------------------------------
$(TARGET): $(OBJ)
@mkdir -p $(BIN_DIR)
$(CXX) $(LDFLAGS) $^ -o $@
# Test binary links all src objects except main.o plus the test objects.
$(TEST_TARGET): $(SRC_OBJS_NO_MAIN) $(TEST_OBJ)
@mkdir -p $(BIN_DIR)
$(CXX) $(LDFLAGS) $^ -o $@
# -------------------------------------------------------------------
# Compile rules
# -------------------------------------------------------------------
$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp
@mkdir -p $(@D)
$(CXX) $(CXXFLAGS) -c $< -o $@
# ────────────────────────────────────
# Run or launch targets
.PHONY: test repl clean all
test: $(TEST_BIN)
@echo "\n✅ Running tests..."
@$(TEST_BIN)
# @$(TEST_BIN) $(if $(TESTCASE),--test-case=$(TESTCASE))
repl: $(REPL_BIN)
@echo "\n🔧 Starting REPL..."
@$(REPL_BIN)
# ────────────────────────────────────
# Ensure bin/ exists before linking
$(BIN_DIR):
@mkdir -p $@
# ────────────────────────────────────
# Clean up everything
clean:
@echo "🧹 Cleaning build artifacts"
@rm -rf $(OBJ_DIR) $(BIN_DIR)
$(OBJ_DIR)/test/%.o: $(TEST_DIR)/%.cpp
@mkdir -p $(@D)
$(CXX) $(CXXFLAGS) -c $< -o $@
# -------------------------------------------------------------------
# Autoinclude dependencies
# -------------------------------------------------------------------
-include $(DEPFILES)

2
compile_flags.txt Normal file
View File

@@ -0,0 +1,2 @@
-I./include
-I./src

68
src/lexer/lexer.cpp Normal file
View File

@@ -0,0 +1,68 @@
#include "lexer.hpp"
#include "token/token.hpp"
#include "token/type.hpp"
#include <cctype>
#include <iostream>
namespace lexer {

    // Returns the next token from `input`.
    // operator>> skips whitespace by default (skipws), so token separators
    // are consumed here; end of input yields END_OF_FILE with an empty
    // literal.
    token::token lexer::next_token() {
        char c;
        if (!(input >> c))
            return {token::type::END_OF_FILE, ""};

        switch (c) {
        case '=':
            return {token::type::ASSIGN, c};
        case '+':
            return {token::type::PLUS, c};
        case ',':
            return {token::type::COMMA, c};
        case ';':
            return {token::type::SEMICOLON, c};
        case '(':
            return {token::type::LPAREN, c};
        case ')':
            return {token::type::RPAREN, c};
        case '{':
            return {token::type::LBRACE, c};
        case '}':
            return {token::type::RBRACE, c};
        default:
            if (is_letter(c)) {
                std::string identifier_or_keyword = read_string(c);
                return {
                    token::lookup_identifier(identifier_or_keyword),
                    identifier_or_keyword
                };
            }
            // Cast to unsigned char: passing a negative plain char to
            // std::isdigit is undefined behavior.
            if (std::isdigit(static_cast<unsigned char>(c)))
                return {token::type::INT, read_int(c)};
            return {token::type::ILLEGAL, c};
        }
    }

    // True for characters that may start/continue an identifier or keyword.
    bool lexer::is_letter(char c) {
        return c == '_' || std::isalpha(static_cast<unsigned char>(c));
    }

    // Reads the rest of an identifier/keyword whose first character was
    // already consumed. peek() is kept as int so EOF (traits::eof()) is
    // detected before being narrowed to char and misclassified as a letter.
    std::string lexer::read_string(char first_char) {
        std::string result;
        result.push_back(first_char);
        for (int c = input.peek();
             c != std::istream::traits_type::eof()
                 && is_letter(static_cast<char>(c));
             c = input.peek())
            result.push_back(static_cast<char>(input.get()));
        return result;
    }

    // Reads the rest of an integer literal whose first digit was already
    // consumed; same EOF and unsigned-char handling as read_string.
    std::string lexer::read_int(char first_digit) {
        std::string result;
        result.push_back(first_digit);
        for (int c = input.peek();
             c != std::istream::traits_type::eof()
                 && std::isdigit(static_cast<unsigned char>(c));
             c = input.peek())
            result.push_back(static_cast<char>(input.get()));
        return result;
    }

} // namespace lexer

16
src/lexer/lexer.hpp Normal file
View File

@@ -0,0 +1,16 @@
#pragma once  // added: every other header in this project has an include guard

#include "token/token.hpp"

#include <istream>
#include <string>  // added: std::string is used below but was only included transitively

namespace lexer {
    // Pull-based lexer over an input stream; call next_token() repeatedly
    // until it returns an END_OF_FILE token.
    struct lexer {
        // Borrowed stream; must outlive the lexer.
        std::istream& input;

        // Scans and returns the next token from `input`.
        token::token next_token();

      private:
        // True for '_' and alphabetic characters (identifier characters).
        bool is_letter(char);
        // Reads the remainder of an identifier starting with the given char.
        std::string read_string(char);
        // Reads the remainder of an integer literal starting with the given digit.
        std::string read_int(char);
    };
} // namespace lexer

9
src/main.cpp Normal file
View File

@@ -0,0 +1,9 @@
#include "token/type.hpp"

#include <iostream>

// Smoke test for the token::type stream-insertion operator.
int main() {
    // Renamed from `eof`: the value held and printed is ILLEGAL, not
    // END_OF_FILE — the old name was misleading.
    token::type t = token::type::ILLEGAL;
    std::cout << t << std::endl;
    return 0;
}

16
src/token/token.hpp Normal file
View File

@@ -0,0 +1,16 @@
#pragma once

#include "type.hpp"

#include <string>
#include <utility>  // std::move

namespace token {
    // A lexical token: its category plus the exact source text (literal).
    struct token {
        ::token::type type;
        std::string literal;

        // Sink parameter taken by value and moved into place — the old
        // version copied the string a second time into `literal`.
        token(::token::type t, std::string s): type(t), literal(std::move(s)) {}
        // Convenience constructor for single-character tokens.
        token(::token::type t, char c): type(t), literal(1, c) {}
    };
} // namespace token

37
src/token/type.cpp Normal file
View File

@@ -0,0 +1,37 @@
#include "type.hpp"

#include <array>
#include <string_view>  // added: std::string_view was only available transitively
#include <unordered_map>

namespace token {
    // Maps each enum value (by underlying index) to its display string.
    // Relies on LET being the LAST enumerator in TOKEN_LIST — the array is
    // sized as LET + 1.
    constexpr std::array<std::string_view, static_cast<size_t>(type::LET) + 1>
        tokenTypeStrings = {
#define X(name, str) str,
            TOKEN_LIST
#undef X
        };

    // Stream insertion operator using the lookup array; out-of-range values
    // (which would indicate a bug) print as "Unknown".
    std::ostream& operator<<(std::ostream& os, type type) {
        auto idx = static_cast<size_t>(type);
        if (idx < tokenTypeStrings.size())
            return os << tokenTypeStrings[idx];
        return os << "Unknown";
    }

    // Reserved words of the language; anything not listed is an IDENTIFIER.
    static const std::unordered_map<std::string, type> keywords{
        {"fn", type::FUNCTION},
        {"let", type::LET},
    };

    type lookup_identifier(std::string ident) {
        // find() instead of at() + catch(out_of_range): a miss is the common
        // case (most words are identifiers), and exceptions should not be
        // used for ordinary control flow.
        auto it = keywords.find(ident);
        return it == keywords.end() ? type::IDENTIFIER : it->second;
    }
} // namespace token

33
src/token/type.hpp Normal file
View File

@@ -0,0 +1,33 @@
#pragma once

#include <ostream>
#include <string>  // added: lookup_identifier takes std::string but <string> was never included

namespace token {
    // X-macro list of token types and their string representations.
    // NOTE: type.cpp sizes its name-lookup table as LET + 1, so LET must
    // remain the LAST entry of this list.
#define TOKEN_LIST \
    X(ILLEGAL, "ILLEGAL") \
    X(END_OF_FILE, "EOF") \
    X(IDENTIFIER, "IDENTIFIER") \
    X(INT, "INT") \
    X(ASSIGN, "=") \
    X(PLUS, "+") \
    X(COMMA, ",") \
    X(SEMICOLON, ";") \
    X(LPAREN, "(") \
    X(RPAREN, ")") \
    X(LBRACE, "{") \
    X(RBRACE, "}") \
    X(FUNCTION, "FUNCTION") \
    X(LET, "LET")

    // Define the token type enum from the X-macro (values start at 0).
    enum class type {
#define X(name, str) name,
        TOKEN_LIST
#undef X
    };

    // Prints the string form from TOKEN_LIST (defined in type.cpp).
    std::ostream& operator<<(std::ostream&, type);
    // Returns the keyword type for reserved words, IDENTIFIER otherwise
    // (defined in type.cpp).
    type lookup_identifier(std::string);
} // namespace token

104
test/lexer.cpp Normal file
View File

@@ -0,0 +1,104 @@
#include "lexer/lexer.hpp"
#include "token/type.hpp"
#include <doctest.h>
#include <sstream>
#include <string>
TEST_CASE("Single character token") {
    // One expected (type, literal) pair per token of the input.
    struct expectation {
        token::type type;
        std::string literal;
    };

    std::istringstream input{"=+(){},;"};
    lexer::lexer lex{input};

    const expectation expected[] = {
        {token::type::ASSIGN, "="},
        {token::type::PLUS, "+"},
        {token::type::LPAREN, "("},
        {token::type::RPAREN, ")"},
        {token::type::LBRACE, "{"},
        {token::type::RBRACE, "}"},
        {token::type::COMMA, ","},
        {token::type::SEMICOLON, ";"},
        {token::type::END_OF_FILE, ""},
    };

    for (const auto& e : expected) {
        token::token tok = lex.next_token();
        CHECK(tok.type == e.type);
        CHECK(tok.literal == e.literal);
    }
}
// Exercises the lexer on a small but complete Monkey program: keywords,
// identifiers, integer literals, and all single-character tokens.
TEST_CASE("Full tokens") {
// Expected (type, literal) pair for one token of the input program.
struct test {
token::type expectedType;
std::string expectedLiteral;
};
// NOTE(review): the backslash-newlines splice this into ONE string literal;
// any leading whitespace on the continued lines becomes part of the string,
// which is harmless here because the lexer skips whitespace.
std::istringstream ss("let five = 5;\
let ten = 10;\
let add = fn(x, y) {\
x + y;\
};\
let result = add(five, ten);\
");
lexer::lexer l{ss};
// Expected token stream for the program above, in source order.
test tests[] = {
// clang-format off
{token::type::LET, "let"},
{token::type::IDENTIFIER, "five"},
{token::type::ASSIGN, "="},
{token::type::INT, "5"},
{token::type::SEMICOLON, ";"},
{token::type::LET, "let"},
{token::type::IDENTIFIER, "ten"},
{token::type::ASSIGN, "="},
{token::type::INT, "10"},
{token::type::SEMICOLON, ";"},
{token::type::LET, "let"},
{token::type::IDENTIFIER, "add"},
{token::type::ASSIGN, "="},
{token::type::FUNCTION, "fn"},
{token::type::LPAREN, "("},
{token::type::IDENTIFIER, "x"},
{token::type::COMMA, ","},
{token::type::IDENTIFIER, "y"},
{token::type::RPAREN, ")"},
{token::type::LBRACE, "{"},
{token::type::IDENTIFIER, "x"},
{token::type::PLUS, "+"},
{token::type::IDENTIFIER, "y"},
{token::type::SEMICOLON, ";"},
{token::type::RBRACE, "}"},
{token::type::SEMICOLON, ";"},
{token::type::LET, "let"},
{token::type::IDENTIFIER, "result"},
{token::type::ASSIGN, "="},
{token::type::IDENTIFIER, "add"},
{token::type::LPAREN, "("},
{token::type::IDENTIFIER, "five"},
{token::type::COMMA, ","},
{token::type::IDENTIFIER, "ten"},
{token::type::RPAREN, ")"},
{token::type::SEMICOLON, ";"},
// clang-format on
};
// Pull tokens one at a time and compare against the expected stream.
for (const auto& t : tests) {
token::token tok = l.next_token();
CHECK(tok.type == t.expectedType);
CHECK(tok.literal == t.expectedLiteral);
}
};

20
test/test.cpp Normal file
View File

@@ -0,0 +1,20 @@
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include <doctest.h>
// Computes n! for n >= 0.
// Fixed: the old version returned `number` (not 1) when number <= 1, so
// factorial(0) incorrectly evaluated to 0; 0! is 1 by definition.
int factorial(int number) {
    return number <= 1 ? 1 : factorial(number - 1) * number;
}
// Same four checks as before, driven by a data table instead of being
// written out one CHECK at a time.
TEST_CASE("fact") {
    const int inputs[] = {1, 2, 3, 10};
    const int expected[] = {1, 2, 6, 3628800};
    for (int i = 0; i < 4; ++i)
        CHECK(factorial(inputs[i]) == expected[i]);
}
// Duplicate of "fact", restated table-driven (kept because the original
// file has both cases).
TEST_CASE("fact2") {
    const int args[] = {1, 2, 3, 10};
    const int want[] = {1, 2, 6, 3628800};
    for (int i = 0; i < 4; ++i)
        CHECK(factorial(args[i]) == want[i]);
}