From 696bca6f4e92e645b0c77a94d9544280fd2561e1 Mon Sep 17 00:00:00 2001
From: servostar
Date: Wed, 21 Sep 2022 23:32:09 +0200
Subject: [PATCH] added tokenizer

---
 .gitignore        |  1 +
 Cargo.lock        | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 Cargo.toml        | 10 ++++++++++
 src/main.rs       | 17 +++++++++++++++++
 src/parser/mod.rs | 27 +++++++++++++++++++++++++++
 src/token/mod.rs  | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 150 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 src/main.rs
 create mode 100644 src/parser/mod.rs
 create mode 100644 src/token/mod.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..ad20d91
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,49 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "Yard"
+version = "0.1.0"
+dependencies = [
+ "lazy_static",
+ "regex",
+]
+
+[[package]]
+name = "aho-corasick"
+version = "0.7.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4f55bd91a0978cbfd91c457a164bab8b4001c833b7f323132c0a4e1922dd44e"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "memchr"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+
+[[package]]
+name = "regex"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.6.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..e989419
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "Yard"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+regex = "*"
+lazy_static = "1.4.0"
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..c2e7935
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,17 @@
+
+mod token;
+mod parser;
+
+use token::*;
+
+fn main() {
+
+    let source =
+r"
+main() {
+    3 * 5 # comment
+}
+";
+
+    tokenize(source).iter().for_each(|t| print!("{:?}", t));
+}
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
new file mode 100644
index 0000000..6e5c2ae
--- /dev/null
+++ b/src/parser/mod.rs
@@ -0,0 +1,27 @@
+
+use crate::token::Token;
+
+pub struct Function<'a> {
+    /// name
+    name: &'a str,
+    /// parameter names
+    params: Vec<&'a str>,
+    /// whether this function returns a single value or not
+    ret: bool,
+}
+
+/// reorders and organizes a listing of instructions into an RPN-based format:
+/// any program is made up of functions.
+/// A function has a name followed by an optional parameter list, followed by an optional equal sign and a block.
+/// ```python
+/// foo(a) = {
+///     # function
+/// }
+/// ```
+pub fn parse<'a>(tokens: &Vec<Token<'a>>) -> Vec<Function<'a>> {
+    let mut functions = vec![];
+
+
+
+    functions
+}
\ No newline at end of file
diff --git a/src/token/mod.rs b/src/token/mod.rs
new file mode 100644
index 0000000..b0ea9c9
--- /dev/null
+++ b/src/token/mod.rs
@@ -0,0 +1,46 @@
+
+#[derive(Debug)]
+/// A token represents a basic building block for source code.
+/// Tokens give meaning to patterns of characters, allowing them to be interpreted.
+pub enum Token<'a> {
+    Word(&'a str),
+    Delemiter(char),
+    Operator,
+    Number(&'a str),
+}
+
+const TOKEN_REGEX_SRC: &'static str = r"(#.*)|([A-Za-z_]+)|(\d*\.?\d+)|([+\-*])|([(){}])";
+
+lazy_static::lazy_static! {
+    static ref TOKEN_REGEX: regex::Regex = regex::Regex::new(TOKEN_REGEX_SRC).unwrap();
+}
+
+/// creates a vector of tokens from the specified str.
+pub fn tokenize<'a>(source: &'a str) -> Vec<Token<'a>> {
+    let mut tokens = vec![];
+
+    for cap in TOKEN_REGEX.captures_iter(source) {
+        for (i, group) in cap.iter().enumerate() {
+
+            // skip group 0, which is the entire match,
+            // and group 1, which holds comments
+            if i <= 1 {
+                continue;
+            }
+
+            // if we have a match, save it as a token
+            if let Some(mat) = group {
+                tokens.push(match i {
+                    2 => Token::Word(mat.as_str()),
+                    3 => Token::Number(mat.as_str()),
+                    4 => Token::Operator,
+                    5 => Token::Delemiter(mat.as_str().chars().nth(0).unwrap()),
+
+                    _ => panic!("Unknown match to tokenize: {}", mat.as_str())
+                });
+            }
+        }
+    }
+
+    return tokens;
+}
\ No newline at end of file
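
For reference, a minimal sketch of how the capture-group dispatch in tokenize() behaves on the sample input from main.rs, written as a unit test that could be appended to src/token/mod.rs. The test module and its assertions are illustrative only and not part of this patch; the expected token count follows from the five alternatives in TOKEN_REGEX_SRC, with the comment consumed by group 1 and skipped.

```rust
// Illustrative test sketch (not part of the patch) for src/token/mod.rs.
// Group indices mirror TOKEN_REGEX_SRC:
// 1 = comments (skipped), 2 = words, 3 = numbers, 4 = operators, 5 = delimiters.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizes_sample_source() {
        // Same shape as the sample in main.rs; "# comment" matches group 1
        // and therefore produces no token.
        let tokens = tokenize("main() {\n    3 * 5 # comment\n}\n");

        // Expected sequence: Word("main"), Delemiter('('), Delemiter(')'),
        // Delemiter('{'), Number("3"), Operator, Number("5"), Delemiter('}')
        assert_eq!(tokens.len(), 8);
        assert!(matches!(tokens[0], Token::Word("main")));
        assert!(matches!(tokens[5], Token::Operator));
        assert!(matches!(tokens[7], Token::Delemiter('}')));
    }
}
```

Running the patch as-is with `cargo run` should print the same eight tokens via the Debug impl, since main.rs feeds an equivalent source string through tokenize().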