added tokenizer

Sven Vogel 2022-09-21 23:32:09 +02:00
parent 31858f5013
commit 696bca6f4e
6 changed files with 148 additions and 0 deletions

.gitignore (new file, +1)
@@ -0,0 +1 @@
/target

Cargo.lock (generated, new file, +49)
@@ -0,0 +1,49 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "Yard"
version = "0.1.0"
dependencies = [
 "lazy_static",
 "regex",
]

[[package]]
name = "aho-corasick"
version = "0.7.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4f55bd91a0978cbfd91c457a164bab8b4001c833b7f323132c0a4e1922dd44e"
dependencies = [
 "memchr",
]

[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"

[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"

[[package]]
name = "regex"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b"
dependencies = [
 "aho-corasick",
 "memchr",
 "regex-syntax",
]

[[package]]
name = "regex-syntax"
version = "0.6.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244"

Cargo.toml (new file, +10)
@@ -0,0 +1,10 @@
[package]
name = "Yard"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
regex = "*"
lazy_static = "1.4.0"

src/main.rs (new file, +17)
@@ -0,0 +1,17 @@
mod token;
mod parser;

use token::*;

fn main() {
    let source =
        r"
        main() {
            3 * 5 # comment
        }
        ";

    tokenize(source).iter().for_each(|t| print!("{:?}", t));
}
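Running this against the sample source should print, roughly, the following token stream (each token is Debug-printed by print! with no separator; the "# comment" match is dropped by the tokenizer):

Word("main")Delemiter('(')Delemiter(')')Delemiter('{')Number("3")OperatorNumber("5")Delemiter('}')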

src/parser/mod.rs (new file, +25)
@@ -0,0 +1,25 @@
pub struct Function<'a> {
    /// name
    name: &'a str,
    /// parameter names
    params: Vec<&'a str>,
    /// whether this function returns a single value or not
    ret: bool,
}

/// reorder and organize a listing of instructions into an RPN-based format:
/// any program is made out of functions.
/// A function has a name followed by an optional parameter list, followed by an optional equal sign and block.
/// ```python
/// foo(a) = {
///     # function
/// }
/// ```
pub fn parse<'a>(tokens: &Vec<crate::Token<'a>>) -> Vec<Function<'a>> {
    let mut functions = vec![];

    functions
}
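parse is still a stub in this commit. Below is a minimal, self-contained sketch (not part of the commit) of how a first pass over the token stream could collect function headers of the form name(params); the simplified local Token and Function types and the whole parse body here are assumptions for illustration only:

// Sketch only (not part of this commit): a possible first pass for parse,
// using simplified local copies of Token and Function so the example compiles on its own.
enum Token<'a> {
    Word(&'a str),
    Delemiter(char),
}

struct Function<'a> {
    name: &'a str,
    params: Vec<&'a str>,
}

fn parse<'a>(tokens: &[Token<'a>]) -> Vec<Function<'a>> {
    let mut functions = Vec::new();
    let mut iter = tokens.iter().peekable();

    while let Some(tok) = iter.next() {
        // a function header is a word immediately followed by '('
        if let Token::Word(name) = tok {
            if matches!(iter.peek(), Some(Token::Delemiter('('))) {
                iter.next(); // consume '('
                let mut params = Vec::new();

                // collect parameter names until the closing ')'
                while let Some(tok) = iter.next() {
                    match tok {
                        Token::Word(param) => params.push(*param),
                        Token::Delemiter(')') => break,
                        _ => (),
                    }
                }

                functions.push(Function { name: *name, params });
            }
        }
    }

    functions
}

fn main() {
    let tokens = [
        Token::Word("foo"),
        Token::Delemiter('('),
        Token::Word("a"),
        Token::Delemiter(')'),
    ];
    let functions = parse(&tokens);
    assert_eq!(functions[0].name, "foo");
    assert_eq!(functions[0].params, vec!["a"]);
}

With that input the asserts should pass, i.e. foo(a) is recognized as one function with a single parameter.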

src/token/mod.rs (new file, +46)
@@ -0,0 +1,46 @@
#[derive(Debug)]
/// A token represents a basic building block for source code.
/// Tokens give meaning to patterns of chars, allowing them to be interpreted.
pub enum Token<'a> {
    Word(&'a str),
    Delemiter(char),
    Operator,
    Number(&'a str),
}

const TOKEN_REGEX_SRC: &'static str = r"(#.*)|([A-Za-z_]+)|(\d*\.?\d+)|([+\-*])|([(){}])";

lazy_static::lazy_static! {
    static ref TOKEN_REGEX: regex::Regex = regex::Regex::new(TOKEN_REGEX_SRC).unwrap();
}

/// creates a vector of tokens from the specified str.
pub fn tokenize<'a>(source: &'a str) -> Vec<Token<'a>> {
    let mut tokens = vec![];

    for cap in TOKEN_REGEX.captures_iter(source) {
        for (i, group) in cap.iter().enumerate() {
            // skip group 0 (the entire match) and group 1 (comments)
            if i <= 1 {
                continue;
            }

            // if we have a match, save it as token
            if let Some(mat) = group {
                tokens.push(match i {
                    2 => Token::Word(mat.as_str()),
                    3 => Token::Number(mat.as_str()),
                    4 => Token::Operator,
                    5 => Token::Delemiter(mat.as_str().chars().nth(0).unwrap()),
                    _ => panic!("Unknown match to tokenize: {}", mat.as_str()),
                });
            }
        }
    }

    tokens
}
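A hypothetical unit test (not part of this commit) illustrating the expected behaviour of tokenize, assuming the module layout above; the test module, its name and the assertions are made up for illustration:

// Hypothetical test: comments are dropped, while words, numbers,
// operators and delimiters each become a token.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizes_simple_expression() {
        let tokens = tokenize("foo(2) # trailing comment");

        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0], Token::Word("foo")));
        assert!(matches!(tokens[1], Token::Delemiter('(')));
        assert!(matches!(tokens[2], Token::Number("2")));
        assert!(matches!(tokens[3], Token::Delemiter(')')));
    }
}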