diff --git a/src/main.rs b/src/main.rs index c2e7935..07e40c0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,15 +3,22 @@ mod token; mod parser; use token::*; +use parser::*; fn main() { - + let source = r" -main() { +foo = 5 * 6 + 4 + +foo() = { + c +} + +main()(x) { 3 * 5 # comment } "; - tokenize(source).iter().for_each(|t| print!("{:?}", t)); + parse(&mut tokenize(source)); } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 6e5c2ae..5c9abd2 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,25 +1,124 @@ +use std::collections::{VecDeque, HashSet}; +use crate::token::Token; + +#[derive(Eq, Hash)] pub struct Function<'a> { - /// name - name: &'a str, /// parameter names - params: Vec<&'a str>, - /// wether this function returns a single value or not - ret: bool, + pub params: Option<Vec<&'a str>>, + /// raw tokens + pub raw: Option<VecDeque<Token<'a>>> +} + +impl<'a> Function<'a> { + pub fn new() -> Self { + Self { + params: None, + raw: None + } + } +} + +impl<'a> PartialEq for Function<'a> { + fn eq(&self, other: &Self) -> bool { + self.params == other.params + } +} + +/// simple brace-counting parser to detect abstract token syntaxes +fn discover_functions<'a>(tokens: &mut VecDeque<Token<'a>>) -> HashSet<Function<'a>> { + let mut funcs = HashSet::new(); + + let mut name = None; + let mut cur_fun = Function::new(); + + let mut assigned = false; + let mut brace_cnt = 0; + let mut parent_cnt = 0; + + while let Some(top) = tokens.pop_front() { + + match &top { + crate::Token::Operator(op) => { + match op { + crate::Operator::Assign => if cur_fun.raw.is_none() { + assigned = true; + cur_fun.raw = Some(VecDeque::new()); + continue; + }, + _ => () + } + } + crate::Token::LineBreak => if name.is_some() && cur_fun.raw.is_some() && assigned { + funcs.insert(cur_fun); + cur_fun = Function::new(); + continue; + } + crate::Token::Delemiter(char) => { + match char { + + '{' => { + brace_cnt += 1; + if brace_cnt == 1 { + // start a new body + cur_fun.raw = Some(VecDeque::new()); + assigned = false; + continue;
+ } + }, + '}' => { + brace_cnt -= 1; + + // we have a full body! + if brace_cnt == 0 { + funcs.insert(cur_fun); + cur_fun = Function::new(); + continue; + } + }, + + '(' => if cur_fun.raw.is_none() { + parent_cnt += 1; + if parent_cnt == 1 { + // start a new arg list + cur_fun.params = Some(Vec::new()); + continue; + } + }, + ')' => if cur_fun.raw.is_none() { + parent_cnt -= 1; + + // we have a full parameter list! + if parent_cnt == 0 { + funcs.insert(cur_fun); + cur_fun = Function::new(); + continue; + } + }, + _ => () + } + } + _ => (), + } + + if let Some(body) = &mut cur_fun.raw { + body.push_back(top); + } else if let Some(args) = &mut cur_fun.params { + match &top { + Token::Word(text) => args.push(text), + _ => panic!("Argument in list is not a word") + } + } else { + name = Some(top); + } + } + + funcs } /// reorder and organize a listing of instructions to a RPN based format: /// any program is made out of functions. /// A function has a name followed by an optional parameter list, followed by an optional equal sign and block. -/// ```python -/// foo(a) = { -/// # function -/// } -/// ``` -pub fn parse<'a>(tokens: &Vec<Token<'a>>) -> Vec<Function<'a>> { - let mut functions = vec![]; - +pub fn parse<'a>(tokens: &mut VecDeque<Token<'a>>) { - - functions } \ No newline at end of file diff --git a/src/token/mod.rs b/src/token/mod.rs index b0ea9c9..b3890f3 100644 --- a/src/token/mod.rs +++ b/src/token/mod.rs @@ -1,23 +1,51 @@ +use std::collections::{VecDeque}; -#[derive(Debug)] +#[derive(Debug, Hash, PartialEq, Eq)] +pub enum Operator { + Assign, + + Add, + Sub, + Mul, + Div +} + +impl Operator { + pub fn parse<'a>(str: &'a str) -> Self { + return match str { + "=" => Operator::Assign, + + "+" => Operator::Add, + "-" => Operator::Sub, + "*" => Operator::Mul, + "/" => Operator::Div, + + _ => panic!("Unspecified operator") + }; + } +} + +#[derive(Debug, Hash, PartialEq, Eq)] /// A token represents a basic building block for source code. /// They give a meaning to patterns of chars allowing to interpret them.
pub enum Token<'a> { + // base tokens that can simply be split to from raw source code Word(&'a str), Delemiter(char), - Operator, + Operator(Operator), Number(&'a str), + LineBreak } -const TOKEN_REGEX_SRC: &'static str = r"(#.*)|([A-Za-z_]+)|(\d*\.?\d+)|([+\-*])|([(){}])"; +const TOKEN_REGEX_SRC: &'static str = r"(#.*)|([A-Za-z_]+)|(\d*\.?\d+)|([+\-*=])|([(){}])|(\n)"; lazy_static::lazy_static! { static ref TOKEN_REGEX: regex::Regex = regex::Regex::new(TOKEN_REGEX_SRC).unwrap(); } /// creates a vector of tokens from the specified str. -pub fn tokenize<'a>(source: &'a str) -> Vec<Token<'a>> { - let mut tokens = vec![]; +pub fn tokenize<'a>(source: &'a str) -> VecDeque<Token<'a>> { + let mut tokens = VecDeque::new(); for cap in TOKEN_REGEX.captures_iter(source) { for (i, group) in cap.iter().enumerate() { @@ -30,11 +58,12 @@ pub fn tokenize<'a>(source: &'a str) -> Vec<Token<'a>> { // if we have a match, save it as token if let Some(mat) = group { - tokens.push(match i { + tokens.push_back(match i { 2 => Token::Word(mat.as_str()), 3 => Token::Number(mat.as_str()), - 4 => Token::Operator, + 4 => Token::Operator(Operator::parse(mat.as_str())), 5 => Token::Delemiter(mat.as_str().chars().nth(0).unwrap()), + 6 => Token::LineBreak, _ => panic!("Unknown match to tokenize: {}", mat.as_str()) });