From 404916048acd1b9456d85886d4bc3fb8822dc8f5 Mon Sep 17 00:00:00 2001 From: servostar Date: Mon, 3 Oct 2022 14:11:49 +0200 Subject: [PATCH] added shunting yard expr parser --- .vscode/tasks.json | 13 ++ src/main.rs | 10 +- src/parser/data.rs | 106 ++++++++++++ src/parser/mod.rs | 390 +++++++++++++++++++++++++++++++++++---------- src/token/mod.rs | 52 ++++-- 5 files changed, 465 insertions(+), 106 deletions(-) create mode 100644 .vscode/tasks.json create mode 100644 src/parser/data.rs diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..ffef723 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,13 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "cargo", + "command": "run", + "problemMatcher": [ + "$rustc" + ], + "label": "rust: cargo run" + } + ] +} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 07e40c0..b679e21 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,14 +9,12 @@ fn main() { let source = r" -foo = 5 * 6 + 4 +foo(c) = 3 * c -foo() = { - c -} +main() { + x = foo(5 * 6) -main()(x) { - 3 * 5 # comment + 4 * 3 } "; diff --git a/src/parser/data.rs b/src/parser/data.rs new file mode 100644 index 0000000..a57352c --- /dev/null +++ b/src/parser/data.rs @@ -0,0 +1,106 @@ +use std::collections::{VecDeque}; +use crate::token::{Token}; + +#[derive(Eq, Debug)] +pub struct Func<'a> { + /// name of this function + pub name: Option<&'a str>, + /// parameter names + pub args: Option>, + /// raw tokens + pub raw: Option>>, + /// if the function returns a single value + pub results: bool, + /// parsed content + pub expr: Option>, +} + +impl<'a> Func<'a> { + pub fn new() -> Self { + Self { + args: None, + raw: None, + name: None, + results: false, + expr: None, + } + } +} + +impl<'a> PartialEq for Func<'a> { + fn eq(&self, other: &Self) -> bool { + self.args == other.args && self.name == self.name + } +} + +impl<'a> std::fmt::Display for Func<'a> { + /// print this functions declaration in the form of ```foo(x,y) = {}``` + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!("{}", &self.name.unwrap()))?; + + // format the arguments + if let Some(args) = &self.args { + f.write_str("(")?; + for (x, arg) in args.iter().enumerate() { + if x == 0 { + f.write_fmt(format_args!("{}", arg))?; + continue; + } + + f.write_fmt(format_args!(", {}", arg))?; + } + f.write_str(")")?; + } + + if self.results { + f.write_str(" =")?; + } + + f.write_str(" {}") + } +} + +pub type Block<'a> = VecDeque>; + +#[derive(Debug)] +pub enum Expr<'a> { + /// group of more expressions + Block(Block<'a>), + /// single term + Term(VecDeque>) +} + +pub struct Scope<'a> { + pub funcs: Vec<&'a str>, + pub args: Option<&'a Vec<&'a str>>, + /// stack of scoped block variables + pub vars: Vec>, +} + +impl<'a> Scope<'a> { + pub fn alloc_scope(&mut self) { + self.vars.push(Vec::new()) + } + + pub fn pop_scope(&mut self) { + self.vars.pop(); + } + + pub fn is_func(&self, name: &'a str) -> bool { + self.funcs.contains(&name) + } + pub fn is_arg(&self, name: &'a str) -> bool { + if let Some(args) = self.args { + return args.contains(&name); + } + false + } + pub fn is_var(&self, name: &'a str) -> bool { + for vars in self.vars.iter() { + if vars.contains(&name) { + return true; + } + } + false + } +} \ No newline at end of file diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 5c9abd2..bdc938f 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,124 +1,338 @@ -use std::collections::{VecDeque, HashSet}; +use core::panic; +use std::{collections::{VecDeque}, vec}; +use crate::token::{Token, Operator, Assoc}; -use crate::token::Token; +mod data; -#[derive(Eq, Hash)] -pub struct Function<'a> { - /// parameter names - pub params: Option>, - /// raw tokens - pub raw: Option>> -} +use data::*; -impl<'a> Function<'a> { - pub fn new() -> Self { - Self { - params: None, - raw: None - } +/// simple brace-counting parser to detect functions +fn discover_functions<'a>(tokens: &mut VecDeque>) -> Vec> { + let mut funcs = Vec::new(); + + // function to currently identifiy + let mut func = Func::new(); + + // count open brackets + let mut brace_cnt = 0; + let mut paren_cnt = 0; + + let mut single_line = false; + + macro_rules! finish_func { + () => { + if funcs.contains(&func) { + panic!("Function already defined: {func}") + } + + funcs.push(func); + func = Func::new(); + single_line = false; + }; } -} - -impl<'a> PartialEq for Function<'a> { - fn eq(&self, other: &Self) -> bool { - self.params == other.params - } -} - -/// simple brace-counting parser to detect abstract token syntaxes -fn discover_functions<'a>(tokens: &mut VecDeque>) -> HashSet> { - let mut funcs = HashSet::new(); - - let mut name = None; - let mut cur_fun = Function::new(); - - let mut assigned = false; - let mut brace_cnt = 0; - let mut parent_cnt = 0; while let Some(top) = tokens.pop_front() { - + + // function body detection + // has highest priority match &top { - crate::Token::Operator(op) => { - match op { - crate::Operator::Assign => if cur_fun.raw.is_none() { - assigned = true; - cur_fun.raw = Some(VecDeque::new()); + Token::Delemiter(char) => match char { + '{' => { + brace_cnt += 1; + if brace_cnt == 1 { + if func.name.is_none() { + panic!("Anonymous function not permitted"); + } + single_line = false; + func.raw = Some(VecDeque::new()); continue; - }, - _ => () + } + }, + '}' => { + brace_cnt -= 1; + if brace_cnt == 0 { + finish_func!(); + continue; + } } + _ => () } - crate::Token::LineBreak => if name.is_some() && cur_fun.raw.is_some() && assigned { - funcs.insert(cur_fun); - cur_fun = Function::new(); + + Token::LineBreak => if single_line { + finish_func!(); continue; } - crate::Token::Delemiter(char) => { - match char { - '{' => { - brace_cnt += 1; - if brace_cnt == 1 { - // start a new body - cur_fun.raw = Some(VecDeque::new()); - assigned = false; - continue; + _ => if single_line && func.raw.is_none() { + func.raw = Some(VecDeque::new()); + } + } + + if func.raw.is_none() { + match &top { + Token::Operator(op) => match op { + Operator::Assign => { + if func.results { + panic!("double function assignment not permitted") } - }, - '}' => { - brace_cnt -= 1; - - // we have a full body! - if brace_cnt == 0 { - funcs.insert(cur_fun); - cur_fun = Function::new(); - continue; + if func.name.is_none() { + panic!("Anonymous function not permitted"); } - }, - '(' => if cur_fun.raw.is_none() { - parent_cnt += 1; - if parent_cnt == 1 { - // start a new arg list - cur_fun.params = Some(Vec::new()); - continue; - } - }, - ')' => if cur_body.is_none() { - parent_cnt -= 1; - - // we have a full body! - if parent_cnt == 0 { - funcs.insert(cur_fun); - cur_fun = Function::new(); - continue; - } - }, + func.results = true; + single_line = true; + continue; + } _ => () } + + Token::Assign(name) => { + if func.results { + panic!("double function assignment not permitted") + } + if func.name.is_some() { + panic!("function already named"); + } + + func.raw = Some(VecDeque::new()); + func.name = Some(name); + func.results = true; + single_line = true; + continue; + } + + Token::Delemiter(char) => match char { + + '(' => if func.raw.is_none() { + paren_cnt += 1; + if paren_cnt == 1 { + + if func.args.is_some() { + panic!("double parameter list not permitted"); + } + + func.args = Some(Vec::new()); + continue; + } + }, + ')' => { + paren_cnt -= 1; + if paren_cnt == 0 { + continue; + } + } + _ => () + } + + Token::Word(text) => { + + if func.name.is_some() { + if func.args.is_none() { + panic!("Function name already set: {text}") + } + } + else { + func.name = Some(text); + continue; + } + } + _ => () } - _ => (), } - if let Some(body) = &mut cur_body { + if let Some(body) = &mut func.raw { body.push_back(top); - } else if let Some(args) = &mut cur_args { + continue; + } + else if let Some(args) = &mut func.args { + + if paren_cnt == 0 { + panic!("Token is not in parameter list: {:?}", top) + } + match &top { Token::Word(text) => args.push(text), - _ => panic!("Argument in list is not a word") + _ => panic!("Argument is not a word {:?}", &top) } - } else { - body.push_back(top) + continue; + } + + // if we have anything left it might be an error + match &top { + Token::LineBreak => (), // valid whitespace + _ => panic!("Invalid token: {:?}", top) } } funcs } +/// parse the functions raw content to expr for easy compilation using a brace-counter. +/// - ```{...}``` surround a block +/// - line breaks seperate expressions +fn discover_exprs<'a>(functions: &mut Vec>) { + for func in functions.iter_mut() { + + let mut blocks = vec![Block::new()]; + + let mut expr = VecDeque::new(); + + while let Some(top) = func.raw.as_mut().unwrap().pop_front() { + + match &top { + Token::LineBreak => if !expr.is_empty() { + blocks.last_mut().expect("Curly brace missmatch").push_back(Expr::Term(expr)); + expr = VecDeque::new(); + continue; + } + Token::Delemiter(char) => match char { + '{' => { + blocks.last_mut().expect("Curly brace missmatch").push_back(Expr::Term(expr)); + expr = VecDeque::new(); + blocks.push(Block::new()); + continue; + }, + '}' => { + // pop topmost block of the stack, storing it in the next lower block + if let Some(block) = blocks.pop() { + blocks.last_mut().expect("Curly brace missmatch").push_back(Expr::Block(block)); + } else { + panic!("Curly brace missmatch") + } + continue; + }, + _ => () + }, + _ => () + } + + expr.push_back(top) + } + + if !expr.is_empty() { + blocks.last_mut().expect("Curly brace missmatch").push_back(Expr::Term(expr)); + } + + func.expr = Some(Expr::Block(blocks.pop().expect("Curly brace missmmatch"))); + } +} + +/// parse a single term using a modified shunting yard +fn parse_term<'a>(term: &mut VecDeque>, scope: &mut Scope) { + let mut op_stack = vec![]; + let mut output = VecDeque::new(); + + 'outer: + while let Some(token) = term.pop_front() { + match &token { + Token::Word(text) => { + if scope.is_func(text) { + op_stack.push(Token::Func(text)); + continue; + } else if scope.is_arg(text) { + output.push_back(Token::Arg(text)); + continue; + } else if scope.is_var(text) { + output.push_back(Token::Var(text)); + continue; + } + panic!("Unknwon word: {text}") + } + Token::Number(_) => output.push_back(token), + Token::Assign(_) => op_stack.push(token), + + Token::Delemiter(char) => { + match char { + '(' => op_stack.push(token), + ')' => { + while let Some(token) = op_stack.pop() { + match &token { + Token::Delemiter(char) => if *char == '(' { + if let Some(next) = op_stack.last() { + match &next { + Token::Func(_) => output.push_back(op_stack.pop().unwrap()), + _ => () + } + } + continue 'outer; + }, + _ => output.push_back(token) + } + } + panic!("Mismatched right parenthesis") + }, + _ => panic!("Misplaced character: '{char}'") + } + } + + Token::Operator(op) => { + let prec0 = op.prec(); + while let Some(top) = op_stack.last(){ + match &top { + Token::Operator(op1) => { + let prec1 = op1.prec(); + + if prec1 > prec0 || prec0 == prec1 && op.assoc() == Assoc::Left { + output.push_back(op_stack.pop().unwrap()) + } + }, + _ => break + } + } + op_stack.push(token); + } + _ => () + } + } + + while let Some(token) = op_stack.pop() { + match &token { + Token::Delemiter(char) => if *char == '(' { + panic!("Mismatched parenthesis") + }, + _ => output.push_back(token) + } + } + + term.append(&mut output); +} + +fn parse_block(block: &mut Block, scope: &mut Scope) { + for expr in block.iter_mut() { + match expr { + Expr::Block(block) => parse_block(block, scope), + Expr::Term(term) => parse_term(term, scope) + } + } +} + +fn parse_exprs<'a>(funcs: &mut Vec>) { + let mut scope = Scope { + funcs: funcs.iter().map(|f| f.name.unwrap()).collect(), + args: None, + vars: vec![] + }; + + for func in funcs.iter_mut() { + match func.expr.as_mut().expect("Function has no body") { + Expr::Block(block) => { + scope.args = func.args.as_ref(); + + parse_block(block, &mut scope) + }, + _ => panic!("Fatal-Compilier-Error: function must have a block") + } + } +} + /// reorder and organize a listing of instructions to a RPN based format: /// any program is made out of functions. /// A function has a name followed by an optional parameter list, followed by an optional equal sign and block. pub fn parse<'a>(tokens: &mut VecDeque>) { - + let mut funcs = discover_functions(tokens); + + discover_exprs(&mut funcs); + parse_exprs(&mut funcs); + + funcs.iter().for_each(|f| println!("{:?}", f)); } \ No newline at end of file diff --git a/src/token/mod.rs b/src/token/mod.rs index b3890f3..1ddb627 100644 --- a/src/token/mod.rs +++ b/src/token/mod.rs @@ -2,27 +2,50 @@ use std::collections::{VecDeque}; #[derive(Debug, Hash, PartialEq, Eq)] pub enum Operator { - Assign, - Add, Sub, Mul, - Div + Div, + + Assign +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum Assoc { + Right, + Left } impl Operator { pub fn parse<'a>(str: &'a str) -> Self { return match str { - "=" => Operator::Assign, - "+" => Operator::Add, "-" => Operator::Sub, "*" => Operator::Mul, "/" => Operator::Div, + "=" => Operator::Assign, _ => panic!("Unspecified operator") }; } + + pub fn prec(&self) -> usize { + return match self { + Operator::Add => 3, + Operator::Sub => 3, + + Operator::Mul => 4, + Operator::Div => 4, + + _ => 0 + } + } + + pub fn assoc(&self) -> Assoc { + match self { + _ => Assoc::Right + } + } } #[derive(Debug, Hash, PartialEq, Eq)] @@ -34,10 +57,14 @@ pub enum Token<'a> { Delemiter(char), Operator(Operator), Number(&'a str), - LineBreak + LineBreak, + Func(&'a str), + Var(&'a str), + Arg(&'a str), + Assign(&'a str), } -const TOKEN_REGEX_SRC: &'static str = r"(#.*)|([A-Za-z_]+)|(\d*\.?\d+)|([+\-*=])|([(){}])|(\n)"; +const TOKEN_REGEX_SRC: &'static str = r"(#.*)|([A-Za-z_]+)\s*=|([A-Za-z_]+)|(\d*\.?\d+)|([+\-*/=])|([(){}])|(\n)"; lazy_static::lazy_static! { static ref TOKEN_REGEX: regex::Regex = regex::Regex::new(TOKEN_REGEX_SRC).unwrap(); @@ -59,11 +86,12 @@ pub fn tokenize<'a>(source: &'a str) -> VecDeque> { // if we have a match, save it as token if let Some(mat) = group { tokens.push_back(match i { - 2 => Token::Word(mat.as_str()), - 3 => Token::Number(mat.as_str()), - 4 => Token::Operator(Operator::parse(mat.as_str())), - 5 => Token::Delemiter(mat.as_str().chars().nth(0).unwrap()), - 6 => Token::LineBreak, + 2 => Token::Assign(mat.as_str()), + 3 => Token::Word(mat.as_str()), + 4 => Token::Number(mat.as_str()), + 5 => Token::Operator(Operator::parse(mat.as_str())), + 6 => Token::Delemiter(mat.as_str().chars().nth(0).unwrap()), + 7 => Token::LineBreak, _ => panic!("Unknown match to tokenize: {}", mat.as_str()) });