added shunting yard expr parser

This commit is contained in:
Sven Vogel 2022-10-03 14:11:49 +02:00
parent d1d3fe73df
commit 404916048a
5 changed files with 465 additions and 106 deletions

13
.vscode/tasks.json vendored Normal file
View File

@ -0,0 +1,13 @@
{
"version": "2.0.0",
"tasks": [
{
"type": "cargo",
"command": "run",
"problemMatcher": [
"$rustc"
],
"label": "rust: cargo run"
}
]
}

View File

@ -9,14 +9,12 @@ fn main() {
let source =
r"
foo = 5 * 6 + 4
foo(c) = 3 * c
foo() = {
c
}
main() {
x = foo(5 * 6)
main()(x) {
3 * 5 # comment
4 * 3
}
";

106
src/parser/data.rs Normal file
View File

@ -0,0 +1,106 @@
use std::collections::{VecDeque};
use crate::token::{Token};
/// A function of the parsed program.
///
/// All optional fields start out as `None` (see `Func::new`) and are filled in
/// later; `expr` holds the structured form of the body once it has been built.
#[derive(Eq, Debug)]
pub struct Func<'a> {
/// name of this function, `None` as long as no name has been assigned
pub name: Option<&'a str>,
/// parameter names, `None` when no parameter list has been recorded
pub args: Option<Vec<&'a str>>,
/// raw tokens
pub raw: Option<VecDeque<Token<'a>>>,
/// if the function returns a single value (printed as ` =` by the `Display` impl)
pub results: bool,
/// parsed content
pub expr: Option<Expr<'a>>,
}
impl<'a> Func<'a> {
pub fn new() -> Self {
Self {
args: None,
raw: None,
name: None,
results: false,
expr: None,
}
}
}
impl<'a> PartialEq for Func<'a> {
    /// Two functions are equal when they share the same name and the same
    /// parameter list. The raw tokens, the `results` flag and the parsed body
    /// are deliberately ignored, so equality identifies a function by its
    /// declaration only.
    fn eq(&self, other: &Self) -> bool {
        // compare against `other`: comparing `self.name` with itself is always true
        // and made every pair of functions with equal parameter lists look identical
        self.name == other.name && self.args == other.args
    }
}
impl<'a> std::fmt::Display for Func<'a> {
    /// Prints this function's declaration in the form of `foo(x, y) = {}`.
    ///
    /// - the parameter list is only printed when one is present
    /// - ` =` is only printed when `results` is set
    /// - a function without a name is printed as `<anonymous>` instead of
    ///   panicking, since formatting is typically used while reporting errors
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.name.unwrap_or("<anonymous>"))?;

        // format the arguments as a comma separated list
        if let Some(args) = &self.args {
            write!(f, "({})", args.join(", "))?;
        }

        if self.results {
            f.write_str(" =")?;
        }

        f.write_str(" {}")
    }
}
/// ordered sequence of expressions; may contain further blocks through `Expr::Block`
pub type Block<'a> = VecDeque<Expr<'a>>;
/// node of a function body: either a nested block or a single term of tokens
#[derive(Debug)]
pub enum Expr<'a> {
/// group of more expressions
Block(Block<'a>),
/// single term
Term(VecDeque<Token<'a>>)
}
/// Name tables used to classify a word as function, argument or variable.
pub struct Scope<'a> {
    /// names of the known functions
    pub funcs: Vec<&'a str>,
    /// parameter names to look words up in, if any
    pub args: Option<&'a Vec<&'a str>>,
    /// stack of scoped block variables
    pub vars: Vec<Vec<&'a str>>,
}

impl<'a> Scope<'a> {
    /// Pushes a fresh, empty variable frame onto the stack.
    pub fn alloc_scope(&mut self) {
        let frame: Vec<&'a str> = Vec::new();
        self.vars.push(frame);
    }

    /// Removes the topmost variable frame; does nothing when the stack is empty.
    pub fn pop_scope(&mut self) {
        let _ = self.vars.pop();
    }

    /// Returns `true` when `name` is one of the known function names.
    pub fn is_func(&self, name: &'a str) -> bool {
        self.funcs.iter().any(|known| *known == name)
    }

    /// Returns `true` when a parameter list is set and it contains `name`.
    pub fn is_arg(&self, name: &'a str) -> bool {
        match self.args {
            Some(params) => params.iter().any(|param| *param == name),
            None => false,
        }
    }

    /// Returns `true` when any frame of the variable stack contains `name`.
    pub fn is_var(&self, name: &'a str) -> bool {
        self.vars.iter().flatten().any(|var| *var == name)
    }
}

View File

@ -1,124 +1,338 @@
use std::collections::{VecDeque, HashSet};
use core::panic;
use std::{collections::{VecDeque}, vec};
use crate::token::{Token, Operator, Assoc};
use crate::token::Token;
mod data;
#[derive(Eq, Hash)]
pub struct Function<'a> {
/// parameter names
pub params: Option<Vec<&'a str>>,
/// raw tokens
pub raw: Option<VecDeque<Token<'a>>>
}
use data::*;
impl<'a> Function<'a> {
pub fn new() -> Self {
Self {
params: None,
raw: None
}
/// simple brace-counting parser to detect functions
fn discover_functions<'a>(tokens: &mut VecDeque<crate::Token<'a>>) -> Vec<Func<'a>> {
let mut funcs = Vec::new();
// function currently being identified
let mut func = Func::new();
// count open brackets
let mut brace_cnt = 0;
let mut paren_cnt = 0;
let mut single_line = false;
macro_rules! finish_func {
() => {
if funcs.contains(&func) {
panic!("Function already defined: {func}")
}
funcs.push(func);
func = Func::new();
single_line = false;
};
}
}
impl<'a> PartialEq for Function<'a> {
fn eq(&self, other: &Self) -> bool {
self.params == other.params
}
}
/// simple brace-counting parser to detect abstract token syntaxes
fn discover_functions<'a>(tokens: &mut VecDeque<crate::Token<'a>>) -> HashSet<Function<'a>> {
let mut funcs = HashSet::new();
let mut name = None;
let mut cur_fun = Function::new();
let mut assigned = false;
let mut brace_cnt = 0;
let mut parent_cnt = 0;
while let Some(top) = tokens.pop_front() {
// function body detection
// has highest priority
match &top {
crate::Token::Operator(op) => {
match op {
crate::Operator::Assign => if cur_fun.raw.is_none() {
assigned = true;
cur_fun.raw = Some(VecDeque::new());
Token::Delemiter(char) => match char {
'{' => {
brace_cnt += 1;
if brace_cnt == 1 {
if func.name.is_none() {
panic!("Anonymous function not permitted");
}
single_line = false;
func.raw = Some(VecDeque::new());
continue;
},
_ => ()
}
},
'}' => {
brace_cnt -= 1;
if brace_cnt == 0 {
finish_func!();
continue;
}
}
_ => ()
}
crate::Token::LineBreak => if name.is_some() && cur_fun.raw.is_some() && assigned {
funcs.insert(cur_fun);
cur_fun = Function::new();
Token::LineBreak => if single_line {
finish_func!();
continue;
}
crate::Token::Delemiter(char) => {
match char {
'{' => {
brace_cnt += 1;
if brace_cnt == 1 {
// start a new body
cur_fun.raw = Some(VecDeque::new());
assigned = false;
continue;
_ => if single_line && func.raw.is_none() {
func.raw = Some(VecDeque::new());
}
}
if func.raw.is_none() {
match &top {
Token::Operator(op) => match op {
Operator::Assign => {
if func.results {
panic!("double function assignment not permitted")
}
},
'}' => {
brace_cnt -= 1;
// we have a full body!
if brace_cnt == 0 {
funcs.insert(cur_fun);
cur_fun = Function::new();
continue;
if func.name.is_none() {
panic!("Anonymous function not permitted");
}
},
'(' => if cur_fun.raw.is_none() {
parent_cnt += 1;
if parent_cnt == 1 {
// start a new arg list
cur_fun.params = Some(Vec::new());
continue;
}
},
')' => if cur_body.is_none() {
parent_cnt -= 1;
// we have a full body!
if parent_cnt == 0 {
funcs.insert(cur_fun);
cur_fun = Function::new();
continue;
}
},
func.results = true;
single_line = true;
continue;
}
_ => ()
}
Token::Assign(name) => {
if func.results {
panic!("double function assignment not permitted")
}
if func.name.is_some() {
panic!("function already named");
}
func.raw = Some(VecDeque::new());
func.name = Some(name);
func.results = true;
single_line = true;
continue;
}
Token::Delemiter(char) => match char {
'(' => if func.raw.is_none() {
paren_cnt += 1;
if paren_cnt == 1 {
if func.args.is_some() {
panic!("double parameter list not permitted");
}
func.args = Some(Vec::new());
continue;
}
},
')' => {
paren_cnt -= 1;
if paren_cnt == 0 {
continue;
}
}
_ => ()
}
Token::Word(text) => {
if func.name.is_some() {
if func.args.is_none() {
panic!("Function name already set: {text}")
}
}
else {
func.name = Some(text);
continue;
}
}
_ => ()
}
_ => (),
}
if let Some(body) = &mut cur_body {
if let Some(body) = &mut func.raw {
body.push_back(top);
} else if let Some(args) = &mut cur_args {
continue;
}
else if let Some(args) = &mut func.args {
if paren_cnt == 0 {
panic!("Token is not in parameter list: {:?}", top)
}
match &top {
Token::Word(text) => args.push(text),
_ => panic!("Argument in list is not a word")
_ => panic!("Argument is not a word {:?}", &top)
}
} else {
body.push_back(top)
continue;
}
// if we have anything left it might be an error
match &top {
Token::LineBreak => (), // valid whitespace
_ => panic!("Invalid token: {:?}", top)
}
}
funcs
}
/// parse the functions raw content to expr for easy compilation using a brace-counter.
/// - ```{...}``` surround a block
/// - line breaks seperate expressions
fn discover_exprs<'a>(functions: &mut Vec<Func<'a>>) {
for func in functions.iter_mut() {
let mut blocks = vec![Block::new()];
let mut expr = VecDeque::new();
while let Some(top) = func.raw.as_mut().unwrap().pop_front() {
match &top {
Token::LineBreak => if !expr.is_empty() {
blocks.last_mut().expect("Curly brace missmatch").push_back(Expr::Term(expr));
expr = VecDeque::new();
continue;
}
Token::Delemiter(char) => match char {
'{' => {
blocks.last_mut().expect("Curly brace missmatch").push_back(Expr::Term(expr));
expr = VecDeque::new();
blocks.push(Block::new());
continue;
},
'}' => {
// pop topmost block of the stack, storing it in the next lower block
if let Some(block) = blocks.pop() {
blocks.last_mut().expect("Curly brace missmatch").push_back(Expr::Block(block));
} else {
panic!("Curly brace missmatch")
}
continue;
},
_ => ()
},
_ => ()
}
expr.push_back(top)
}
if !expr.is_empty() {
blocks.last_mut().expect("Curly brace missmatch").push_back(Expr::Term(expr));
}
func.expr = Some(Expr::Block(blocks.pop().expect("Curly brace missmmatch")));
}
}
/// parse a single term using a modified shunting yard
///
/// The tokens of `term` are consumed and replaced by the same term in reverse
/// polish notation. Words are resolved through `scope` into `Token::Func`,
/// `Token::Arg` or `Token::Var`; a function is emitted once the parenthesis
/// following it has been closed.
///
/// # Panics
/// - on a word that is neither function, argument nor variable
/// - on mismatched parentheses
/// - on any delimiter other than `(` and `)`
fn parse_term<'a>(term: &mut VecDeque<Token<'a>>, scope: &mut Scope) {
    let mut op_stack = vec![];
    let mut output = VecDeque::new();

    'outer: while let Some(token) = term.pop_front() {
        match &token {
            Token::Word(text) => {
                // copy the `&'a str` out of the borrowed token, so the new token
                // does not refer to the loop-local `token`
                let word = *text;
                if scope.is_func(word) {
                    op_stack.push(Token::Func(word));
                } else if scope.is_arg(word) {
                    output.push_back(Token::Arg(word));
                } else if scope.is_var(word) {
                    output.push_back(Token::Var(word));
                } else {
                    panic!("Unknwon word: {word}")
                }
            }
            Token::Number(_) => output.push_back(token),
            Token::Assign(_) => op_stack.push(token),
            Token::Delemiter(char) => match char {
                '(' => op_stack.push(token),
                ')' => {
                    // unwind the stack down to the matching opening parenthesis
                    while let Some(top) = op_stack.pop() {
                        match top {
                            Token::Delemiter('(') => {
                                // a function right below the parenthesis is the call being closed
                                if let Some(Token::Func(_)) = op_stack.last() {
                                    output.push_back(op_stack.pop().unwrap());
                                }
                                continue 'outer;
                            }
                            // no other delimiter is ever pushed onto the stack
                            Token::Delemiter(_) => (),
                            other => output.push_back(other),
                        }
                    }
                    panic!("Mismatched right parenthesis")
                }
                _ => panic!("Misplaced character: '{char}'"),
            },
            Token::Operator(op) => {
                let prec0 = op.prec();
                // pop every operator that binds tighter than the incoming one
                // (or equally tight, if the incoming one is left associative)
                while let Some(Token::Operator(op1)) = op_stack.last() {
                    let prec1 = op1.prec();
                    let binds_tighter =
                        prec1 > prec0 || (prec1 == prec0 && op.assoc() == Assoc::Left);
                    if !binds_tighter {
                        // without leaving the loop here the same stack top would be
                        // inspected forever
                        break;
                    }
                    output.push_back(op_stack.pop().unwrap());
                }
                op_stack.push(token);
            }
            // every other token is dropped from the term
            _ => (),
        }
    }

    // whatever is left on the stack is appended in stack order
    while let Some(token) = op_stack.pop() {
        match token {
            Token::Delemiter('(') => panic!("Mismatched parenthesis"),
            Token::Delemiter(_) => (),
            other => output.push_back(other),
        }
    }

    term.append(&mut output);
}
/// Walks a block in order: nested blocks are handled recursively, terms are
/// handed to `parse_term`.
fn parse_block(block: &mut Block, scope: &mut Scope) {
    block.iter_mut().for_each(|node| match node {
        Expr::Block(inner) => parse_block(inner, scope),
        Expr::Term(tokens) => parse_term(tokens, scope),
    });
}
/// Parses the body of every function.
///
/// The scope knows the names of all functions up front; its argument list is
/// replaced by the parameters of the function currently being parsed.
///
/// Panics when a function has no name, no body, or a body that is not a block.
fn parse_exprs<'a>(funcs: &mut Vec<Func<'a>>) {
    let known: Vec<&'a str> = funcs.iter().map(|f| f.name.unwrap()).collect();
    let mut scope = Scope {
        funcs: known,
        args: None,
        vars: Vec::new(),
    };

    for func in funcs.iter_mut() {
        let body = func.expr.as_mut().expect("Function has no body");
        if let Expr::Block(block) = body {
            scope.args = func.args.as_ref();
            parse_block(block, &mut scope);
        } else {
            panic!("Fatal-Compilier-Error: function must have a block")
        }
    }
}
/// Reorders and organizes a listing of instructions into an RPN based format.
///
/// Any program is made out of functions. A function has a name followed by an
/// optional parameter list, followed by an optional equal sign and block.
///
/// The functions are discovered, their bodies are split into expressions and
/// parsed, and every function is finally printed in its debug representation.
pub fn parse<'a>(tokens: &mut VecDeque<crate::Token<'a>>) {
    let mut funcs = discover_functions(tokens);

    discover_exprs(&mut funcs);
    parse_exprs(&mut funcs);

    for func in &funcs {
        println!("{:?}", func);
    }
}

View File

@ -2,27 +2,50 @@ use std::collections::{VecDeque};
#[derive(Debug, Hash, PartialEq, Eq)]
pub enum Operator {
Assign,
Add,
Sub,
Mul,
Div
Div,
Assign
}
/// associativity of an operator, as reported by `Operator::assoc`
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Assoc {
/// operators of equal precedence group from the right
Right,
/// operators of equal precedence group from the left
Left
}
impl Operator {
pub fn parse<'a>(str: &'a str) -> Self {
return match str {
"=" => Operator::Assign,
"+" => Operator::Add,
"-" => Operator::Sub,
"*" => Operator::Mul,
"/" => Operator::Div,
"=" => Operator::Assign,
_ => panic!("Unspecified operator")
};
}
/// Precedence of this operator, a higher value binds tighter:
/// `*` and `/` yield 4, `+` and `-` yield 3, everything else 0.
pub fn prec(&self) -> usize {
    match self {
        Operator::Add | Operator::Sub => 3,
        Operator::Mul | Operator::Div => 4,
        _ => 0,
    }
}
pub fn assoc(&self) -> Assoc {
match self {
_ => Assoc::Right
}
}
}
#[derive(Debug, Hash, PartialEq, Eq)]
@ -34,10 +57,14 @@ pub enum Token<'a> {
Delemiter(char),
Operator(Operator),
Number(&'a str),
LineBreak
LineBreak,
Func(&'a str),
Var(&'a str),
Arg(&'a str),
Assign(&'a str),
}
const TOKEN_REGEX_SRC: &'static str = r"(#.*)|([A-Za-z_]+)|(\d*\.?\d+)|([+\-*=])|([(){}])|(\n)";
const TOKEN_REGEX_SRC: &'static str = r"(#.*)|([A-Za-z_]+)\s*=|([A-Za-z_]+)|(\d*\.?\d+)|([+\-*/=])|([(){}])|(\n)";
lazy_static::lazy_static! {
static ref TOKEN_REGEX: regex::Regex = regex::Regex::new(TOKEN_REGEX_SRC).unwrap();
@ -59,11 +86,12 @@ pub fn tokenize<'a>(source: &'a str) -> VecDeque<Token<'a>> {
// if we have a match, save it as token
if let Some(mat) = group {
tokens.push_back(match i {
2 => Token::Word(mat.as_str()),
3 => Token::Number(mat.as_str()),
4 => Token::Operator(Operator::parse(mat.as_str())),
5 => Token::Delemiter(mat.as_str().chars().nth(0).unwrap()),
6 => Token::LineBreak,
2 => Token::Assign(mat.as_str()),
3 => Token::Word(mat.as_str()),
4 => Token::Number(mat.as_str()),
5 => Token::Operator(Operator::parse(mat.as_str())),
6 => Token::Delemiter(mat.as_str().chars().nth(0).unwrap()),
7 => Token::LineBreak,
_ => panic!("Unknown match to tokenize: {}", mat.as_str())
});