commit cadefc09f8a6fdad3dafa63dfc4e4c9c2956651c Author: neon - rustystriker's laptop Date: Sat Dec 30 14:01:19 2023 +0200 first commit diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..935d796 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "cla" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..ebb8ad8 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "cla" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] \ No newline at end of file diff --git a/cla b/cla new file mode 100755 index 0000000..4cf0a34 Binary files /dev/null and b/cla differ diff --git a/example.ou b/example.ou new file mode 100644 index 0000000..636aeb2 --- /dev/null +++ b/example.ou @@ -0,0 +1,20 @@ +int hello; +if(a == 1.56) { + b = 5 + 20.1; + static_cast(1.2); + a >= b + 1. + $ +} + +break case default else float if input int output switch while +(){},:;= +==!=<><=>= ++- +|| && ! 
static_cast static_cast +someID123 +123 123.32 +/* plus some +multi line comments */ + +static_cast diff --git a/example.tok b/example.tok new file mode 100644 index 0000000..c97fee5 --- /dev/null +++ b/example.tok @@ -0,0 +1,67 @@ +Token: Int, Lexeme: int +Token: Id, Lexeme: hello +Token: SemiColon, Lexeme: ; +Token: If, Lexeme: if +Token: BracketOpen, Lexeme: ( +Token: Id, Lexeme: a +Token: RelOp(Equal), Lexeme: == +Token: Num, Lexeme: 1.56 +Token: BracketClose, Lexeme: ) +Token: CBracketOpen, Lexeme: { +Token: Id, Lexeme: b +Token: Equal, Lexeme: = +Token: Num, Lexeme: 5 +Token: AddOp(Add), Lexeme: + +Token: Num, Lexeme: 20.1 +Token: SemiColon, Lexeme: ; +Token: Cast(Int), Lexeme: static_cast +Token: BracketOpen, Lexeme: ( +Token: Num, Lexeme: 1.2 +Token: BracketClose, Lexeme: ) +Token: SemiColon, Lexeme: ; +Token: Id, Lexeme: a +Token: RelOp(GreaterEq), Lexeme: >= +Token: Id, Lexeme: b +Token: Num, Lexeme: 1 +Token: CBracketClose, Lexeme: } +Token: Break, Lexeme: break +Token: Case, Lexeme: case +Token: Default, Lexeme: default +Token: Else, Lexeme: else +Token: Float, Lexeme: float +Token: If, Lexeme: if +Token: Input, Lexeme: input +Token: Int, Lexeme: int +Token: Output, Lexeme: output +Token: Switch, Lexeme: switch +Token: While, Lexeme: while +Token: BracketOpen, Lexeme: ( +Token: BracketClose, Lexeme: ) +Token: CBracketOpen, Lexeme: { +Token: CBracketClose, Lexeme: } +Token: Comma, Lexeme: , +Token: Colon, Lexeme: : +Token: SemiColon, Lexeme: ; +Token: Equal, Lexeme: = +Token: RelOp(Equal), Lexeme: == +Token: RelOp(NotEqual), Lexeme: != +Token: RelOp(Less), Lexeme: < +Token: RelOp(Greater), Lexeme: > +Token: RelOp(LessEq), Lexeme: <= +Token: RelOp(GreaterEq), Lexeme: >= +Token: AddOp(Add), Lexeme: + +Token: AddOp(Subtract), Lexeme: - +Token: Or, Lexeme: || +Token: And, Lexeme: && +Token: Not, Lexeme: ! 
// NOTE(review): SOURCE is a collapsed git-diff dump of this crate. This block is
// the reconstructed `src/lib.rs` + `src/main.rs` as one compilable unit. The
// extraction that produced the dump stripped angle brackets, so the generic
// arguments (`VecDeque<char>`, `collect::<String>()`) are restored here.

use std::collections::VecDeque;
use std::{path::Path, io::{BufReader, Write}};

/// A lexical token of the `.ou` language.
#[derive(Debug, PartialEq)]
pub enum Token {
    // keywords
    Break, Case, Default, Else, Float, If, Input, Int, Output, Switch, While,
    // symbols
    BracketOpen, BracketClose, CBracketOpen, CBracketClose, Comma, Colon, SemiColon, Equal,
    // Operators
    RelOp(RelOpType), AddOp(AddOpType), MulOp(MulOpType), Or, And, Not, Cast(CastType),
    // Additional
    Id, Num,
    EOF, // Represents the end of file, or a read error (both are handled the same currently)
}
/// Relational operator kinds: `==`, `!=`, `<`, `<=`, `>`, `>=`.
#[derive(Debug, PartialEq)]
pub enum RelOpType {
    Equal, NotEqual, Less, LessEq, Greater, GreaterEq
}
/// Additive operator kinds: `+`, `-`.
#[derive(Debug, PartialEq)]
pub enum AddOpType {
    Add, Subtract
}
/// Multiplicative operator kinds: `*`, `/`.
#[derive(Debug, PartialEq)]
pub enum MulOpType {
    Multiply, Divide
}
/// Target type of a `static_cast<...>` token.
#[derive(Debug, PartialEq)]
pub enum CastType {
    Int, Float
}

/// A hand-written DFA lexer that pulls characters one at a time from any
/// `std::io::Read` source and yields `(Token, lexeme)` pairs.
pub struct Lexer<'a> {
    /// Current DFA state (states are plain numbers; 0 is the start state)
    state: u32,
    /// Last accepting state reached while scanning the current lexeme
    last_accepting: u32,
    /// currently accepted lexeme
    lexeme: String,
    /// chars read but not treated yet (push-back buffer, re-read first)
    last_char: VecDeque<char>,
    /// chars read past the last accepting state but not yet accepted
    last_char_read: VecDeque<char>,
    /// reader (usually a file, can be a tcp socket as well)
    reader: &'a mut dyn std::io::Read,
    /// line counter for error messages (1-based)
    line_num: usize,
}
impl<'a> Lexer<'a> {
    /// Creates a new Lexer instance over `reader`.
    pub fn new(reader: &mut dyn std::io::Read) -> Lexer {
        Lexer {
            state: 0,
            last_accepting: 0,
            lexeme: String::new(),
            last_char: VecDeque::with_capacity(2),
            last_char_read: VecDeque::with_capacity(2),
            reader,
            line_num: 1, // line numbers start with 1, its not an array
        }
    }

    /// Reads from the given reader until a token is matched.
    ///
    /// Returns `(Token::EOF, "")` on end of input or a read error. Characters
    /// that cannot start any token are reported on stderr and skipped.
    pub fn get_next_token(&mut self) -> (Token, String) {
        // reset the states before searching for the next token
        self.last_accepting = 0;
        self.state = 0;
        let mut comment = false;
        loop {
            // query next char and proceed to next step
            let c = self.next_char();
            self.state = self.next_state(c);
            if self.state == 0 {
                // the DFA got stuck (reset); try to tokenize the accepted lexeme
                if self.last_accepting != 0 {
                    // push the "read but not accepted" buffer back so it is re-read
                    // later, preserving order: last_char_read items first, then c
                    self.last_char.push_front(c);
                    while let Some(lc) = self.last_char_read.pop_back() {
                        self.last_char.push_front(lc);
                    }
                    return self.accept_token()
                }
                // no accepted lexeme; if we are not in a comment and it is not a
                // whitespace, write an error
                else if !c.is_whitespace() && !comment {
                    eprintln!("Unexpected character at line {}: {}", self.line_num, self.last_char_read.front().unwrap_or(&' '));
                }
                // make sure we are no longer in a comment, as a comment ends by
                // resetting to state 0
                comment = false;
            }
            else if c == '\0' {
                // Exit when we get a read error/end of file
                return (Token::EOF, String::new())
            }
            else if self.state == u32::MAX {
                // error state returned from state 0 to signal an unexpected
                // character: print the error and reset
                eprintln!("Unexpected character at line {}: {}", self.line_num, c);
                self.state = 0;
                self.lexeme = String::new();
            }
            else if self.is_accepting() {
                // at an accepting state: remember it, and commit everything in
                // the last_char_read buffer plus the current char to the lexeme
                self.last_accepting = self.state;
                while let Some(lc) = self.last_char_read.pop_front() {
                    self.lexeme.push(lc);
                }
                self.lexeme.push(c);
            }
            else if self.state >= 400 && self.state <= 402 {
                // comment states: swallow chars without buffering them
                comment = true;
            }
            else {
                // not accepting: buffer the char, since we dont know yet whether
                // it will be part of the next lexeme
                self.last_char_read.push_back(c);
            }
            if c == '\n' {
                self.line_num += 1;
            }
        }
    }

    /// Returns the next state. 0 means the DFA got stuck and needs to restart;
    /// u32::MAX flags an invalid character seen from state 0.
    fn next_state(&self, c: char) -> u32 {
        // the DFA state machine, state 0 has a lot of edges, but most states are
        // straight forward, though there are many due to the keywords
        match self.state {
            0 => match c {
                'b' => 1,
                'c' => 6,
                'd' => 10,
                'e' => 17,
                'f' => 21,
                'i' => 26,
                'o' => 33,
                's' => 39,
                'w' => 45,
                '>' => 100,
                '<' => 101,
                '=' => 102,
                '!' => 104,
                '(' => 105,
                ')' => 106,
                '{' => 107,
                '}' => 108,
                ',' => 109,
                ':' => 110,
                ';' => 111,
                '+' => 112,
                '-' => 113,
                '&' => 114,
                '*' => 116,
                '/' => 117,
                '|' => 118,
                _ if c.is_numeric() => 300,
                _ if c.is_ascii_alphabetic() => 201,
                _ if c.is_whitespace() => 0,
                _ => u32::MAX
            }
            // 0 -b-> 1 ("break")
            1 => letter_or_201(c, 'r', 2),
            2 => letter_or_201(c, 'e', 3),
            3 => letter_or_201(c, 'a', 4),
            4 => letter_or_201(c, 'k', 5),
            5 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
            // 0 -c-> 6 ("case")
            6 => letter_or_201(c, 'a', 7),
            7 => letter_or_201(c, 's', 8),
            8 => letter_or_201(c, 'e', 9),
            9 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
            // 0 -d-> 10 ("default")
            10 => letter_or_201(c, 'e', 11),
            11 => letter_or_201(c, 'f', 12),
            12 => letter_or_201(c, 'a', 13),
            13 => letter_or_201(c, 'u', 14),
            14 => letter_or_201(c, 'l', 15),
            15 => letter_or_201(c, 't', 16),
            16 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
            // 0 -e-> 17 ("else")
            17 => letter_or_201(c, 'l', 18),
            18 => letter_or_201(c, 's', 19),
            19 => letter_or_201(c, 'e', 20),
            20 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
            // 0 -f-> 21 ("float")
            21 => letter_or_201(c, 'l', 22),
            22 => letter_or_201(c, 'o', 23),
            23 => letter_or_201(c, 'a', 24),
            24 => letter_or_201(c, 't', 25),
            25 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
            // 0 -i-> 26 ("if" / "int" / "input")
            26 => match c {
                'f' => 27,
                'n' => 28,
                _ if c.is_ascii_alphanumeric() => 201,
                _ => 0
            }
            27 => if c.is_ascii_alphanumeric() { 201 } else { 0 } // if
            // 26 -n-> 28
            28 => match c {
                't' => 29,
                'p' => 30,
                _ if c.is_ascii_alphanumeric() => 201,
                _ => 0,
            }
            // 0 -i-> 26 -n-> 28 -t-> 29 ("int")
            29 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
            // 0 -i-> 26 -n-> 28 -p-> 30 ("input")
            30 => letter_or_201(c, 'u', 31),
            31 => letter_or_201(c, 't', 32),
            32 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
            // 0 -o-> 33 ("output")
            33 => letter_or_201(c, 'u', 34),
            34 => letter_or_201(c, 't', 35),
            35 => letter_or_201(c, 'p', 36),
            36 => letter_or_201(c, 'u', 37),
            37 => letter_or_201(c, 't', 38),
            38 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
            // 0 -s-> 39 ("switch" / "static_cast<...>")
            // BUGFIX: the old arm was `... else { 201 }`, which sent even
            // non-identifier characters (whitespace, `;`, `(`, ...) into the Id
            // state, so a lone identifier `s` swallowed its terminator
            // (e.g. `s;` lexed as the single Id lexeme "s;").
            39 => match c {
                'w' => 40,
                't' => 50,
                _ if c.is_ascii_alphanumeric() => 201,
                _ => 0,
            }
            40 => letter_or_201(c, 'i', 41),
            41 => letter_or_201(c, 't', 42),
            42 => letter_or_201(c, 'c', 43),
            43 => letter_or_201(c, 'h', 44),
            44 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
            // 0 -w-> 45 ("while")
            45 => letter_or_201(c, 'h', 46),
            46 => letter_or_201(c, 'i', 47),
            47 => letter_or_201(c, 'l', 48),
            48 => letter_or_201(c, 'e', 49),
            49 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
            // 0 -s-> 39 -t-> 50 ("static_cast")
            50 => letter_or_201(c, 'a', 51),
            51 => letter_or_201(c, 't', 52),
            52 => letter_or_201(c, 'i', 53),
            53 => letter_or_201(c, 'c', 54),
            54 => letter_or_201(c, '_', 55),
            // from here on only the exact spelling `cast<int>` / `cast<float>`
            // continues; anything else falls back to 0 (accepting "static" as Id)
            55 => if c == 'c' { 56 } else { 0 },
            56 => if c == 'a' { 57 } else { 0 },
            57 => if c == 's' { 58 } else { 0 },
            58 => if c == 't' { 59 } else { 0 },
            59 => if c == '<' { 60 } else { 0 },
            60 => if c == 'i' { 61 } else if c == 'f' { 63 } else { 0 },
            61 => if c == 'n' { 62 } else { 0 },
            62 => if c == 't' { 67 } else { 0 },
            // 60 -f-> 63 (the `float` cast argument)
            63 => if c == 'l' { 64 } else { 0 },
            64 => if c == 'o' { 65 } else { 0 },
            65 => if c == 'a' { 66 } else { 0 },
            66 => if c == 't' { 67 } else { 0 },
            67 => if c == '>' { 68 } else { 0 },
            // `>`, `<`, `=`, `!` may extend with '=' into a two-char relop
            100 | 101 | 102 | 104 => if c == '=' { 103 } else { 0 }
            114 => if c == '&' { 115 } else { 0 }
            // NOTE(review): `/` followed by `*` moves into the comment states,
            // but the `/` itself was already accepted as Divide, so a `/*`
            // comment still emits MulOp(Divide) first. This matches the
            // committed example.tok golden output, so it is preserved as-is.
            117 => if c == '*' { 400 } else { 0 }
            118 => if c == '|' { 119 } else { 0 }
            // identifiers: letters/digits keep extending
            201 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
            // numbers: 300 integer part, 301 just-after-dot, 302 fraction
            300 if c.is_numeric() => 300,
            300 if c == '.' => 301,
            301 | 302 => if c.is_numeric() { 302 } else { 0 }
            // multi-line comments: 400 inside, 401 saw '*', 402 closed
            400 => if c == '*' { 401 } else { 400 }
            401 => if c == '/' { 402 } else { 400 }
            _ => 0
        }
    }

    /// Whether the current state accepts a token. Non-accepting states are:
    /// the start state, a number ending in '.', lone `&`/`|`, the middle of a
    /// `static_cast<...>` spelling, and the comment states.
    fn is_accepting(&self) -> bool {
        self.state != 0 &&
        self.state != 301 &&
        self.state != 114 &&
        self.state != 118 &&
        (self.state < 55 || self.state > 67) &&
        (self.state < 400 || self.state > 403)
    }

    /// Maps `last_accepting` to a token and takes ownership of the lexeme.
    fn accept_token(&mut self) -> (Token, String) {
        // take the last saved lexeme and put an empty one instead
        let mut lexeme = String::new();
        std::mem::swap(&mut lexeme, &mut self.lexeme);
        // match based on the last accepting state
        let token = match self.last_accepting {
            5 => Token::Break,
            9 => Token::Case,
            16 => Token::Default,
            20 => Token::Else,
            25 => Token::Float,
            27 => Token::If,
            29 => Token::Int,
            32 => Token::Input,
            38 => Token::Output,
            44 => Token::Switch,
            49 => Token::While,
            68 => Token::Cast(if lexeme.ends_with("int>") { CastType::Int } else { CastType::Float }),
            100 => Token::RelOp(RelOpType::Greater),
            101 => Token::RelOp(RelOpType::Less),
            102 => Token::Equal,
            103 => Token::RelOp(relop_from_str(&lexeme)),
            104 => Token::Not,
            105 => Token::BracketOpen,
            106 => Token::BracketClose,
            107 => Token::CBracketOpen,
            108 => Token::CBracketClose,
            109 => Token::Comma,
            110 => Token::Colon,
            111 => Token::SemiColon,
            112 => Token::AddOp(AddOpType::Add),
            113 => Token::AddOp(AddOpType::Subtract),
            115 => Token::And,
            116 => Token::MulOp(MulOpType::Multiply),
            117 => Token::MulOp(MulOpType::Divide),
            119 => Token::Or,
            300 | 302 => Token::Num,
            _ => Token::Id, // if called while `self.last_accepting` is 0, this gives false results
        };
        (token, lexeme)
    }

    /// Pops a pushed-back char if any; otherwise reads one byte from the
    /// reader. Returns '\0' on EOF or read error (callers treat it as EOF).
    fn next_char(&mut self) -> char {
        if let Some(c) = self.last_char.pop_front() {
            c
        }
        else {
            // create a 1 byte buffer
            let mut b = [0u8; 1];
            // no error handling, as we will just return a \0 in that case
            let _ = self.reader.read(&mut b);
            char::from_u32(b[0].into()).unwrap_or('\0')
        }
    }
}

/// Keyword-spelling helper: advance to `ok` on the expected letter `w`,
/// fall back to the generic identifier state 201 on any other alphanumeric,
/// and to 0 (stuck) otherwise.
fn letter_or_201(c: char, w: char, ok: u32) -> u32 {
    if c == w {
        ok
    }
    else {
        match c {
            'a'..='z' | 'A'..='Z' | '0'..='9' => 201,
            _ => 0,
        }
    }
}

/// Decodes a two-char relop lexeme (`<=`, `>=`, `!=`, `==`) by its first char.
fn relop_from_str(s: &str) -> RelOpType {
    // matches the first letter in s; assuming s is a relop token lexeme, it
    // will provide the correct result (state 103 is only reached via '=')
    match s.chars().next().expect("Invalid call to relop_from_str!") {
        '<' => RelOpType::LessEq,
        '>' => RelOpType::GreaterEq,
        '!' => RelOpType::NotEqual,
        '=' => RelOpType::Equal,
        _ => panic!("Invalid call to relop_from_str")
    }
}

/// Binary entry point (reconstructed `src/main.rs`): lexes a `.ou` file given
/// as the single CLI argument and writes one `Token: ..., Lexeme: ...` line
/// per token to the matching `.tok` file.
/// (In the original crate layout this lived in its own file and imported
/// `cla::{Lexer, Token}`; here the items are local.)
fn main() {
    eprintln!("Student: Aviv Romem");
    if std::env::args().count() == 2 {
        // Safety: we make sure we have 2 items in the args iterator
        let file_name = std::env::args().skip(1).next().unwrap();
        if file_name.ends_with(".ou") && Path::new(&file_name).exists() {
            let file = std::fs::File::open(&file_name).expect("INVALID FILE");
            let mut buf_reader = BufReader::new(file); // use a buf reader for faster reading
            let mut lexer = Lexer::new(&mut buf_reader); // set up a new lexer instance

            // open output file: replace the trailing ".ou" (3 chars) with ".tok"
            let mut output_file = file_name.chars().take(file_name.chars().count() - 3).collect::<String>();
            output_file.push_str(".tok");
            let mut output = std::fs::File::create(Path::new(&output_file)).expect("Cannot create output file!");

            // Loop over the tokens until we get Token::EOF, meaning no more tokens available
            loop {
                let (token, lexeme) = lexer.get_next_token();
                if token == Token::EOF {
                    break;
                }
                // string to print, using `fmt::Debug` derive for easier printing of the token
                let line = format!("Token: {:?}, Lexeme: {}\n", token, lexeme);
                // turn to bytes (UTF-8 by default), write and flush to file.
                // BUGFIX: was `write`, which may write only part of the buffer;
                // `write_all` guarantees the whole line is written or errors.
                let as_bytes = line.as_bytes();
                output.write_all(as_bytes).expect("Failed writing to output file!");
                output.flush().expect("Failed writing to output file!");
            }
        }
        else {
            eprintln!("USAGE: cla .ou");
        }
    }
    else {
        eprintln!("USAGE: cla .ou");
    }
}