first commit
This commit is contained in:
commit
cadefc09f8
8 changed files with 488 additions and 0 deletions
7
Cargo.lock
generated
Normal file
7
Cargo.lock
generated
Normal file
|
@ -0,0 +1,7 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "cla"
|
||||
version = "0.1.0"
|
8
Cargo.toml
Normal file
8
Cargo.toml
Normal file
|
@ -0,0 +1,8 @@
|
|||
[package]
|
||||
name = "cla"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
BIN
cla
Executable file
BIN
cla
Executable file
Binary file not shown.
20
example.ou
Normal file
20
example.ou
Normal file
|
@ -0,0 +1,20 @@
|
|||
int hello;
|
||||
if(a == 1.56) {
|
||||
b = 5 + 20.1;
|
||||
static_cast<int>(1.2);
|
||||
a >= b
|
||||
1.
|
||||
$
|
||||
}
|
||||
|
||||
break case default else float if input int output switch while
|
||||
(){},:;=
|
||||
==!=<><=>=
|
||||
+-
|
||||
|| && ! static_cast<float> static_cast<int>
|
||||
someID123
|
||||
123 123.32
|
||||
/* plus some
|
||||
multi line comments */
|
||||
|
||||
static_cast<floa>
|
67
example.tok
Normal file
67
example.tok
Normal file
|
@ -0,0 +1,67 @@
|
|||
Token: Int, Lexeme: int
|
||||
Token: Id, Lexeme: hello
|
||||
Token: SemiColon, Lexeme: ;
|
||||
Token: If, Lexeme: if
|
||||
Token: BracketOpen, Lexeme: (
|
||||
Token: Id, Lexeme: a
|
||||
Token: RelOp(Equal), Lexeme: ==
|
||||
Token: Num, Lexeme: 1.56
|
||||
Token: BracketClose, Lexeme: )
|
||||
Token: CBracketOpen, Lexeme: {
|
||||
Token: Id, Lexeme: b
|
||||
Token: Equal, Lexeme: =
|
||||
Token: Num, Lexeme: 5
|
||||
Token: AddOp(Add), Lexeme: +
|
||||
Token: Num, Lexeme: 20.1
|
||||
Token: SemiColon, Lexeme: ;
|
||||
Token: Cast(Int), Lexeme: static_cast<int>
|
||||
Token: BracketOpen, Lexeme: (
|
||||
Token: Num, Lexeme: 1.2
|
||||
Token: BracketClose, Lexeme: )
|
||||
Token: SemiColon, Lexeme: ;
|
||||
Token: Id, Lexeme: a
|
||||
Token: RelOp(GreaterEq), Lexeme: >=
|
||||
Token: Id, Lexeme: b
|
||||
Token: Num, Lexeme: 1
|
||||
Token: CBracketClose, Lexeme: }
|
||||
Token: Break, Lexeme: break
|
||||
Token: Case, Lexeme: case
|
||||
Token: Default, Lexeme: default
|
||||
Token: Else, Lexeme: else
|
||||
Token: Float, Lexeme: float
|
||||
Token: If, Lexeme: if
|
||||
Token: Input, Lexeme: input
|
||||
Token: Int, Lexeme: int
|
||||
Token: Output, Lexeme: output
|
||||
Token: Switch, Lexeme: switch
|
||||
Token: While, Lexeme: while
|
||||
Token: BracketOpen, Lexeme: (
|
||||
Token: BracketClose, Lexeme: )
|
||||
Token: CBracketOpen, Lexeme: {
|
||||
Token: CBracketClose, Lexeme: }
|
||||
Token: Comma, Lexeme: ,
|
||||
Token: Colon, Lexeme: :
|
||||
Token: SemiColon, Lexeme: ;
|
||||
Token: Equal, Lexeme: =
|
||||
Token: RelOp(Equal), Lexeme: ==
|
||||
Token: RelOp(NotEqual), Lexeme: !=
|
||||
Token: RelOp(Less), Lexeme: <
|
||||
Token: RelOp(Greater), Lexeme: >
|
||||
Token: RelOp(LessEq), Lexeme: <=
|
||||
Token: RelOp(GreaterEq), Lexeme: >=
|
||||
Token: AddOp(Add), Lexeme: +
|
||||
Token: AddOp(Subtract), Lexeme: -
|
||||
Token: Or, Lexeme: ||
|
||||
Token: And, Lexeme: &&
|
||||
Token: Not, Lexeme: !
|
||||
Token: Cast(Float), Lexeme: static_cast<float>
|
||||
Token: Cast(Int), Lexeme: static_cast<int>
|
||||
Token: Id, Lexeme: someID123
|
||||
Token: Num, Lexeme: 123
|
||||
Token: Num, Lexeme: 123.32
|
||||
Token: MulOp(Divide), Lexeme: /
|
||||
Token: Id, Lexeme: static
|
||||
Token: Id, Lexeme: cast
|
||||
Token: RelOp(Less), Lexeme: <
|
||||
Token: Id, Lexeme: floa
|
||||
Token: RelOp(Greater), Lexeme: >
|
0
src/dfa.rs
Normal file
0
src/dfa.rs
Normal file
345
src/lib.rs
Normal file
345
src/lib.rs
Normal file
|
@ -0,0 +1,345 @@
|
|||
use std::collections::VecDeque;

/// Every kind of token the lexer can emit.
#[derive(Debug, PartialEq)]
pub enum Token {
    // Reserved keywords of the language.
    Break,
    Case,
    Default,
    Else,
    Float,
    If,
    Input,
    Int,
    Output,
    Switch,
    While,
    // Punctuation symbols.
    BracketOpen,
    BracketClose,
    CBracketOpen,
    CBracketClose,
    Comma,
    Colon,
    SemiColon,
    Equal,
    // Operators, with their sub-kind where several lexemes share a class.
    RelOp(RelOpType),
    AddOp(AddOpType),
    MulOp(MulOpType),
    Or,
    And,
    Not,
    Cast(CastType),
    // Identifiers and numeric literals.
    Id,
    Num,
    /// End of file, or a read error (both are handled the same currently).
    EOF,
}

/// Relational comparison operators: `==`, `!=`, `<`, `<=`, `>`, `>=`.
#[derive(Debug, PartialEq)]
pub enum RelOpType {
    Equal,
    NotEqual,
    Less,
    LessEq,
    Greater,
    GreaterEq,
}

/// Additive operators: `+`, `-`.
#[derive(Debug, PartialEq)]
pub enum AddOpType {
    Add,
    Subtract,
}

/// Multiplicative operators: `*`, `/`.
#[derive(Debug, PartialEq)]
pub enum MulOpType {
    Multiply,
    Divide,
}

/// Target type of a `static_cast<...>` lexeme.
#[derive(Debug, PartialEq)]
pub enum CastType {
    Int,
    Float,
}
|
||||
/// A hand-written DFA-based lexer pulling characters from an arbitrary reader.
pub struct Lexer<'a> {
    /// Current DFA state (states are plain numbers).
    state: u32,
    /// Most recent accepting state reached for the token in progress.
    last_accepting: u32,
    /// Lexeme accepted so far.
    lexeme: String,
    /// Characters read but not yet treated (push-back queue, consumed first).
    last_char: VecDeque<char>,
    /// Characters read but not yet accepted into the lexeme.
    last_char_read: VecDeque<char>,
    /// Underlying reader (usually a file; can be a TCP socket as well).
    reader: &'a mut dyn std::io::Read,
    /// Current line number, 1-based, for error reporting.
    line_num: usize,
}
|
||||
impl<'a> Lexer<'a> {
|
||||
/// Creates a new State instance.
///
/// `reader` is the character source (usually a buffered file); the lexer
/// reads it one byte at a time via `next_char`.
pub fn new(reader: &mut dyn std::io::Read) -> Lexer {
    Lexer {
        // DFA starts in state 0 with nothing accepted yet
        state: 0,
        last_accepting: 0,
        lexeme: String::new(),
        // small push-back queues; 2 is enough for the lookahead this DFA needs
        last_char: VecDeque::with_capacity(2),
        last_char_read: VecDeque::with_capacity(2),
        reader,
        line_num: 1, // line numbers start with 1, its not an array
    }
}
|
||||
/// Reads from the given reader until a token is matched.
///
/// Returns the token and its lexeme. On end of input (or a read error,
/// which `next_char` collapses into `'\0'`) it returns `(Token::EOF, "")`.
/// Unexpected characters are reported on stderr and skipped.
///
/// NOTE(review): if a token was already accepted when a `/* ... */`
/// comment closes (e.g. the leading `/` of `/*` is an accepted Divide),
/// that token is emitted after the comment ends — example.tok shows this
/// behavior; confirm it is intended.
pub fn get_next_token(&mut self) -> (Token, String) {
    // reset the states before searching for the next token
    self.last_accepting = 0;
    self.state = 0;
    // true while the DFA sits inside a /* ... */ comment (states 400-402)
    let mut comment = false;
    loop {
        // query next char and proceed to next step
        let c = self.next_char();
        self.state = self.next_state(c);
        if self.state == 0 {
            // something made the lexer reset(got stuck), attempt to tokenize the accepted lexeme
            if self.last_accepting != 0 {
                // push the "read but not accepted" buffer to read later
                self.last_char.push_front(c);
                while let Some(lc) = self.last_char_read.pop_back() {
                    self.last_char.push_front(lc);
                }
                return self.accept_token()
            }
            // no accepted lexeme, if we are not in a comment and it is not a whitespace, write an error
            else if !c.is_whitespace() && !comment {
                eprintln!("Unexpected character at line {}: {}", self.line_num, self.last_char_read.front().unwrap_or(&' '));
            }
            // make sure we are no longer in a comment, as comment ends upon reseting to 0
            comment = false;
        }
        else if c == '\0' {
            // Exit when we get a read error/end of file
            return (Token::EOF, String::new())
        }
        else if self.state == u32::MAX {
            // error state returned from state 0 to signal an unexpected character, print error and reset
            eprintln!("Unexpected character at line {}: {}", self.line_num, c);
            self.state = 0;
            self.lexeme = String::new();
        }
        else if self.is_accepting() {
            // if we are at an accepting state, save the current state
            // and push everything in the last_char_read buffer and current char into lexeme, as we can accept them
            self.last_accepting = self.state;
            while let Some(lc) = self.last_char_read.pop_front() {
                self.lexeme.push(lc);
            }
            self.lexeme.push(c);
        }
        else if self.state >= 400 && self.state <= 402{
            // comment states
            comment = true;
        }
        else {
            // if we are not accepting, push last char into the buffer, since we dont know if it will be part of the
            // next lexeme
            self.last_char_read.push_back(c);
        }
        // count lines after dispatch so errors on this char report the line it sits on
        if c == '\n' {
            self.line_num += 1;
        }
    }
}
|
||||
|
||||
/// returns next state, If returns 0, it got stuck and needs to restart, u32::MAX is invalid character
|
||||
fn next_state(&self, c: char) -> u32 {
|
||||
// the DFA state machine, state 0 has a lot of edges, but most states are straight forward, though are many due to the keywords
|
||||
match self.state {
|
||||
0 => match c {
|
||||
'b' => 1,
|
||||
'c' => 6,
|
||||
'd' => 10,
|
||||
'e' => 17,
|
||||
'f' => 21,
|
||||
'i' => 26,
|
||||
'o' => 33,
|
||||
's' => 39,
|
||||
'w' => 45,
|
||||
'>' => 100,
|
||||
'<' => 101,
|
||||
'=' => 102,
|
||||
'!' => 104,
|
||||
'(' => 105,
|
||||
')' => 106,
|
||||
'{' => 107,
|
||||
'}' => 108,
|
||||
',' => 109,
|
||||
':' => 110,
|
||||
';' => 111,
|
||||
'+' => 112,
|
||||
'-' => 113,
|
||||
'&' => 114,
|
||||
'*' => 116,
|
||||
'/' => 117,
|
||||
'|' => 118,
|
||||
_ if c.is_numeric() => 300,
|
||||
_ if c.is_ascii_alphabetic() => 201,
|
||||
_ if c.is_whitespace() => 0,
|
||||
_ => u32::MAX
|
||||
}
|
||||
// 0 -b-> 1
|
||||
1 => letter_or_201(c, 'r', 2),
|
||||
2 => letter_or_201(c, 'e', 3),
|
||||
3 => letter_or_201(c, 'a', 4),
|
||||
4 => letter_or_201(c, 'k', 5),
|
||||
5 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
|
||||
// 0 -c-> 6
|
||||
6 => letter_or_201(c, 'a', 7),
|
||||
7 => letter_or_201(c, 's', 8),
|
||||
8 => letter_or_201(c, 'e', 9),
|
||||
9 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
|
||||
// 0 -d->10
|
||||
10 => letter_or_201(c, 'e', 11),
|
||||
11 => letter_or_201(c, 'f', 12),
|
||||
12 => letter_or_201(c, 'a', 13),
|
||||
13 => letter_or_201(c, 'u', 14),
|
||||
14 => letter_or_201(c, 'l', 15),
|
||||
15 => letter_or_201(c, 't', 16),
|
||||
16 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
|
||||
// 0 -e-> 17
|
||||
17 => letter_or_201(c, 'l', 18),
|
||||
18 => letter_or_201(c, 's', 19),
|
||||
19 => letter_or_201(c, 'e', 20),
|
||||
20 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
|
||||
// 0 -f-> 21
|
||||
21 => letter_or_201(c, 'l', 22),
|
||||
22 => letter_or_201(c, 'o', 23),
|
||||
23 => letter_or_201(c, 'a', 24),
|
||||
24 => letter_or_201(c, 't', 25),
|
||||
25 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
|
||||
// 0 -i-> 26
|
||||
26 => match c {
|
||||
'f' => 27,
|
||||
'n' => 28,
|
||||
_ if c.is_ascii_alphanumeric() => 201,
|
||||
_ => 0
|
||||
}
|
||||
27 => if c.is_ascii_alphanumeric() { 201 } else { 0 } // if
|
||||
// 26 -n-> 28
|
||||
28 => match c {
|
||||
't' => 29,
|
||||
'p' => 30,
|
||||
_ if c.is_ascii_alphanumeric() => 201,
|
||||
_ => 0,
|
||||
}
|
||||
// 0 -i-> 26 -n-> 28 -t-> 29
|
||||
29 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
|
||||
// 0 -i-> 26 -n-> 28 -p-> 30
|
||||
30 => letter_or_201(c, 'u', 31),
|
||||
31 => letter_or_201(c, 't', 32),
|
||||
32 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
|
||||
// 0 -o-> 33
|
||||
33 => letter_or_201(c, 'u', 34),
|
||||
34 => letter_or_201(c, 't', 35),
|
||||
35 => letter_or_201(c, 'p', 36),
|
||||
36 => letter_or_201(c, 'u', 37),
|
||||
37 => letter_or_201(c, 't', 38),
|
||||
38 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
|
||||
// 0 -s-> 39
|
||||
39 => if c == 'w' { 40 } else if c == 't' { 50 } else { 201 },
|
||||
40 => letter_or_201(c, 'i', 41),
|
||||
41 => letter_or_201(c, 't', 42),
|
||||
42 => letter_or_201(c, 'c', 43),
|
||||
43 => letter_or_201(c, 'h', 44),
|
||||
44 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
|
||||
45 => letter_or_201(c, 'h', 46),
|
||||
46 => letter_or_201(c, 'i', 47),
|
||||
47 => letter_or_201(c, 'l', 48),
|
||||
48 => letter_or_201(c, 'e', 49),
|
||||
49 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
|
||||
// 0 -s-> 30 -t-> 50
|
||||
50 => letter_or_201(c, 'a', 51),
|
||||
51 => letter_or_201(c, 't', 52),
|
||||
52 => letter_or_201(c, 'i', 53),
|
||||
53 => letter_or_201(c, 'c', 54),
|
||||
54 => letter_or_201(c, '_', 55),
|
||||
55 => if c == 'c' { 56 } else { 0 },
|
||||
56 => if c == 'a' { 57 } else { 0 },
|
||||
57 => if c == 's' { 58 } else { 0 },
|
||||
58 => if c == 't' { 59 } else { 0 },
|
||||
59 => if c == '<' { 60 } else { 0 },
|
||||
60 => if c == 'i' { 61 } else if c == 'f' { 63 } else { 0 },
|
||||
61 => if c == 'n' { 62 } else { 0 },
|
||||
62 => if c == 't' { 67 } else { 0 },
|
||||
// 60 -f-> 63
|
||||
63 => if c == 'l' { 64 } else { 0 },
|
||||
64 => if c == 'o' { 65 } else { 0 },
|
||||
65 => if c == 'a' { 66 } else { 0 },
|
||||
66 => if c == 't' { 67 } else { 0 },
|
||||
67 => if c == '>' { 68 } else { 0 },
|
||||
100 | 101 | 102 | 104 => if c == '=' { 103 } else { 0 }
|
||||
114 => if c == '&' { 115 } else { 0 }
|
||||
117 => if c == '*' { 400 } else { 0 }
|
||||
118 => if c == '|' { 119 } else { 0 }
|
||||
201 => if c.is_ascii_alphanumeric() { 201 } else { 0 }
|
||||
300 if c.is_numeric() => 300,
|
||||
300 if c == '.' => 301,
|
||||
301 | 302 => if c.is_numeric() { 302 } else { 0 }
|
||||
400 => if c == '*' { 401 } else { 400 }
|
||||
401 => if c == '/' { 402 } else { 400 }
|
||||
_ => 0
|
||||
}
|
||||
}
|
||||
fn is_accepting(&self) -> bool {
|
||||
self.state != 0 &&
|
||||
self.state != 301 &&
|
||||
self.state != 114 &&
|
||||
self.state != 118 &&
|
||||
(self.state < 55 || self.state > 67) &&
|
||||
(self.state < 400 || self.state > 403)
|
||||
}
|
||||
/// Consumes the saved lexeme and maps `last_accepting` to its `Token`.
/// Only meaningful when `last_accepting` holds a real accepting state.
fn accept_token(&mut self) -> (Token, String) {
    // take the last saved lexeme and put an empty one instead
    let mut lexeme = String::new();
    std::mem::swap(&mut lexeme, &mut self.lexeme);
    // match based on the last accepting state
    let token = match self.last_accepting {
        // keyword end states
        5 => Token::Break,
        9 => Token::Case,
        16 => Token::Default,
        20 => Token::Else,
        25 => Token::Float,
        27 => Token::If,
        29 => Token::Int,
        32 => Token::Input,
        38 => Token::Output,
        44 => Token::Switch,
        49 => Token::While,
        // 68 ends both casts; the lexeme suffix tells them apart
        68 => Token::Cast(if lexeme.ends_with("int>") { CastType::Int } else { CastType::Float }),
        100 => Token::RelOp(RelOpType::Greater),
        101 => Token::RelOp(RelOpType::Less),
        102 => Token::Equal,
        // 103 covers "<=", ">=", "!=", "=="; the first char disambiguates
        103 => Token::RelOp(relop_from_str(&lexeme)),
        104 => Token::Not,
        105 => Token::BracketOpen,
        106 => Token::BracketClose,
        107 => Token::CBracketOpen,
        108 => Token::CBracketClose,
        109 => Token::Comma,
        110 => Token::Colon,
        111 => Token::SemiColon,
        112 => Token::AddOp(AddOpType::Add),
        113 => Token::AddOp(AddOpType::Subtract),
        115 => Token::And,
        116 => Token::MulOp(MulOpType::Multiply),
        117 => Token::MulOp(MulOpType::Divide),
        119 => Token::Or,
        300 | 302 => Token::Num,
        _ => Token::Id, // if called when `self.last_accepting` is 0, will give false results
    };
    (token, lexeme)
}
|
||||
|
||||
fn next_char(&mut self) -> char {
|
||||
if let Some(c) = self.last_char.pop_front() {
|
||||
c
|
||||
}
|
||||
else {
|
||||
// create a 1 byte buffer
|
||||
let mut b = [0u8; 1];
|
||||
// no error handling, as we will just return a \0 in that case
|
||||
let _ = self.reader.read(&mut b);
|
||||
char::from_u32(b[0].into()).unwrap_or('\0')
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Keyword-prefix helper: advance to `ok` when `c` is the expected char
/// `w`, fall into the generic identifier state (201) on any other ASCII
/// alphanumeric, and reset the DFA (0) otherwise.
fn letter_or_201(c: char, w: char, ok: u32) -> u32 {
    if c == w {
        ok
    } else if c.is_ascii_alphanumeric() {
        201
    } else {
        0
    }
}
|
||||
|
||||
fn relop_from_str(s: &str) -> RelOpType {
|
||||
// matches the first letter in s, assuming s is a relop token lexeme, it will provide correct result
|
||||
match s.chars().next().expect("Invalid call to relop_from_str!") {
|
||||
'<' => RelOpType::LessEq,
|
||||
'>' => RelOpType::GreaterEq,
|
||||
'!' => RelOpType::NotEqual,
|
||||
'=' => RelOpType::Equal,
|
||||
_ => panic!("Invalid call to relop_from_str")
|
||||
}
|
||||
}
|
41
src/main.rs
Normal file
41
src/main.rs
Normal file
|
@ -0,0 +1,41 @@
|
|||
use std::{path::Path, io::{BufReader, Write}};
|
||||
|
||||
use cla::{Lexer, Token};
|
||||
|
||||
fn main() {
|
||||
eprintln!("Student: Aviv Romem");
|
||||
if std::env::args().count() == 2 {
|
||||
// Safety: we make sure we have 2 items in the args iterator
|
||||
let file_name = std::env::args().skip(1).next().unwrap();
|
||||
if file_name.ends_with(".ou") && Path::new(&file_name).exists() {
|
||||
let file = std::fs::File::open(&file_name).expect("INVALID FILE");
|
||||
let mut buf_reader = BufReader::new(file); // use a buf reader for faster reading
|
||||
let mut lexer = Lexer::new(&mut buf_reader); // set up a new lexer instance
|
||||
|
||||
// open output file
|
||||
let mut output_file = file_name.chars().take(file_name.chars().count() - 3).collect::<String>();
|
||||
output_file.push_str(&".tok");
|
||||
let mut output = std::fs::File::create(Path::new(&output_file)).expect("Cannot create output file!");
|
||||
|
||||
// Loop over the tokens until we get Token::EOF, meaning no more tokens available
|
||||
loop {
|
||||
let (token, lexeme) = lexer.get_next_token();
|
||||
if token == Token::EOF {
|
||||
break;
|
||||
}
|
||||
// string to print, using `fmt::Debug` derive for easier printing of the token
|
||||
let line = format!("Token: {:?}, Lexeme: {}\n", token, lexeme);
|
||||
// turn to bytes(UTF-8 by default), write and flush to file
|
||||
let as_bytes = line.as_bytes();
|
||||
output.write(as_bytes).expect("Failed writing to output file!");
|
||||
output.flush().expect("Failed writing to output file!");
|
||||
}
|
||||
}
|
||||
else {
|
||||
eprintln!("USAGE: cla <file_name>.ou");
|
||||
}
|
||||
}
|
||||
else {
|
||||
eprintln!("USAGE: cla <file_name>.ou");
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue