first commit

This commit is contained in:
neon - rustystriker's laptop 2023-12-30 14:01:19 +02:00
commit cadefc09f8
8 changed files with 488 additions and 0 deletions

7
Cargo.lock generated Normal file
View file

@ -0,0 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "cla"
version = "0.1.0"

8
Cargo.toml Normal file
View file

@ -0,0 +1,8 @@
[package]
name = "cla"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

BIN
cla Executable file

Binary file not shown.

20
example.ou Normal file
View file

@ -0,0 +1,20 @@
int hello;
if(a == 1.56) {
b = 5 + 20.1;
static_cast<int>(1.2);
a >= b
1.
$
}
break case default else float if input int output switch while
(){},:;=
==!=<><=>=
+-
|| && ! static_cast<float> static_cast<int>
someID123
123 123.32
/* plus some
multi line comments */
static_cast<floa>

67
example.tok Normal file
View file

@ -0,0 +1,67 @@
Token: Int, Lexeme: int
Token: Id, Lexeme: hello
Token: SemiColon, Lexeme: ;
Token: If, Lexeme: if
Token: BracketOpen, Lexeme: (
Token: Id, Lexeme: a
Token: RelOp(Equal), Lexeme: ==
Token: Num, Lexeme: 1.56
Token: BracketClose, Lexeme: )
Token: CBracketOpen, Lexeme: {
Token: Id, Lexeme: b
Token: Equal, Lexeme: =
Token: Num, Lexeme: 5
Token: AddOp(Add), Lexeme: +
Token: Num, Lexeme: 20.1
Token: SemiColon, Lexeme: ;
Token: Cast(Int), Lexeme: static_cast<int>
Token: BracketOpen, Lexeme: (
Token: Num, Lexeme: 1.2
Token: BracketClose, Lexeme: )
Token: SemiColon, Lexeme: ;
Token: Id, Lexeme: a
Token: RelOp(GreaterEq), Lexeme: >=
Token: Id, Lexeme: b
Token: Num, Lexeme: 1
Token: CBracketClose, Lexeme: }
Token: Break, Lexeme: break
Token: Case, Lexeme: case
Token: Default, Lexeme: default
Token: Else, Lexeme: else
Token: Float, Lexeme: float
Token: If, Lexeme: if
Token: Input, Lexeme: input
Token: Int, Lexeme: int
Token: Output, Lexeme: output
Token: Switch, Lexeme: switch
Token: While, Lexeme: while
Token: BracketOpen, Lexeme: (
Token: BracketClose, Lexeme: )
Token: CBracketOpen, Lexeme: {
Token: CBracketClose, Lexeme: }
Token: Comma, Lexeme: ,
Token: Colon, Lexeme: :
Token: SemiColon, Lexeme: ;
Token: Equal, Lexeme: =
Token: RelOp(Equal), Lexeme: ==
Token: RelOp(NotEqual), Lexeme: !=
Token: RelOp(Less), Lexeme: <
Token: RelOp(Greater), Lexeme: >
Token: RelOp(LessEq), Lexeme: <=
Token: RelOp(GreaterEq), Lexeme: >=
Token: AddOp(Add), Lexeme: +
Token: AddOp(Subtract), Lexeme: -
Token: Or, Lexeme: ||
Token: And, Lexeme: &&
Token: Not, Lexeme: !
Token: Cast(Float), Lexeme: static_cast<float>
Token: Cast(Int), Lexeme: static_cast<int>
Token: Id, Lexeme: someID123
Token: Num, Lexeme: 123
Token: Num, Lexeme: 123.32
Token: MulOp(Divide), Lexeme: /
Token: Id, Lexeme: static
Token: Id, Lexeme: cast
Token: RelOp(Less), Lexeme: <
Token: Id, Lexeme: floa
Token: RelOp(Greater), Lexeme: >

0
src/dfa.rs Normal file
View file

345
src/lib.rs Normal file
View file

@ -0,0 +1,345 @@
use std::collections::VecDeque;
/// Every kind of token the lexer can hand back to its caller.
///
/// The `Debug` representation of each variant is what the command line tool
/// writes into the `.tok` file, so variant names are part of the output format.
#[derive(Debug, PartialEq)]
pub enum Token {
    // ---- reserved words ----
    Break,
    Case,
    Default,
    Else,
    Float,
    If,
    Input,
    Int,
    Output,
    Switch,
    While,

    // ---- punctuation ----
    /// `(`
    BracketOpen,
    /// `)`
    BracketClose,
    /// `{`
    CBracketOpen,
    /// `}`
    CBracketClose,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `;`
    SemiColon,
    /// A single `=` (assignment); `==` is reported as `RelOp(Equal)` instead.
    Equal,

    // ---- operators ----
    /// One of `==`, `!=`, `<`, `<=`, `>`, `>=`.
    RelOp(RelOpType),
    /// `+` or `-`.
    AddOp(AddOpType),
    /// `*` or `/`.
    MulOp(MulOpType),
    /// `||`
    Or,
    /// `&&`
    And,
    /// `!`
    Not,
    /// `static_cast<int>` or `static_cast<float>`.
    Cast(CastType),

    // ---- everything else ----
    /// An identifier.
    Id,
    /// A numeric literal, with or without a fractional part.
    Num,
    /// End of input. A read error is reported the same way, since both are
    /// currently handled identically.
    EOF,
}
/// The relational operator carried by `Token::RelOp`.
///
/// A plain field-less enum, so it is cheap to copy and supports full equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RelOpType {
    /// `==`
    Equal,
    /// `!=`
    NotEqual,
    /// `<`
    Less,
    /// `<=`
    LessEq,
    /// `>`
    Greater,
    /// `>=`
    GreaterEq,
}
/// The additive operator carried by `Token::AddOp`.
///
/// A plain field-less enum, so it is cheap to copy and supports full equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AddOpType {
    /// `+`
    Add,
    /// `-`
    Subtract,
}
/// The multiplicative operator carried by `Token::MulOp`.
///
/// A plain field-less enum, so it is cheap to copy and supports full equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MulOpType {
    /// `*`
    Multiply,
    /// `/`
    Divide,
}
/// The target type carried by `Token::Cast`.
///
/// A plain field-less enum, so it is cheap to copy and supports full equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CastType {
    /// `static_cast<int>`
    Int,
    /// `static_cast<float>`
    Float,
}
/// A DFA-driven lexer that pulls characters from a borrowed reader and groups
/// them into tokens.
pub struct Lexer<'a> {
    /// The DFA state the lexer is currently in (states are plain numbers).
    state: u32,
    /// The most recent accepting state that was reached for the token being
    /// scanned; `0` means no accepting state has been seen yet.
    last_accepting: u32,
    /// The text accepted so far for the token being scanned.
    lexeme: String,
    /// Characters that were read ahead and must be scanned again before any
    /// new input is pulled from the reader.
    last_char: VecDeque<char>,
    /// Characters consumed since the last accepting state; they only become
    /// part of the lexeme if another accepting state is reached.
    last_char_read: VecDeque<char>,
    /// Where the input comes from (usually a file, but any reader works,
    /// e.g. a TCP socket).
    reader: &'a mut dyn std::io::Read,
    /// Current line number, used in error messages.
    line_num: usize,
}
impl<'a> Lexer<'a> {
    /// Creates a lexer that reads its input from `reader`.
    ///
    /// The reader is consumed one byte at a time, so callers should pass a
    /// buffered reader when the underlying source is a file or a socket.
    pub fn new(reader: &'a mut dyn std::io::Read) -> Lexer<'a> {
        Lexer {
            state: 0,
            last_accepting: 0,
            lexeme: String::new(),
            last_char: VecDeque::with_capacity(2),
            last_char_read: VecDeque::with_capacity(2),
            reader,
            line_num: 1, // line numbers start with 1, it is not an array
        }
    }

    /// Scans the input until the next token is complete and returns it
    /// together with its lexeme.
    ///
    /// The longest possible lexeme is matched. Whitespace and `/* ... */`
    /// comments are skipped and never produce a token. Characters that cannot
    /// start or continue any token are reported on stderr and skipped.
    ///
    /// Returns `(Token::EOF, String::new())` once the input is exhausted; a
    /// NUL byte or a read error is treated the same way.
    pub fn get_next_token(&mut self) -> (Token, String) {
        // reset the DFA before searching for the next token
        self.last_accepting = 0;
        self.state = 0;
        loop {
            let c = self.next_char();
            self.state = self.next_state(c);
            if self.state == 0 {
                if self.last_accepting != 0 {
                    // The DFA got stuck after an accepting state: `c` and
                    // everything read past that state belong to later tokens.
                    self.unread(c);
                    return self.accept_token();
                }
                if let Some(bad) = self.last_char_read.pop_front() {
                    // Stuck without ever accepting (a lone `&` or `|`). Report
                    // and drop the offending character, then rescan whatever
                    // followed it so that nothing else is lost and no stale
                    // character leaks into the next lexeme.
                    eprintln!("Unexpected character at line {}: {}", self.line_num, bad);
                    self.unread(c);
                    // `c` was not consumed, so its newline (if any) is counted
                    // when it is read again.
                    continue;
                }
                // otherwise: plain whitespace between tokens, nothing to do
            } else if c == '\0' {
                // end of input (or a read error)
                return (Token::EOF, String::new());
            } else if self.state == u32::MAX {
                // state 0 signalled a character that cannot start any token
                eprintln!("Unexpected character at line {}: {}", self.line_num, c);
                self.state = 0;
                self.lexeme.clear();
            } else if self.is_accepting() {
                // Remember this state and move everything read since the last
                // accepting state, plus `c`, into the lexeme.
                self.last_accepting = self.state;
                self.lexeme.extend(self.last_char_read.drain(..));
                self.lexeme.push(c);
            } else if (400..=402).contains(&self.state) {
                // Inside a comment. The `/` that opened it was provisionally
                // accepted as a divide operator; forget it so the comment does
                // not produce a token.
                self.last_accepting = 0;
                self.lexeme.clear();
                if self.state == 402 {
                    // `*/` seen: the comment is closed, resume at the start
                    // state so the very next character is scanned normally.
                    self.state = 0;
                }
            } else {
                // Not accepting: keep the character aside, it only becomes part
                // of the lexeme if an accepting state is reached later.
                self.last_char_read.push_back(c);
            }
            if c == '\n' {
                self.line_num += 1;
            }
        }
    }

    /// Queues the characters read since the last accepting state, followed by
    /// `c`, so that they are scanned again in their original order.
    fn unread(&mut self, c: char) {
        self.last_char.push_front(c);
        while let Some(pending) = self.last_char_read.pop_back() {
            self.last_char.push_front(pending);
        }
    }

    /// Returns the state reached from the current state on character `c`.
    ///
    /// `0` means the DFA got stuck and has to restart; `u32::MAX` (only
    /// produced from state 0) means `c` cannot start any token.
    fn next_state(&self, c: char) -> u32 {
        // States 55..=67 (the `_cast<...>` part of a cast) accept exactly one
        // character each; anything else gets the DFA stuck.
        let only = |wanted: char, next: u32| if c == wanted { next } else { 0 };
        match self.state {
            0 => match c {
                'b' => 1,
                'c' => 6,
                'd' => 10,
                'e' => 17,
                'f' => 21,
                'i' => 26,
                'o' => 33,
                's' => 39,
                'w' => 45,
                '>' => 100,
                '<' => 101,
                '=' => 102,
                '!' => 104,
                '(' => 105,
                ')' => 106,
                '{' => 107,
                '}' => 108,
                ',' => 109,
                ':' => 110,
                ';' => 111,
                '+' => 112,
                '-' => 113,
                '&' => 114,
                '*' => 116,
                '/' => 117,
                '|' => 118,
                // ASCII only: input bytes are mapped one-to-one to chars, so
                // the Unicode-aware `is_numeric` would also accept bytes such
                // as 0xB2 ('²').
                _ if c.is_ascii_digit() => 300,
                _ if c.is_ascii_alphabetic() => 201,
                _ if c.is_whitespace() => 0,
                _ => u32::MAX,
            },
            // break: 0 -b-> 1 .. 5
            1 => letter_or_201(c, 'r', 2),
            2 => letter_or_201(c, 'e', 3),
            3 => letter_or_201(c, 'a', 4),
            4 => letter_or_201(c, 'k', 5),
            // case: 0 -c-> 6 .. 9
            6 => letter_or_201(c, 'a', 7),
            7 => letter_or_201(c, 's', 8),
            8 => letter_or_201(c, 'e', 9),
            // default: 0 -d-> 10 .. 16
            10 => letter_or_201(c, 'e', 11),
            11 => letter_or_201(c, 'f', 12),
            12 => letter_or_201(c, 'a', 13),
            13 => letter_or_201(c, 'u', 14),
            14 => letter_or_201(c, 'l', 15),
            15 => letter_or_201(c, 't', 16),
            // else: 0 -e-> 17 .. 20
            17 => letter_or_201(c, 'l', 18),
            18 => letter_or_201(c, 's', 19),
            19 => letter_or_201(c, 'e', 20),
            // float: 0 -f-> 21 .. 25
            21 => letter_or_201(c, 'l', 22),
            22 => letter_or_201(c, 'o', 23),
            23 => letter_or_201(c, 'a', 24),
            24 => letter_or_201(c, 't', 25),
            // if / int / input: 0 -i-> 26
            26 => match c {
                'f' => 27,
                'n' => 28,
                _ if c.is_ascii_alphanumeric() => 201,
                _ => 0,
            },
            // 26 -n-> 28, then -t-> 29 (int) or -p-> 30 (input)
            28 => match c {
                't' => 29,
                'p' => 30,
                _ if c.is_ascii_alphanumeric() => 201,
                _ => 0,
            },
            30 => letter_or_201(c, 'u', 31),
            31 => letter_or_201(c, 't', 32),
            // output: 0 -o-> 33 .. 38
            33 => letter_or_201(c, 'u', 34),
            34 => letter_or_201(c, 't', 35),
            35 => letter_or_201(c, 'p', 36),
            36 => letter_or_201(c, 'u', 37),
            37 => letter_or_201(c, 't', 38),
            // switch / static_cast: 0 -s-> 39. Anything that cannot continue
            // an identifier must end the lexeme instead of being glued to it.
            39 => match c {
                'w' => 40,
                't' => 50,
                _ if c.is_ascii_alphanumeric() => 201,
                _ => 0,
            },
            // switch: 39 -w-> 40 .. 44
            40 => letter_or_201(c, 'i', 41),
            41 => letter_or_201(c, 't', 42),
            42 => letter_or_201(c, 'c', 43),
            43 => letter_or_201(c, 'h', 44),
            // while: 0 -w-> 45 .. 49
            45 => letter_or_201(c, 'h', 46),
            46 => letter_or_201(c, 'i', 47),
            47 => letter_or_201(c, 'l', 48),
            48 => letter_or_201(c, 'e', 49),
            // static_cast<...>: 0 -s-> 39 -t-> 50 .. 68
            50 => letter_or_201(c, 'a', 51),
            51 => letter_or_201(c, 't', 52),
            52 => letter_or_201(c, 'i', 53),
            53 => letter_or_201(c, 'c', 54),
            54 => letter_or_201(c, '_', 55),
            55 => only('c', 56),
            56 => only('a', 57),
            57 => only('s', 58),
            58 => only('t', 59),
            59 => only('<', 60),
            60 => match c {
                'i' => 61,
                'f' => 63,
                _ => 0,
            },
            // 60 -i-> 61: int
            61 => only('n', 62),
            62 => only('t', 67),
            // 60 -f-> 63: float
            63 => only('l', 64),
            64 => only('o', 65),
            65 => only('a', 66),
            66 => only('t', 67),
            67 => only('>', 68),
            // A completed keyword (5, 9, 16, 20, 25, 27, 29, 32, 38, 44, 49)
            // or an identifier (201): any further letter or digit makes (or
            // keeps) it an identifier.
            5 | 9 | 16 | 20 | 25 | 27 | 29 | 32 | 38 | 44 | 49 | 201 => {
                if c.is_ascii_alphanumeric() {
                    201
                } else {
                    0
                }
            }
            // `>`, `<`, `=`, `!` optionally followed by `=`
            100 | 101 | 102 | 104 => only('=', 103),
            114 => only('&', 115),
            // `/` followed by `*` opens a comment
            117 => only('*', 400),
            118 => only('|', 119),
            // number: integer part, then optional `.` and fraction digits
            300 => match c {
                '.' => 301,
                _ if c.is_ascii_digit() => 300,
                _ => 0,
            },
            301 | 302 => {
                if c.is_ascii_digit() {
                    302
                } else {
                    0
                }
            }
            // comment body; 401 means the previous character was `*`
            400 => {
                if c == '*' {
                    401
                } else {
                    400
                }
            }
            401 => match c {
                '/' => 402,
                // still right after a `*`, so that `**/` closes the comment
                '*' => 401,
                _ => 400,
            },
            _ => 0,
        }
    }

    /// Tells whether the current state is an accepting one.
    ///
    /// Non-accepting states are: the start state (0), a lone `&` (114) or `|`
    /// (118), digits followed by a `.` with no fraction digit yet (301), the
    /// `_cast<...` part of a cast before its closing `>` (55..=67) and the
    /// comment range (400..=403).
    fn is_accepting(&self) -> bool {
        !matches!(self.state, 0 | 114 | 118 | 301 | 55..=67 | 400..=403)
    }

    /// Builds the token for the last accepting state and hands out the
    /// accumulated lexeme, leaving an empty one behind.
    ///
    /// Must only be called after an accepting state was reached; otherwise the
    /// result is a meaningless `Token::Id`.
    fn accept_token(&mut self) -> (Token, String) {
        let lexeme = std::mem::take(&mut self.lexeme);
        let token = match self.last_accepting {
            5 => Token::Break,
            9 => Token::Case,
            16 => Token::Default,
            20 => Token::Else,
            25 => Token::Float,
            27 => Token::If,
            29 => Token::Int,
            32 => Token::Input,
            38 => Token::Output,
            44 => Token::Switch,
            49 => Token::While,
            68 => Token::Cast(if lexeme.ends_with("int>") {
                CastType::Int
            } else {
                CastType::Float
            }),
            100 => Token::RelOp(RelOpType::Greater),
            101 => Token::RelOp(RelOpType::Less),
            102 => Token::Equal,
            103 => Token::RelOp(relop_from_str(&lexeme)),
            104 => Token::Not,
            105 => Token::BracketOpen,
            106 => Token::BracketClose,
            107 => Token::CBracketOpen,
            108 => Token::CBracketClose,
            109 => Token::Comma,
            110 => Token::Colon,
            111 => Token::SemiColon,
            112 => Token::AddOp(AddOpType::Add),
            113 => Token::AddOp(AddOpType::Subtract),
            115 => Token::And,
            116 => Token::MulOp(MulOpType::Multiply),
            117 => Token::MulOp(MulOpType::Divide),
            119 => Token::Or,
            300 | 302 => Token::Num,
            // every other accepting state is a (possibly partial keyword)
            // prefix, i.e. an identifier
            _ => Token::Id,
        };
        (token, lexeme)
    }

    /// Returns the next character to scan: a previously un-read one if there
    /// is any, otherwise the next byte of the reader.
    ///
    /// Each byte is mapped to the char with the same code point. `'\0'` is
    /// returned at end of input and on read errors.
    fn next_char(&mut self) -> char {
        if let Some(c) = self.last_char.pop_front() {
            return c;
        }
        let mut byte = [0u8; 1];
        loop {
            match self.reader.read(&mut byte) {
                Ok(0) => return '\0',
                Ok(_) => return char::from(byte[0]),
                // an interrupted read is not an error, just try again
                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                // any other error is handled like end of input
                Err(_) => return '\0',
            }
        }
    }
}
/// Transition helper for the keyword states of the DFA.
///
/// Yields `ok` when `c` is the expected character `w`. Otherwise yields 201
/// (the identifier state) for an ASCII letter or digit, and 0 (stuck) for
/// anything else.
fn letter_or_201(c: char, w: char, ok: u32) -> u32 {
    if c == w {
        return ok;
    }
    if c.is_ascii_alphanumeric() {
        201
    } else {
        0
    }
}
/// Picks the two-character relational operator (`<=`, `>=`, `!=`, `==`) that
/// the lexeme `s` stands for, looking only at its first character.
///
/// # Panics
///
/// Panics when `s` is empty or starts with any other character, i.e. when it
/// is not the lexeme of such an operator.
fn relop_from_str(s: &str) -> RelOpType {
    match s.chars().next() {
        Some('<') => RelOpType::LessEq,
        Some('>') => RelOpType::GreaterEq,
        Some('!') => RelOpType::NotEqual,
        Some('=') => RelOpType::Equal,
        Some(_) => panic!("Invalid call to relop_from_str"),
        None => panic!("Invalid call to relop_from_str!"),
    }
}

41
src/main.rs Normal file
View file

@ -0,0 +1,41 @@
use std::{path::Path, io::{BufReader, Write}};
use cla::{Lexer, Token};
/// Command line entry point: `cla <file_name>.ou`.
///
/// Tokenizes the given `.ou` file and writes one `Token: ..., Lexeme: ...`
/// line per token into `<file_name>.tok` next to it. Prints a usage message
/// to stderr when the argument is missing, superfluous, does not end in
/// `.ou`, or does not name an existing path.
fn main() {
    eprintln!("Student: Aviv Romem");

    // exactly one argument (besides the program name) is expected
    let mut args = std::env::args().skip(1);
    let file_name = match (args.next(), args.next()) {
        (Some(name), None) => name,
        _ => {
            eprintln!("USAGE: cla <file_name>.ou");
            return;
        }
    };

    // the output file gets the same name with `.ou` replaced by `.tok`
    let stem = match file_name.strip_suffix(".ou") {
        Some(stem) if Path::new(&file_name).exists() => stem,
        _ => {
            eprintln!("USAGE: cla <file_name>.ou");
            return;
        }
    };
    let output_file = format!("{}.tok", stem);

    let file = std::fs::File::open(&file_name).expect("INVALID FILE");
    let mut buf_reader = BufReader::new(file); // the lexer reads byte by byte, so buffer the file
    let mut lexer = Lexer::new(&mut buf_reader);

    let output = std::fs::File::create(Path::new(&output_file)).expect("Cannot create output file!");
    // buffer the output as well, it is flushed once after the last token
    let mut output = std::io::BufWriter::new(output);

    // Loop over the tokens until we get Token::EOF, meaning no more tokens available
    loop {
        let (token, lexeme) = lexer.get_next_token();
        if token == Token::EOF {
            break;
        }
        // `write!` writes the whole line (a bare `write` may stop part-way);
        // the `fmt::Debug` derive is used to print the token
        write!(output, "Token: {:?}, Lexeme: {}\n", token, lexeme)
            .expect("Failed writing to output file!");
    }
    output.flush().expect("Failed writing to output file!");
}