first commit

2023-12-30 14:01:19 +02:00 · 2023-12-30 14:01:19 +02:00 · cadefc09f8
commit cadefc09f8
8 changed files with 488 additions and 0 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "cla"
+version = "0.1.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,8 @@
+[package]
+name = "cla"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
--- a/BIN
+++ b/BIN
--- a/example.ou
+++ b/example.ou
@ -0,0 +1,20 @@
+int hello;
+if(a == 1.56) {
+	b = 5 + 20.1;
+	static_cast<int>(1.2);
+	a >= b
+	1.
+	$
+}
+
+break case default else float if input int output switch while
+(){},:;=
+==!=<><=>=
+-
+|| && ! static_cast<float> static_cast<int>
+someID123
+123 123.32
+/* plus some
+multi line comments */
+
+static_cast<floa>
--- a/example.tok
+++ b/example.tok
@ -0,0 +1,67 @@
+Token: Int, Lexeme: int
+Token: Id, Lexeme: hello
+Token: SemiColon, Lexeme: ;
+Token: If, Lexeme: if
+Token: BracketOpen, Lexeme: (
+Token: Id, Lexeme: a
+Token: RelOp(Equal), Lexeme: ==
+Token: Num, Lexeme: 1.56
+Token: BracketClose, Lexeme: )
+Token: CBracketOpen, Lexeme: {
+Token: Id, Lexeme: b
+Token: Equal, Lexeme: =
+Token: Num, Lexeme: 5
+Token: AddOp(Add), Lexeme: +
+Token: Num, Lexeme: 20.1
+Token: SemiColon, Lexeme: ;
+Token: Cast(Int), Lexeme: static_cast<int>
+Token: BracketOpen, Lexeme: (
+Token: Num, Lexeme: 1.2
+Token: BracketClose, Lexeme: )
+Token: SemiColon, Lexeme: ;
+Token: Id, Lexeme: a
+Token: RelOp(GreaterEq), Lexeme: >=
+Token: Id, Lexeme: b
+Token: Num, Lexeme: 1
+Token: CBracketClose, Lexeme: }
+Token: Break, Lexeme: break
+Token: Case, Lexeme: case
+Token: Default, Lexeme: default
+Token: Else, Lexeme: else
+Token: Float, Lexeme: float
+Token: If, Lexeme: if
+Token: Input, Lexeme: input
+Token: Int, Lexeme: int
+Token: Output, Lexeme: output
+Token: Switch, Lexeme: switch
+Token: While, Lexeme: while
+Token: BracketOpen, Lexeme: (
+Token: BracketClose, Lexeme: )
+Token: CBracketOpen, Lexeme: {
+Token: CBracketClose, Lexeme: }
+Token: Comma, Lexeme: ,
+Token: Colon, Lexeme: :
+Token: SemiColon, Lexeme: ;
+Token: Equal, Lexeme: =
+Token: RelOp(Equal), Lexeme: ==
+Token: RelOp(NotEqual), Lexeme: !=
+Token: RelOp(Less), Lexeme: <
+Token: RelOp(Greater), Lexeme: >
+Token: RelOp(LessEq), Lexeme: <=
+Token: RelOp(GreaterEq), Lexeme: >=
+Token: AddOp(Add), Lexeme: +
+Token: AddOp(Subtract), Lexeme: -
+Token: Or, Lexeme: ||
+Token: And, Lexeme: &&
+Token: Not, Lexeme: !
+Token: Cast(Float), Lexeme: static_cast<float>
+Token: Cast(Int), Lexeme: static_cast<int>
+Token: Id, Lexeme: someID123
+Token: Num, Lexeme: 123
+Token: Num, Lexeme: 123.32
+Token: MulOp(Divide), Lexeme: /
+Token: Id, Lexeme: static
+Token: Id, Lexeme: cast
+Token: RelOp(Less), Lexeme: <
+Token: Id, Lexeme: floa
+Token: RelOp(Greater), Lexeme: >
--- a/src/dfa.rs
+++ b/src/dfa.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@ -0,0 +1,345 @@
+use std::collections::VecDeque;
+
+/// Every kind of token the lexer can produce.
+///
+/// The `Debug` representation of a variant is what the driver writes to the
+/// `.tok` output, so variant names are part of the output format.
+#[derive(Debug, PartialEq)]
+pub enum Token {
+    // --- keywords ---
+    Break,
+    Case,
+    Default,
+    Else,
+    Float,
+    If,
+    Input,
+    Int,
+    Output,
+    Switch,
+    While,
+
+    // --- symbols ---
+    /// `(`
+    BracketOpen,
+    /// `)`
+    BracketClose,
+    /// `{`
+    CBracketOpen,
+    /// `}`
+    CBracketClose,
+    /// `,`
+    Comma,
+    /// `:`
+    Colon,
+    /// `;`
+    SemiColon,
+    /// `=` (a single equals sign; `==` is a `RelOp`)
+    Equal,
+
+    // --- operators ---
+    /// Relational operator; the payload says which one.
+    RelOp(RelOpType),
+    /// Additive operator; the payload says which one.
+    AddOp(AddOpType),
+    /// Multiplicative operator; the payload says which one.
+    MulOp(MulOpType),
+    /// `||`
+    Or,
+    /// `&&`
+    And,
+    /// `!`
+    Not,
+    /// `static_cast<int>` / `static_cast<float>`; the payload is the target type.
+    Cast(CastType),
+
+    // --- additional ---
+    /// Identifier.
+    Id,
+    /// Numeric literal (with or without a fractional part).
+    Num,
+    /// End of input. A read error is reported the same way, as both are
+    /// currently handled identically.
+    EOF,
+}
+/// The relational operator carried by `Token::RelOp`.
+///
+/// Fieldless, so it is cheap to copy and safe to compare for full equality.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum RelOpType {
+    /// `==`
+    Equal,
+    /// `!=`
+    NotEqual,
+    /// `<`
+    Less,
+    /// `<=`
+    LessEq,
+    /// `>`
+    Greater,
+    /// `>=`
+    GreaterEq,
+}
+/// The additive operator carried by `Token::AddOp`.
+///
+/// Fieldless, so it is cheap to copy and safe to compare for full equality.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum AddOpType {
+    /// `+`
+    Add,
+    /// `-`
+    Subtract,
+}
+/// The multiplicative operator carried by `Token::MulOp`.
+///
+/// Fieldless, so it is cheap to copy and safe to compare for full equality.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum MulOpType {
+    /// `*`
+    Multiply,
+    /// `/`
+    Divide,
+}
+/// The target type of a `static_cast<...>` token, carried by `Token::Cast`.
+///
+/// Fieldless, so it is cheap to copy and safe to compare for full equality.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CastType {
+    /// `static_cast<int>`
+    Int,
+    /// `static_cast<float>`
+    Float,
+}
+/// A DFA-driven lexer that pulls characters from a reader one at a time.
+pub struct Lexer<'a> {
+    // --- input ---
+    /// Source of characters (usually a file, but any `Read` works, e.g. a TCP socket).
+    reader: &'a mut dyn std::io::Read,
+    /// Characters that were already read but pushed back; they are consumed
+    /// before anything new is read from `reader`.
+    last_char: VecDeque<char>,
+    /// Characters consumed since the last accepting state; it is not yet known
+    /// whether they belong to the current lexeme.
+    last_char_read: VecDeque<char>,
+
+    // --- automaton ---
+    /// Current DFA state (states are plain numbers).
+    state: u32,
+    /// Most recent accepting state reached for the current token (0 = none yet).
+    last_accepting: u32,
+    /// The lexeme accepted so far.
+    lexeme: String,
+
+    // --- diagnostics ---
+    /// Current line number, used in error messages.
+    line_num: usize,
+}
+impl<'a> Lexer<'a> {
+    /// Creates a lexer that reads its input from `reader`.
+    ///
+    /// The automaton starts in state 0 and line counting starts at 1.
+    pub fn new(reader: &'a mut dyn std::io::Read) -> Lexer<'a> {
+        Lexer {
+            state: 0,
+            last_accepting: 0,
+            lexeme: String::new(),
+            last_char: VecDeque::with_capacity(2),
+            last_char_read: VecDeque::with_capacity(2),
+            reader,
+            line_num: 1, // line numbers are 1-based
+        }
+    }
+
+    /// Reads from the reader until a token is matched and returns it with its lexeme.
+    ///
+    /// Whitespace and `/* ... */` comments are skipped without producing a token.
+    /// Characters that cannot start or continue any token are reported on stderr
+    /// and skipped. `Token::EOF` (with an empty lexeme) is returned once the input
+    /// is exhausted or cannot be read.
+    pub fn get_next_token(&mut self) -> (Token, String) {
+        // reset the automaton before searching for the next token
+        self.last_accepting = 0;
+        self.state = 0;
+        loop {
+            let c = self.next_char();
+            self.state = self.next_state(c);
+            if self.state == 0 {
+                if self.last_accepting != 0 {
+                    // The automaton got stuck after accepting something: hand back `c`
+                    // and every tentatively consumed character (in their original
+                    // order) so the next call re-scans them, then emit the token.
+                    self.last_char.push_front(c);
+                    while let Some(lc) = self.last_char_read.pop_back() {
+                        self.last_char.push_front(lc);
+                    }
+                    return self.accept_token();
+                }
+                if let Some(bad) = self.last_char_read.pop_front() {
+                    // A prefix such as a lone `&` or `|` never reached an accepting
+                    // state. Report and drop its first character only; everything
+                    // after it (including `c`) is pushed back and scanned again, so
+                    // no input is swallowed and nothing stale leaks into the next lexeme.
+                    eprintln!("Unexpected character at line {}: {}", self.line_num, bad);
+                    self.last_char.push_front(c);
+                    while let Some(lc) = self.last_char_read.pop_back() {
+                        self.last_char.push_front(lc);
+                    }
+                    // `c` will be read again, so it must not be counted as a newline yet
+                    continue;
+                }
+                // otherwise `c` is whitespace skipped in state 0: nothing to do
+            } else if c == '\0' {
+                // end of input (or read error) while no token is pending
+                return (Token::EOF, String::new());
+            } else if self.state == u32::MAX {
+                // state 0 rejected the character: report it and start over
+                eprintln!("Unexpected character at line {}: {}", self.line_num, c);
+                self.state = 0;
+                self.lexeme.clear();
+            } else if self.is_accepting() {
+                // Remember this state; everything consumed so far, including the
+                // tentatively buffered characters, now belongs to the lexeme.
+                self.last_accepting = self.state;
+                self.lexeme.extend(self.last_char_read.drain(..));
+                self.lexeme.push(c);
+            } else if (400..=402).contains(&self.state) {
+                // Inside a comment. The opening `/` was tentatively accepted as a
+                // divide operator; forget it so the comment yields no token.
+                self.last_accepting = 0;
+                self.lexeme.clear();
+                if self.state == 402 {
+                    // closing `*/` seen: resume normal scanning with the next character
+                    self.state = 0;
+                }
+            } else {
+                // Not accepting: keep the character aside, since it is not yet known
+                // whether it will become part of the current lexeme.
+                self.last_char_read.push_back(c);
+            }
+            if c == '\n' {
+                self.line_num += 1;
+            }
+        }
+    }
+
+    /// Returns the state reached from the current state on input `c`.
+    ///
+    /// `0` means the automaton got stuck and must restart; `u32::MAX` is returned
+    /// only from state 0 and marks a character that cannot start any token.
+    fn next_state(&self, c: char) -> u32 {
+        match self.state {
+            0 => match c {
+                'b' => 1,
+                'c' => 6,
+                'd' => 10,
+                'e' => 17,
+                'f' => 21,
+                'i' => 26,
+                'o' => 33,
+                's' => 39,
+                'w' => 45,
+                '>' => 100,
+                '<' => 101,
+                '=' => 102,
+                '!' => 104,
+                '(' => 105,
+                ')' => 106,
+                '{' => 107,
+                '}' => 108,
+                ',' => 109,
+                ':' => 110,
+                ';' => 111,
+                '+' => 112,
+                '-' => 113,
+                '&' => 114,
+                '*' => 116,
+                '/' => 117,
+                '|' => 118,
+                // ASCII digits only: input bytes are mapped 1:1 to chars, and
+                // `is_numeric` would also accept characters such as '²'
+                _ if c.is_ascii_digit() => 300,
+                _ if c.is_ascii_alphabetic() => 201,
+                _ if c.is_whitespace() => 0,
+                _ => u32::MAX,
+            },
+            // b-r-e-a-k
+            1 => letter_or_201(c, 'r', 2),
+            2 => letter_or_201(c, 'e', 3),
+            3 => letter_or_201(c, 'a', 4),
+            4 => letter_or_201(c, 'k', 5),
+            // c-a-s-e
+            6 => letter_or_201(c, 'a', 7),
+            7 => letter_or_201(c, 's', 8),
+            8 => letter_or_201(c, 'e', 9),
+            // d-e-f-a-u-l-t
+            10 => letter_or_201(c, 'e', 11),
+            11 => letter_or_201(c, 'f', 12),
+            12 => letter_or_201(c, 'a', 13),
+            13 => letter_or_201(c, 'u', 14),
+            14 => letter_or_201(c, 'l', 15),
+            15 => letter_or_201(c, 't', 16),
+            // e-l-s-e
+            17 => letter_or_201(c, 'l', 18),
+            18 => letter_or_201(c, 's', 19),
+            19 => letter_or_201(c, 'e', 20),
+            // f-l-o-a-t
+            21 => letter_or_201(c, 'l', 22),
+            22 => letter_or_201(c, 'o', 23),
+            23 => letter_or_201(c, 'a', 24),
+            24 => letter_or_201(c, 't', 25),
+            // i, then either `if` or `in...`
+            26 => match c {
+                'f' => 27,
+                'n' => 28,
+                _ if c.is_ascii_alphanumeric() => 201,
+                _ => 0,
+            },
+            // in, then either `int` or `inp...`
+            28 => match c {
+                't' => 29,
+                'p' => 30,
+                _ if c.is_ascii_alphanumeric() => 201,
+                _ => 0,
+            },
+            // inp-u-t
+            30 => letter_or_201(c, 'u', 31),
+            31 => letter_or_201(c, 't', 32),
+            // o-u-t-p-u-t
+            33 => letter_or_201(c, 'u', 34),
+            34 => letter_or_201(c, 't', 35),
+            35 => letter_or_201(c, 'p', 36),
+            36 => letter_or_201(c, 'u', 37),
+            37 => letter_or_201(c, 't', 38),
+            // s, then either `sw...` or `st...`; any other letter or digit continues
+            // an identifier, and anything else ends the identifier `s`
+            39 => match c {
+                'w' => 40,
+                't' => 50,
+                _ if c.is_ascii_alphanumeric() => 201,
+                _ => 0,
+            },
+            // sw-i-t-c-h
+            40 => letter_or_201(c, 'i', 41),
+            41 => letter_or_201(c, 't', 42),
+            42 => letter_or_201(c, 'c', 43),
+            43 => letter_or_201(c, 'h', 44),
+            // w-h-i-l-e
+            45 => letter_or_201(c, 'h', 46),
+            46 => letter_or_201(c, 'i', 47),
+            47 => letter_or_201(c, 'l', 48),
+            48 => letter_or_201(c, 'e', 49),
+            // st-a-t-i-c-_
+            50 => letter_or_201(c, 'a', 51),
+            51 => letter_or_201(c, 't', 52),
+            52 => letter_or_201(c, 'i', 53),
+            53 => letter_or_201(c, 'c', 54),
+            54 => letter_or_201(c, '_', 55),
+            // static_ c-a-s-t-< : from here on only the exact spelling is allowed
+            55 => if c == 'c' { 56 } else { 0 },
+            56 => if c == 'a' { 57 } else { 0 },
+            57 => if c == 's' { 58 } else { 0 },
+            58 => if c == 't' { 59 } else { 0 },
+            59 => if c == '<' { 60 } else { 0 },
+            // static_cast< then either `int` or `float`
+            60 => match c {
+                'i' => 61,
+                'f' => 63,
+                _ => 0,
+            },
+            61 => if c == 'n' { 62 } else { 0 },
+            62 => if c == 't' { 67 } else { 0 },
+            63 => if c == 'l' { 64 } else { 0 },
+            64 => if c == 'o' { 65 } else { 0 },
+            65 => if c == 'a' { 66 } else { 0 },
+            66 => if c == 't' { 67 } else { 0 },
+            67 => if c == '>' { 68 } else { 0 },
+            // a complete keyword or an identifier: more letters/digits make it
+            // (or keep it) an identifier
+            5 | 9 | 16 | 20 | 25 | 27 | 29 | 32 | 38 | 44 | 49 | 201 => {
+                if c.is_ascii_alphanumeric() { 201 } else { 0 }
+            }
+            // `>`, `<`, `=`, `!` optionally followed by `=`
+            100 | 101 | 102 | 104 => if c == '=' { 103 } else { 0 },
+            114 => if c == '&' { 115 } else { 0 },
+            117 => if c == '*' { 400 } else { 0 },
+            118 => if c == '|' { 119 } else { 0 },
+            // integer part, optionally followed by `.` and a fraction
+            300 => match c {
+                '.' => 301,
+                _ if c.is_ascii_digit() => 300,
+                _ => 0,
+            },
+            301 | 302 => if c.is_ascii_digit() { 302 } else { 0 },
+            // comment body
+            400 => if c == '*' { 401 } else { 400 },
+            // after a `*` inside a comment: another `*` may still be followed by
+            // `/`, so stay here instead of falling back to the plain body state
+            401 => match c {
+                '/' => 402,
+                '*' => 401,
+                _ => 400,
+            },
+            _ => 0,
+        }
+    }
+
+    /// Whether the current state is an accepting one.
+    ///
+    /// Non-accepting states are: the start state, a lone `&` or `|`, a number
+    /// ending in `.`, the `static_cast<...` spelling states and the comment states.
+    fn is_accepting(&self) -> bool {
+        !matches!(self.state, 0 | 114 | 118 | 301 | 55..=67 | 400..=403)
+    }
+
+    /// Turns the last accepting state into a token and hands out the accepted lexeme.
+    ///
+    /// The stored lexeme is replaced by an empty one. If called while
+    /// `self.last_accepting` is 0 the result is meaningless (`Token::Id`).
+    fn accept_token(&mut self) -> (Token, String) {
+        let lexeme = std::mem::take(&mut self.lexeme);
+        let token = match self.last_accepting {
+            5 => Token::Break,
+            9 => Token::Case,
+            16 => Token::Default,
+            20 => Token::Else,
+            25 => Token::Float,
+            27 => Token::If,
+            29 => Token::Int,
+            32 => Token::Input,
+            38 => Token::Output,
+            44 => Token::Switch,
+            49 => Token::While,
+            68 => Token::Cast(if lexeme.ends_with("int>") { CastType::Int } else { CastType::Float }),
+            100 => Token::RelOp(RelOpType::Greater),
+            101 => Token::RelOp(RelOpType::Less),
+            102 => Token::Equal,
+            103 => Token::RelOp(relop_from_str(&lexeme)),
+            104 => Token::Not,
+            105 => Token::BracketOpen,
+            106 => Token::BracketClose,
+            107 => Token::CBracketOpen,
+            108 => Token::CBracketClose,
+            109 => Token::Comma,
+            110 => Token::Colon,
+            111 => Token::SemiColon,
+            112 => Token::AddOp(AddOpType::Add),
+            113 => Token::AddOp(AddOpType::Subtract),
+            115 => Token::And,
+            116 => Token::MulOp(MulOpType::Multiply),
+            117 => Token::MulOp(MulOpType::Divide),
+            119 => Token::Or,
+            300 | 302 => Token::Num,
+            // identifiers, including unfinished keyword prefixes such as `brea`
+            _ => Token::Id,
+        };
+        (token, lexeme)
+    }
+
+    /// Returns the next character to process.
+    ///
+    /// Pushed-back characters are served first; otherwise a single byte is read
+    /// from the reader and mapped to the character with the same code point.
+    /// `'\0'` is returned on end of input or on a read error.
+    fn next_char(&mut self) -> char {
+        if let Some(c) = self.last_char.pop_front() {
+            return c;
+        }
+        let mut b = [0u8; 1];
+        loop {
+            match self.reader.read(&mut b) {
+                Ok(1) => return char::from(b[0]),
+                // an interrupted read is not an error and should simply be retried
+                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
+                // end of input or a real error: both are signalled as '\0'
+                _ => return '\0',
+            }
+        }
+    }
+}
+
+/// Transition helper for the keyword states.
+///
+/// Returns `ok` when `c` is the expected character `w`; otherwise `201` when `c`
+/// is an ASCII letter or digit, and `0` for anything else.
+fn letter_or_201(c: char, w: char, ok: u32) -> u32 {
+    match c {
+        _ if c == w => ok,
+        _ if c.is_ascii_alphanumeric() => 201,
+        _ => 0,
+    }
+}
+
+/// Maps the lexeme of a two-character relational operator to its `RelOpType`.
+///
+/// Only the first character of `s` is inspected, so the result is correct
+/// provided `s` really is such a lexeme (`<=`, `>=`, `!=` or `==`).
+///
+/// # Panics
+///
+/// Panics if `s` is empty or starts with any other character.
+fn relop_from_str(s: &str) -> RelOpType {
+    let first = s.chars().next().expect("Invalid call to relop_from_str!");
+    if first == '<' {
+        RelOpType::LessEq
+    } else if first == '>' {
+        RelOpType::GreaterEq
+    } else if first == '!' {
+        RelOpType::NotEqual
+    } else if first == '=' {
+        RelOpType::Equal
+    } else {
+        panic!("Invalid call to relop_from_str")
+    }
+}
--- a/src/main.rs
+++ b/src/main.rs
@ -0,0 +1,41 @@
+use std::{path::Path, io::{BufReader, Write}};
+
+use cla::{Lexer, Token};
+
+/// Entry point: tokenizes `<file_name>.ou` and writes one line per token to
+/// `<file_name>.tok`.
+///
+/// Exactly one argument is expected; it must end in `.ou` and name an existing
+/// file, otherwise a usage message is printed to stderr.
+fn main() {
+    eprintln!("Student: Aviv Romem");
+
+    // Read the arguments once; anything but a single existing `.ou` path is a usage error.
+    let mut args = std::env::args().skip(1);
+    let file_name = match (args.next(), args.next()) {
+        (Some(name), None) if name.ends_with(".ou") && Path::new(&name).exists() => name,
+        _ => {
+            eprintln!("USAGE: cla <file_name>.ou");
+            return;
+        }
+    };
+
+    let file = std::fs::File::open(&file_name).expect("INVALID FILE");
+    let mut buf_reader = BufReader::new(file); // the lexer reads byte by byte, so buffer the file
+    let mut lexer = Lexer::new(&mut buf_reader);
+
+    // The output file has the same name with `.ou` replaced by `.tok`
+    // (the suffix is guaranteed by the check above).
+    let stem = file_name.strip_suffix(".ou").unwrap_or(&file_name);
+    let output_file = format!("{}.tok", stem);
+    let mut output = std::fs::File::create(Path::new(&output_file)).expect("Cannot create output file!");
+
+    // Loop over the tokens until we get Token::EOF, meaning no more tokens are available
+    loop {
+        let (token, lexeme) = lexer.get_next_token();
+        if token == Token::EOF {
+            break;
+        }
+        // the `fmt::Debug` derive gives a readable name for the token
+        let line = format!("Token: {:?}, Lexeme: {}\n", token, lexeme);
+        // `write_all` keeps writing until the whole line is out; a plain `write`
+        // may stop after a partial write and silently truncate the output
+        output.write_all(line.as_bytes()).expect("Failed writing to output file!");
+    }
+    output.flush().expect("Failed writing to output file!");
+}