use crate::lexer::ScriptTokenType; #[derive(Clone, Copy, PartialEq, Debug)] pub struct PositionState { offset: usize, line: usize, column: usize, } impl Default for PositionState { fn default() -> Self { Self { offset: 0, column: 0, line: 1, } } } impl PositionState { pub fn advance(&mut self, amount: usize) { self.offset += amount; self.column += amount; } pub fn new_line(&mut self) { self.offset += 1; self.line += 1; self.column = 0; } pub fn offset(&self) -> usize { self.offset } pub fn line(&self) -> usize { self.line } pub fn column(&self) -> usize { self.column } } #[derive(Clone, Copy, PartialEq, Debug, Default)] pub struct ScanningState { lexeme_start: usize, lexeme_current: usize, } impl ScanningState { pub fn new(idx: usize) -> Self { ScanningState { lexeme_start: idx, lexeme_current: idx, } } pub fn with_width(idx: usize, width: usize) -> Self { ScanningState { lexeme_start: idx, lexeme_current: idx + width, } } pub fn skip_first_n(&mut self, amount: usize) { self.lexeme_start = self .lexeme_current .min(self.lexeme_start.saturating_add(amount)); } pub fn advance(&mut self, amount: usize) { self.lexeme_current += amount; } pub fn start(&self) -> usize { self.lexeme_start } pub fn current(&self) -> usize { self.lexeme_current } } #[derive(Clone, Copy, Debug)] pub struct Scanner<'a> { source: &'a str, position: PositionState, scan_state: ScanningState, } #[derive(Clone, Debug, PartialEq)] pub enum ScannerErrorKind { BadIdentifier, UnexpectedEof, UnexpectedToken { span: TokenSpan }, InvalidLiteral { ltype: &'static str }, } #[derive(Clone, Debug, PartialEq)] pub struct ScannerError { pub kind: ScannerErrorKind, pub position: PositionState, } #[derive(Clone, Copy, Debug, PartialEq)] pub struct TokenSpan { pub position: PositionState, pub length: usize, } impl<'a, 'b: 'a> From<&'b Scanner<'a>> for TokenSpan { fn from(value: &Scanner) -> TokenSpan { TokenSpan { position: value.position, length: value .scan_state .lexeme_current .saturating_sub(value.scan_state.lexeme_start), } } } fn gen_token_span(scanner: &Scanner) -> TokenSpan { TokenSpan { position: scanner.position, length: scanner .scan_state .lexeme_current .saturating_sub(scanner.scan_state.lexeme_start), } } #[derive(Clone, Debug)] pub struct ScannerToken { pub location: TokenSpan, pub token: ScriptTokenType, } pub type ScannerResult = Result<ScannerToken, ScannerError>; macro_rules! next_match { ($target: expr, $val: expr) => {{ if $target.is_finished() { false } else if $target.source.chars().nth($target.position.offset) == Some($val) { $target.position.advance(1); $target.scan_state.advance(1); true } else { false } }}; } impl<'a> Scanner<'a> { pub fn new<'b>(source: &'b str) -> Scanner<'b> { Scanner { source, position: PositionState::default(), scan_state: ScanningState::default(), } } pub fn is_finished(&self) -> bool { self.position.offset == self.source.len() } pub fn scan_token(&'a mut self) -> ScannerResult { self.scan_state = ScanningState::new(self.position.offset); if self.is_finished() { return Ok(ScannerToken { token: ScriptTokenType::Eof, location: gen_token_span(self), }); } let next_char = self.next(); match next_char { Some('=') => { if next_match!(self, '=') { Ok(self.tokenise(ScriptTokenType::EqualEqual)) } else { Ok(self.tokenise(ScriptTokenType::Equal)) } } Some('%') => Ok(self.tokenise(ScriptTokenType::Modulo)), Some('^') => Ok(self.tokenise(ScriptTokenType::Caret)), Some('!') => { if next_match!(self, '=') { Ok(self.tokenise(ScriptTokenType::BangEqual)) } else { Ok(self.tokenise(ScriptTokenType::Bang)) } } Some('(') => Ok(self.tokenise(ScriptTokenType::LeftParen)), Some(')') => Ok(self.tokenise(ScriptTokenType::RightParen)), Some('{') => Ok(self.tokenise(ScriptTokenType::LeftBrace)), Some('}') => Ok(self.tokenise(ScriptTokenType::RightBrace)), Some(',') => Ok(self.tokenise(ScriptTokenType::Comma)), Some('.') => Ok(self.tokenise(ScriptTokenType::Dot)), Some('-') => Ok(self.tokenise(ScriptTokenType::Minus)), Some('+') => Ok(self.tokenise(ScriptTokenType::Plus)), Some(';') => Ok(self.tokenise(ScriptTokenType::Semicolon)), Some('/') => Ok(self.tokenise(ScriptTokenType::Slash)), Some('*') => Ok(self.tokenise(ScriptTokenType::Asterisk)), Some('&') => { if next_match!(self, '&') { Ok(self.tokenise(ScriptTokenType::DoubleAmpersand)) } else { Err(ScannerError { kind: ScannerErrorKind::UnexpectedToken { span: gen_token_span(self), }, position: self.position, }) } } Some('|') => { if next_match!(self, '|') { Ok(self.tokenise(ScriptTokenType::DoublePipe)) } else { Err(ScannerError { kind: ScannerErrorKind::UnexpectedToken { span: gen_token_span(self), }, position: self.position, }) } } Some('<') => { if next_match!(self, '=') { Ok(self.tokenise(ScriptTokenType::LessEqual)) } else { Ok(self.tokenise(ScriptTokenType::Less)) } } Some('>') => { if next_match!(self, '=') { Ok(self.tokenise(ScriptTokenType::GreaterEqual)) } else { Ok(self.tokenise(ScriptTokenType::Greater)) } } Some('"') => { let val = self.capture_string()?; Ok(self.tokenise(val)) } Some(other) => { if other.is_numeric() { let val = self.capture_number()?; Ok(self.tokenise(val)) } else if other.is_alphabetic() { let val = self.capture_keyword_or_ident()?; Ok(self.tokenise(val)) } else { Err(ScannerError { position: self.position, kind: ScannerErrorKind::UnexpectedToken { span: gen_token_span(self), }, }) } } None => Err(ScannerError { position: self.position, kind: ScannerErrorKind::UnexpectedEof, }), } } pub fn consume_ws(&mut self) { loop { match self.peek() { Some('\n') => self.increment_line(), Some('/') => { if self.peek_nth(1) == Some('/') { while !self.is_finished() && self.peek() != Some('\n') { self.increment_cursor(); } } } Some(other) => { if other.is_whitespace() { self.increment_cursor() } else { break; } } None => break, } } } fn capture_string(&mut self) -> Result<ScriptTokenType, ScannerError> { loop { if self.is_finished() { return Err(ScannerError { position: self.position, kind: ScannerErrorKind::UnexpectedEof, }); } match self.peek() { Some('\"') => { self.scan_state.skip_first_n(1); // Don't include first quote in capture let tok = ScriptTokenType::String(String::from( &self.source[self.scan_state.lexeme_start..self.scan_state.lexeme_current], )); self.increment_cursor(); return Ok(tok); } Some('\n') => self.increment_line(), Some(_) => self.increment_cursor(), _ => {} } } } fn capture_number(&mut self) -> Result<ScriptTokenType, ScannerError> { let mut can_have_dot = true; let mut can_have_underscore = true; loop { if self.is_finished() { break; } let char = self.peek().ok_or(ScannerError { position: self.position, kind: ScannerErrorKind::UnexpectedEof, })?; if char.is_numeric() { self.increment_cursor(); can_have_underscore = true; } else if char == '_' && can_have_underscore { self.increment_cursor(); can_have_underscore = false; } else if char == '.' && can_have_dot { self.increment_cursor(); can_have_dot = false; can_have_underscore = false; } else { break; } } let result = if can_have_dot { // We haven't consumed a dot, it's an int (&self.source[self.scan_state.lexeme_start..self.scan_state.lexeme_current]) .replace('_', "") .parse::<i64>() .map_err(|e| ScannerError { position: self.position, kind: ScannerErrorKind::InvalidLiteral { ltype: "integer" }, }) .map(ScriptTokenType::Integer) } else { // We have consumed a dot, it's a float (&self.source[self.scan_state.lexeme_start..self.scan_state.lexeme_current]) .replace('_', "") .parse::<f64>() .map_err(|e| ScannerError { position: self.position, kind: ScannerErrorKind::InvalidLiteral { ltype: "float" }, }) .map(ScriptTokenType::Float) }; self.increment_cursor(); result } fn capture_keyword_or_ident(&mut self) -> Result<ScriptTokenType, ScannerError> { while self .peek() .map(|c| c.is_alphanumeric() || c == '_') .unwrap_or(false) { self.increment_cursor(); } let val = self.peek_current_slice().ok_or(ScannerError { position: self.position, kind: ScannerErrorKind::BadIdentifier, })?; let bad_kw = || { Err(ScannerError { position: self.position, kind: ScannerErrorKind::BadIdentifier, }) }; let ident = || Ok(ScriptTokenType::Identifier(String::from(val))); eprintln!("Found ident {}", val); let mut val_chars = val.chars(); match val_chars.next() { Some('a') => self.check_ident(1, val, "s", ScriptTokenType::Alias), Some('e') => match val_chars.next() { Some('l') => self.check_ident(2, val, "se", ScriptTokenType::Else), Some('x') => self.check_ident(2, val, "port", ScriptTokenType::Export), _ => ident(), }, Some('f') => match val_chars.next() { Some('n') => self.check_ident(2, val, "", ScriptTokenType::Function), Some('r') => self.check_ident(2, val, "om", ScriptTokenType::From), Some('o') => self.check_ident(2, val, "r", ScriptTokenType::For), Some('i') => self.check_ident(2, val, "nally", ScriptTokenType::Finally), _ => ident(), }, Some('i') => match val_chars.next() { Some('m') => self.check_ident(2, val, "port", ScriptTokenType::Import), Some('f') => self.check_ident(2, val, "", ScriptTokenType::If), _ => ident(), }, Some('l') => self.check_ident(1, val, "et", ScriptTokenType::Let), Some('n') => self.check_ident(1, val, "ull", ScriptTokenType::Null), Some('p') => self.check_ident(1, val, "rint", ScriptTokenType::Print), Some('r') => self.check_ident(1, val, "eturn", ScriptTokenType::Return), Some('s') => match val_chars.next() { Some('t') => self.check_ident(2, val, "ruct", ScriptTokenType::Class), Some('u') => self.check_ident(2, val, "per", ScriptTokenType::Super), _ => ident(), }, Some('t') => match val_chars.next() { Some('h') => self.check_ident(2, val, "is", ScriptTokenType::This), Some('y') => self.check_ident(2, val, "peof", ScriptTokenType::Typeof), _ => ident(), }, Some('w') => self.check_ident(1, val, "hile", ScriptTokenType::While), Some(_) => ident(), None => bad_kw(), } } fn check_ident( &self, start: usize, val: &str, expected: &str, success: ScriptTokenType, ) -> Result<ScriptTokenType, ScannerError> { let sub = val.get(start..).ok_or(ScannerError { position: self.position, kind: ScannerErrorKind::BadIdentifier, })?; if sub == expected { Ok(success) } else { Ok(ScriptTokenType::Identifier(String::from(val))) } } #[inline(always)] fn peek(&self) -> Option<char> { self.peek_nth(0) } fn peek_nth(&self, n: usize) -> Option<char> { self.source.chars().nth(self.position.offset + n) } fn peek_current_slice(&self) -> Option<&str> { self.source .get(self.scan_state.lexeme_start..self.scan_state.lexeme_current) } fn increment_cursor(&mut self) { self.position.advance(1); self.scan_state.advance(1); } fn increment_line(&mut self) { self.position.new_line(); self.scan_state.advance(1); } fn next_matches(&mut self, ch: char) -> bool { if self.is_finished() { false } else if self.source.chars().nth(self.position.offset) == Some(ch) { self.position.advance(1); self.scan_state.advance(1); true } else { false } } fn next(&mut self) -> Option<char> { let ch = self.source.chars().nth(self.position.offset); self.increment_cursor(); ch } fn next_checked(&mut self) -> Result<char, ScannerError> { self.next().ok_or(ScannerError { position: self.position, kind: ScannerErrorKind::UnexpectedEof, }) } fn tokenise(&self, inner: ScriptTokenType) -> ScannerToken { ScannerToken { token: inner, location: gen_token_span(self), } } }