// parser.rs
use crate::lexer::ScriptTokenType;

/// Tracks where the scanner's cursor sits in the source text.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct PositionState {
	/// Running offset from the start of the source.
	offset: usize,
	/// Current line number; a fresh state starts on line 1.
	line: usize,
	/// Distance advanced since the start of the current line.
	column: usize,
}

impl Default for PositionState {
	/// Start-of-input position: offset 0, line 1, column 0.
	fn default() -> Self {
		PositionState {
			line: 1,
			column: 0,
			offset: 0,
		}
	}
}

impl PositionState {
	/// Moves the cursor `amount` units forward on the current line.
	pub fn advance(&mut self, amount: usize) {
		let PositionState { offset, column, .. } = self;
		*offset += amount;
		*column += amount;
	}

	/// Steps over a line break: one unit of offset, next line, column reset to 0.
	pub fn new_line(&mut self) {
		let PositionState {
			offset,
			line,
			column,
		} = self;
		*offset += 1;
		*line += 1;
		*column = 0;
	}

	/// Offset from the start of the source.
	pub fn offset(&self) -> usize {
		let PositionState { offset, .. } = *self;
		offset
	}

	/// Current line number.
	pub fn line(&self) -> usize {
		let PositionState { line, .. } = *self;
		line
	}

	/// Column on the current line.
	pub fn column(&self) -> usize {
		let PositionState { column, .. } = *self;
		column
	}
}

/// Start and end indices of the lexeme currently being scanned.
///
/// The scanner uses the `lexeme_start..lexeme_current` range to slice the
/// lexeme text out of the source.
#[derive(Clone, Copy, PartialEq, Debug, Default)]
pub struct ScanningState {
	/// Index of the first unit of the lexeme.
	lexeme_start: usize,
	/// Index one past the last unit consumed so far.
	lexeme_current: usize,
}

impl ScanningState {
	/// Starts an empty lexeme at `idx`.
	pub fn new(idx: usize) -> Self {
		ScanningState {
			lexeme_start: idx,
			lexeme_current: idx,
		}
	}

	/// Starts a lexeme at `idx` that already covers `width` units.
	pub fn with_width(idx: usize, width: usize) -> Self {
		ScanningState {
			lexeme_start: idx,
			lexeme_current: idx + width,
		}
	}

	/// Drops up to `amount` units from the front of the lexeme.
	///
	/// The start never moves past the current end, so the range stays valid
	/// even when `amount` exceeds the lexeme length.
	pub fn skip_first_n(&mut self, amount: usize) {
		self.lexeme_start = self
			.lexeme_current
			.min(self.lexeme_start.saturating_add(amount));
	}

	/// Extends the lexeme by `amount` units.
	pub fn advance(&mut self, amount: usize) {
		self.lexeme_current += amount;
	}

	/// Index where the lexeme begins.
	pub fn start(&self) -> usize {
		self.lexeme_start
	}

	/// Index one past the last consumed unit.
	pub fn current(&self) -> usize {
		self.lexeme_current
	}
}

/// Scanner state: the source text plus cursor and current-lexeme bookkeeping.
#[derive(Clone, Copy, Debug)]
pub struct Scanner<'a> {
	/// Text being scanned.
	source: &'a str,
	/// Cursor position (offset, line, column).
	position: PositionState,
	/// Start and current indices of the lexeme in progress.
	scan_state: ScanningState,
}

#[derive(Clone, Debug, PartialEq)]
Louis's avatar
Louis committed
pub enum ScannerErrorKind {
	BadIdentifier,
Louis's avatar
Louis committed
	UnexpectedEof,
Louis's avatar
Louis committed
	UnexpectedToken { span: TokenSpan },
	InvalidLiteral { ltype: &'static str },
Louis's avatar
Louis committed
}

#[derive(Clone, Debug, PartialEq)]
Louis's avatar
Louis committed
pub struct ScannerError {
	pub kind: ScannerErrorKind,
	pub position: PositionState,
Louis's avatar
Louis committed
}

#[derive(Clone, Copy, Debug, PartialEq)]
Louis's avatar
Louis committed
pub struct TokenSpan {
Louis's avatar
Louis committed
	pub position: PositionState,
	pub length: usize,
}

Louis's avatar
Louis committed
impl<'a, 'b: 'a> From<&'b Scanner<'a>> for TokenSpan {
	fn from(value: &Scanner) -> TokenSpan {
Louis's avatar
Louis committed
		TokenSpan {
			position: value.position,
			length: value
				.scan_state
				.lexeme_current
				.saturating_sub(value.scan_state.lexeme_start),
		}
	}
}

Louis's avatar
Louis committed
fn gen_token_span(scanner: &Scanner) -> TokenSpan {
	TokenSpan {
		position: scanner.position,
		length: scanner
			.scan_state
			.lexeme_current
			.saturating_sub(scanner.scan_state.lexeme_start),
	}
}

Louis's avatar
Louis committed
#[derive(Clone, Debug)]
Louis's avatar
Louis committed
pub struct ScannerToken {
	pub location: TokenSpan,
	pub token: ScriptTokenType,
Louis's avatar
Louis committed
}

Louis's avatar
Louis committed
/// Outcome of scanning one token: the token on success, a `ScannerError` on failure.
pub type ScannerResult = Result<ScannerToken, ScannerError>;

/// Consumes the next character of `$target` if it equals `$val`.
///
/// Evaluates to `true` (and advances both the position and the lexeme state)
/// when the character at the cursor is `$val`; otherwise evaluates to `false`
/// and leaves the target untouched.
///
/// `position.offset` is treated as a byte offset into `source`, matching how
/// the source is sliced and how `is_finished` compares against `source.len()`.
/// Looking only at the remaining text also avoids re-walking the source from
/// its first character on every call.
macro_rules! next_match {
	($target: expr, $val: expr) => {{
		// Bind once so `$val` is evaluated a single time.
		let expected: char = $val;
		let upcoming = $target
			.source
			.get($target.position.offset..)
			.and_then(|rest| rest.chars().next());
		if !$target.is_finished() && upcoming == Some(expected) {
			let width = expected.len_utf8();
			$target.position.advance(width);
			$target.scan_state.advance(width);
			true
		} else {
			false
		}
	}};
}

impl<'a> Scanner<'a> {
Louis's avatar
Louis committed
	pub fn new<'b>(source: &'b str) -> Scanner<'b> {
Louis's avatar
Louis committed
		Scanner {
			source,
			position: PositionState::default(),
			scan_state: ScanningState::default(),
		}
	}

	pub fn is_finished(&self) -> bool {
		self.position.offset == self.source.len()
	}

Louis's avatar
Louis committed
	pub fn scan_token(&'a mut self) -> ScannerResult {
		self.scan_state = ScanningState::new(self.position.offset);

Louis's avatar
Louis committed
		if self.is_finished() {
Louis's avatar
Louis committed
			return Ok(ScannerToken {
Louis's avatar
Louis committed
				token: ScriptTokenType::Eof,
Louis's avatar
Louis committed
				location: gen_token_span(self),
			});
		}

		let next_char = self.next();
Louis's avatar
Louis committed

		match next_char {
			Some('=') => {
				if next_match!(self, '=') {
					Ok(self.tokenise(ScriptTokenType::EqualEqual))
				} else {
					Ok(self.tokenise(ScriptTokenType::Equal))
				}
			}
			Some('%') => Ok(self.tokenise(ScriptTokenType::Modulo)),
			Some('^') => Ok(self.tokenise(ScriptTokenType::Caret)),
			Some('!') => {
				if next_match!(self, '=') {
					Ok(self.tokenise(ScriptTokenType::BangEqual))
				} else {
					Ok(self.tokenise(ScriptTokenType::Bang))
				}
			}
			Some('(') => Ok(self.tokenise(ScriptTokenType::LeftParen)),
			Some(')') => Ok(self.tokenise(ScriptTokenType::RightParen)),
			Some('{') => Ok(self.tokenise(ScriptTokenType::LeftBrace)),
			Some('}') => Ok(self.tokenise(ScriptTokenType::RightBrace)),
			Some(',') => Ok(self.tokenise(ScriptTokenType::Comma)),
			Some('.') => Ok(self.tokenise(ScriptTokenType::Dot)),
			Some('-') => Ok(self.tokenise(ScriptTokenType::Minus)),
			Some('+') => Ok(self.tokenise(ScriptTokenType::Plus)),
			Some(';') => Ok(self.tokenise(ScriptTokenType::Semicolon)),
			Some('/') => Ok(self.tokenise(ScriptTokenType::Slash)),
			Some('*') => Ok(self.tokenise(ScriptTokenType::Asterisk)),
			Some('&') => {
				if next_match!(self, '&') {
					Ok(self.tokenise(ScriptTokenType::DoubleAmpersand))
				} else {
					Err(ScannerError {
						kind: ScannerErrorKind::UnexpectedToken {
							span: gen_token_span(self),
						},
						position: self.position,
					})
				}
			}
			Some('|') => {
				if next_match!(self, '|') {
					Ok(self.tokenise(ScriptTokenType::DoublePipe))
				} else {
					Err(ScannerError {
						kind: ScannerErrorKind::UnexpectedToken {
							span: gen_token_span(self),
						},
						position: self.position,
					})
				}
			}
Louis's avatar
Louis committed
			Some('<') => {
				if next_match!(self, '=') {
					Ok(self.tokenise(ScriptTokenType::LessEqual))
				} else {
					Ok(self.tokenise(ScriptTokenType::Less))
				}
			}
			Some('>') => {
				if next_match!(self, '=') {
					Ok(self.tokenise(ScriptTokenType::GreaterEqual))
				} else {
					Ok(self.tokenise(ScriptTokenType::Greater))
				}
			}
			Some('"') => {
				let val = self.capture_string()?;
				Ok(self.tokenise(val))
			}
			Some(other) => {
				if other.is_numeric() {
					let val = self.capture_number()?;
					Ok(self.tokenise(val))
				} else if other.is_alphabetic() {
					let val = self.capture_keyword_or_ident()?;
					Ok(self.tokenise(val))
				} else {
					Err(ScannerError {
						position: self.position,

						kind: ScannerErrorKind::UnexpectedToken {
							span: gen_token_span(self),
						},
					})
				}
			}
			None => Err(ScannerError {
				position: self.position,
				kind: ScannerErrorKind::UnexpectedEof,
			}),
		}
	}

	pub fn consume_ws(&mut self) {
		loop {
			match self.peek() {
				Some('\n') => self.increment_line(),
				Some('/') => {
					if self.peek_nth(1) == Some('/') {
						while !self.is_finished() && self.peek() != Some('\n') {
							self.increment_cursor();
						}
					}
				}
				Some(other) => {
					if other.is_whitespace() {
						self.increment_cursor()
					} else {
						break;
					}
				}
				None => break,
			}
		}
	}

	fn capture_string(&mut self) -> Result<ScriptTokenType, ScannerError> {
		loop {
			if self.is_finished() {
				return Err(ScannerError {
					position: self.position,

					kind: ScannerErrorKind::UnexpectedEof,
				});
			}

			match self.peek() {
				Some('\"') => {
					self.scan_state.skip_first_n(1); // Don't include first quote in capture
					let tok = ScriptTokenType::String(String::from(
						&self.source[self.scan_state.lexeme_start..self.scan_state.lexeme_current],
					));
					self.increment_cursor();
					return Ok(tok);
				}
				Some('\n') => self.increment_line(),
				Some(_) => self.increment_cursor(),
				_ => {}
			}
		}
	}

	fn capture_number(&mut self) -> Result<ScriptTokenType, ScannerError> {
		let mut can_have_dot = true;
		let mut can_have_underscore = true;

		loop {
			if self.is_finished() {
				break;
			}

			let char = self.peek().ok_or(ScannerError {
				position: self.position,
				kind: ScannerErrorKind::UnexpectedEof,
			})?;

			if char.is_numeric() {
				self.increment_cursor();
				can_have_underscore = true;
			} else if char == '_' && can_have_underscore {
				self.increment_cursor();
				can_have_underscore = false;
			} else if char == '.' && can_have_dot {
				self.increment_cursor();
				can_have_dot = false;
				can_have_underscore = false;
			} else {
				break;
			}
		}

		let result = if can_have_dot {
			// We haven't consumed a dot, it's an int
			(&self.source[self.scan_state.lexeme_start..self.scan_state.lexeme_current])
				.replace('_', "")
				.parse::<i64>()
				.map_err(|e| ScannerError {
					position: self.position,
					kind: ScannerErrorKind::InvalidLiteral { ltype: "integer" },
				})
				.map(ScriptTokenType::Integer)
Louis's avatar
Louis committed
		} else {
Louis's avatar
Louis committed
			// We have consumed a dot, it's a float
			(&self.source[self.scan_state.lexeme_start..self.scan_state.lexeme_current])
				.replace('_', "")
				.parse::<f64>()
				.map_err(|e| ScannerError {
					position: self.position,
					kind: ScannerErrorKind::InvalidLiteral { ltype: "float" },
				})
				.map(ScriptTokenType::Float)
		};

		self.increment_cursor();
		result
	}

	fn capture_keyword_or_ident(&mut self) -> Result<ScriptTokenType, ScannerError> {
		while self
			.peek()
			.map(|c| c.is_alphanumeric() || c == '_')
			.unwrap_or(false)
		{
			self.increment_cursor();
		}

		let val = self.peek_current_slice().ok_or(ScannerError {
			position: self.position,
			kind: ScannerErrorKind::BadIdentifier,
		})?;
		let bad_kw = || {
Louis's avatar
Louis committed
			Err(ScannerError {
Louis's avatar
Louis committed
				position: self.position,
				kind: ScannerErrorKind::BadIdentifier,
Louis's avatar
Louis committed
			})
Louis's avatar
Louis committed
		};
		let ident = || Ok(ScriptTokenType::Identifier(String::from(val)));

		eprintln!("Found ident {}", val);

		let mut val_chars = val.chars();
		match val_chars.next() {
			Some('a') => self.check_ident(1, val, "s", ScriptTokenType::Alias),
			Some('e') => match val_chars.next() {
				Some('l') => self.check_ident(2, val, "se", ScriptTokenType::Else),
				Some('x') => self.check_ident(2, val, "port", ScriptTokenType::Export),
				_ => ident(),
			},
			Some('f') => match val_chars.next() {
				Some('n') => self.check_ident(2, val, "", ScriptTokenType::Function),
				Some('r') => self.check_ident(2, val, "om", ScriptTokenType::From),
				Some('o') => self.check_ident(2, val, "r", ScriptTokenType::For),
				Some('i') => self.check_ident(2, val, "nally", ScriptTokenType::Finally),
				_ => ident(),
			},

			Some('i') => match val_chars.next() {
				Some('m') => self.check_ident(2, val, "port", ScriptTokenType::Import),
				Some('f') => self.check_ident(2, val, "", ScriptTokenType::If),
				_ => ident(),
			},
			Some('l') => self.check_ident(1, val, "et", ScriptTokenType::Let),
			Some('n') => self.check_ident(1, val, "ull", ScriptTokenType::Null),
			Some('p') => self.check_ident(1, val, "rint", ScriptTokenType::Print),
			Some('r') => self.check_ident(1, val, "eturn", ScriptTokenType::Return),
			Some('s') => match val_chars.next() {
				Some('t') => self.check_ident(2, val, "ruct", ScriptTokenType::Class),
				Some('u') => self.check_ident(2, val, "per", ScriptTokenType::Super),
				_ => ident(),
			},
			Some('t') => match val_chars.next() {
				Some('h') => self.check_ident(2, val, "is", ScriptTokenType::This),
				Some('y') => self.check_ident(2, val, "peof", ScriptTokenType::Typeof),
				_ => ident(),
			},
			Some('w') => self.check_ident(1, val, "hile", ScriptTokenType::While),
			Some(_) => ident(),
			None => bad_kw(),
		}
	}

	fn check_ident(
		&self,
		start: usize,
		val: &str,
		expected: &str,
		success: ScriptTokenType,
	) -> Result<ScriptTokenType, ScannerError> {
		let sub = val.get(start..).ok_or(ScannerError {
			position: self.position,
			kind: ScannerErrorKind::BadIdentifier,
		})?;

		if sub == expected {
			Ok(success)
		} else {
			Ok(ScriptTokenType::Identifier(String::from(val)))
		}
	}

	#[inline(always)]
	fn peek(&self) -> Option<char> {
		self.peek_nth(0)
	}

	fn peek_nth(&self, n: usize) -> Option<char> {
		self.source.chars().nth(self.position.offset + n)
	}

	fn peek_current_slice(&self) -> Option<&str> {
		self.source
			.get(self.scan_state.lexeme_start..self.scan_state.lexeme_current)
	}

	fn increment_cursor(&mut self) {
		self.position.advance(1);
		self.scan_state.advance(1);
	}
	fn increment_line(&mut self) {
		self.position.new_line();
		self.scan_state.advance(1);
	}

	fn next_matches(&mut self, ch: char) -> bool {
		if self.is_finished() {
			false
		} else if self.source.chars().nth(self.position.offset) == Some(ch) {
			self.position.advance(1);
			self.scan_state.advance(1);
			true
		} else {
			false
		}
	}

	fn next(&mut self) -> Option<char> {
		let ch = self.source.chars().nth(self.position.offset);
		self.increment_cursor();
Louis's avatar
Louis committed
		ch
	}

	fn next_checked(&mut self) -> Result<char, ScannerError> {
		self.next().ok_or(ScannerError {
			position: self.position,
			kind: ScannerErrorKind::UnexpectedEof,
		})
	}

	fn tokenise(&self, inner: ScriptTokenType) -> ScannerToken {
		ScannerToken {
			token: inner,
			location: gen_token_span(self),
Louis's avatar
Louis committed
		}
	}
}