Diffstat (limited to 'src/lang/tokenizer.rs')
-rw-r--r-- | src/lang/tokenizer.rs | 121 |
1 file changed, 112 insertions, 9 deletions
diff --git a/src/lang/tokenizer.rs b/src/lang/tokenizer.rs
index 74b304d..1d62f3e 100644
--- a/src/lang/tokenizer.rs
+++ b/src/lang/tokenizer.rs
@@ -1,6 +1,7 @@
 use std::collections::HashMap;
 use std::error::Error;
-use std::fmt;
+use std::{fmt, io, fs};
+use std::io::Read;
 use super::*;

 use unicode_xid::UnicodeXID;
@@ -13,6 +14,7 @@ pub struct Lexemes {
     esc_oct: char,
     com_outer: char,
     com_inner: char,
+    include_delim: char,
     escapes: HashMap<char, char>
 }

@@ -27,6 +29,7 @@ impl Default for Lexemes {
            esc_oct: 'o',
            com_outer: '/',
            com_inner: '*',
+           include_delim: '#',
            escapes: HashMap::new(),
        };

@@ -44,6 +47,7 @@
 pub enum Location {
    InString,
    InStringEscape,
+   InInclude,
 }

 #[derive(Debug)]
@@ -64,6 +68,8 @@ pub enum ErrorKind {
    BadEscapeValue(EscapeKind, String, Option<Box<Error>>),
    BadNumericLiteral(NumericKind, String, Option<Box<Error>>),
    UnknownChar(char),
+   IncludeError(io::Error),
+   TooManyRecursions(usize),
 }

 #[derive(Debug)]
@@ -83,6 +89,7 @@ impl ErrorType {
            ErrorKind::UnexpectedEOF(ref loc) => format!("Unexpected EOF {}", match *loc {
                Location::InString => "in string constant",
                Location::InStringEscape => "in string escape",
+               Location::InInclude => "in include",
            }),
            ErrorKind::BadEscapeValue(ref kind, ref val, ref err) => format!("Bad {} escape {}: {:?}", match *kind {
                EscapeKind::Hexadecimal => "hexadecimal",
@@ -93,6 +100,8 @@ impl ErrorType {
                NumericKind::Float => "floating point",
            }, val, err),
            ErrorKind::UnknownChar(c) => format!("Unknown character {}", c),
+           ErrorKind::IncludeError(ref e) => format!("Error including file: {:?}", e),
+           ErrorKind::TooManyRecursions(n) => format!("Include recursed too many times ({})", n),
        };

        ret
@@ -117,7 +126,7 @@ impl Error for ErrorType {
                Some(ref err) => Some(&**err),
                None => None,
            },
-           ErrorKind::UnexpectedEOF(_) | ErrorKind::UnknownChar(_) => None,
+           _ => None,
        }
    }
 }
@@ -134,16 +143,56 @@ fn char_in(s: &str, c: char) -> bool {
    s.chars().find(|&x| x == c).map_or(false, |_| true)
 }

+pub struct ResumableChars {
+    string: String,
+    pos: usize,
+}
+
+impl ResumableChars {
+    pub fn new(s: String) -> ResumableChars {
+        ResumableChars {
+            string: s,
+            pos: 0,
+        }
+    }
+}
+
+impl Iterator for ResumableChars {
+    type Item = char;
+
+    fn next(&mut self) -> Option<char> {
+        if self.pos >= self.string.len() {
+            None
+        } else {
+            let mut iter = self.string[self.pos..].char_indices();
+            match iter.next() {
+                Some((pos, ch)) => {
+                    self.pos += match iter.next() {
+                        Some((pos, _)) => pos,
+                        None => self.string.len(),
+                    };
+                    Some(ch)
+                },
+                None => None,
+            }
+        }
+    }
+}
+
 pub struct Tokenizer<T: Iterator<Item=char>> {
    reader: T,
+   reader_stack: Vec<ResumableChars>,
    pushback: Option<char>,
    lexemes: Lexemes,
 }

 impl<T: Iterator<Item=char>> Tokenizer<T> {
+    const MAX_INCLUDE_RECURSIONS: usize = 256;
+
    pub fn new(reader: T) -> Tokenizer<T> {
        Tokenizer {
            reader: reader,
+           reader_stack: Vec::new(),
            pushback: None,
            lexemes: Default::default(),
        }
@@ -159,23 +208,49 @@ impl<T: Iterator<Item=char>> Tokenizer<T> {
        }
    }

+    pub fn push_reader(&mut self, rc: ResumableChars) -> Result<(), ErrorType> {
+        if self.reader_stack.len() > Self::MAX_INCLUDE_RECURSIONS {
+            Err(ErrorType::new(ErrorKind::TooManyRecursions(self.reader_stack.len())))
+        } else {
+            self.reader_stack.push(rc);
+            Ok(())
+        }
+    }
+
    fn next_char(&mut self) -> Option<char> {
        match self.pushback {
            Some(c) => {
                self.pushback = None;
                Some(c)
            },
-           None => self.reader.next(),
+           None => {
+               let mut ret = None;
+               let mut produced_idx: usize = 0;
+               let len = self.reader_stack.len();
+
+               for (idx, rc) in self.reader_stack.iter_mut().enumerate().rev() {
+                   match rc.next() {
+                       Some(c) => {
+                           ret = Some(c);
+                           produced_idx = idx;
+                           break;
+                       },
+                       None => {},
+                   }
+               }
+
+               match ret {
+                   Some(c) => {
+                       self.reader_stack.truncate(produced_idx + 1);
+                       Some(c)
+                   },
+                   None => self.reader.next(),
+               }
+           },
        }
    }

    pub fn next_token(&mut self) -> Result<Token, ErrorType> {
-       let res = self._next_token();
-       eprintln!("next_token: {:?}", res);
-       res
-   }
-
-   fn _next_token(&mut self) -> Result<Token, ErrorType> {
        let mut c = self.next_char();
        if c == None {
            return Ok(Token::EOF);
@@ -216,6 +291,34 @@ impl<T: Iterator<Item=char>> Tokenizer<T> {
            }
        }

+       /* Inclusion */
+       if cc == self.lexemes.include_delim {
+           let mut buffer = String::new();
+
+           loop {
+               let nc = self.next_char();
+               if nc == None {
+                   return Err(ErrorType::new(ErrorKind::UnexpectedEOF(Location::InInclude)));
+               }
+               let ncc = nc.unwrap();
+
+               if ncc == self.lexemes.include_delim {
+                   break;
+               } else {
+                   buffer.push(ncc);
+               }
+           }
+
+           let mut f = match fs::File::open(buffer) {
+               Err(err) => return Err(ErrorType::new(ErrorKind::IncludeError(err))),
+               Ok(f) => f,
+           };
+           let mut contents = String::new();
+           f.read_to_string(&mut contents);
+           self.push_reader(ResumableChars::new(contents))?;
+           return self.next_token()
+       }
+
        /* Strings */
        if char_in(&self.lexemes.string_delim, cc) {
            let mut buffer = String::new();
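For orientation, here is a minimal usage sketch (not part of this commit) of how the include support added above could be driven. It assumes only what is visible in the diff -- Tokenizer::new, next_token, Token::EOF, and the default '#' include delimiter -- and the module path in the use line is a guess inferred from the file location src/lang/tokenizer.rs.

    // Hypothetical import path, inferred from src/lang/tokenizer.rs.
    use lang::tokenizer::{Token, Tokenizer};

    fn dump_tokens(source: &str) {
        // If `source` contains `#some/file#`, the tokenizer opens that file,
        // pushes its contents onto the reader stack, and keeps lexing from it.
        let mut tok = Tokenizer::new(source.chars());
        loop {
            match tok.next_token() {
                Ok(Token::EOF) => break,
                Ok(t) => println!("{:?}", t),
                Err(e) => {
                    eprintln!("lex error: {:?}", e);
                    break;
                }
            }
        }
    }

Nested includes work the same way: each `#...#` pushes another ResumableChars onto the reader stack, bounded by the MAX_INCLUDE_RECURSIONS limit of 256 introduced in this change.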