Escaping now works in the DataTree tokenizer.

Also, each token now stores its starting byte offset in the source
text, for eventual use in parse error messages.
This commit is contained in:
Nathan Vegdahl 2016-03-06 14:10:17 -08:00
parent d769ccecee
commit 62389d42ae

View File

@ -51,27 +51,34 @@ fn parse_node(ti: &mut TokenIter) -> Option<Node> {
fn token_iter<'a>(text: &'a str) -> TokenIter<'a> { fn token_iter<'a>(text: &'a str) -> TokenIter<'a> {
TokenIter { TokenIter {
text: text, text: text,
bytes_consumed: 0,
after_open_leaf: false, after_open_leaf: false,
} }
} }
/// /////////////////////////////////////////////////////////////
// ================================================================
/// Tokens contain their starting byte offset in the original source
/// text. Some variants also contain a string slice of the relevant
/// text.
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
enum Token<'a> { enum Token<'a> {
TypeName(&'a str), TypeName((usize, &'a str)),
Ident(&'a str), Ident((usize, &'a str)),
OpenInner, OpenInner(usize),
CloseInner, CloseInner(usize),
OpenLeaf, OpenLeaf(usize),
CloseLeaf, CloseLeaf(usize),
LeafContents(&'a str), LeafContents((usize, &'a str)),
Unknown, Unknown(usize),
} }
struct TokenIter<'a> { struct TokenIter<'a> {
text: &'a str, text: &'a str,
bytes_consumed: usize,
after_open_leaf: bool, after_open_leaf: bool,
} }
@ -122,7 +129,7 @@ impl<'a> Iterator for TokenIter<'a> {
iter.next(); iter.next();
let i1 = i; let i1 = i;
let i2 = { let i2 = {
let mut i2 = 0; let mut i2 = i1;
while let Some(&(i, c)) = iter.peek() { while let Some(&(i, c)) = iter.peek() {
if is_ident_char(c) { if is_ident_char(c) {
iter.next(); iter.next();
@ -133,78 +140,85 @@ impl<'a> Iterator for TokenIter<'a> {
} }
i2 i2
}; };
token = Some(Token::TypeName(&self.text[i1..i2])); token = Some(Token::TypeName((self.bytes_consumed + i1, &self.text[i1..i2])));
} }
// Ident // Ident
// TODO: handle escaping
else if c == '$' { else if c == '$' {
iter.next(); iter.next();
let i1 = i; let i1 = i;
let i2 = { let i2 = {
let mut i2 = 0; let mut i2 = i1;
let mut escaped = false;
while let Some(&(i, c)) = iter.peek() { while let Some(&(i, c)) = iter.peek() {
if is_ident_char(c) { if escaped {
iter.next(); escaped = false;
} else { } else if c == '\\' {
escaped = true;
} else if !is_ident_char(c) {
i2 = i; i2 = i;
break; break;
} }
iter.next();
} }
i2 i2
}; };
token = Some(Token::Ident(&self.text[i1..i2])); token = Some(Token::Ident((self.bytes_consumed + i1, &self.text[i1..i2])));
} }
// Structural characters // Structural characters
else if is_reserved_char(c) { else if is_reserved_char(c) {
iter.next(); iter.next();
match c { match c {
'{' => { '{' => {
token = Some(Token::OpenInner); token = Some(Token::OpenInner(self.bytes_consumed + i));
} }
'}' => { '}' => {
token = Some(Token::CloseInner); token = Some(Token::CloseInner(self.bytes_consumed + i));
} }
'[' => { '[' => {
self.after_open_leaf = true; self.after_open_leaf = true;
token = Some(Token::OpenLeaf); token = Some(Token::OpenLeaf(self.bytes_consumed + i));
} }
']' => { ']' => {
token = Some(Token::CloseLeaf); token = Some(Token::CloseLeaf(self.bytes_consumed + i));
} }
_ => { _ => {
token = Some(Token::Unknown); token = Some(Token::Unknown(self.bytes_consumed + i));
} }
} }
} }
} }
} }
// Leaf contents // Leaf contents
// TODO: handle escaping
else if let Some(&(i, _)) = iter.peek() { else if let Some(&(i, _)) = iter.peek() {
self.after_open_leaf = false; self.after_open_leaf = false;
let i1 = i; let i1 = i;
let i2 = { let i2 = {
let mut i2 = 0; let mut i2 = i1;
let mut escaped = false;
while let Some(&(i, c)) = iter.peek() { while let Some(&(i, c)) = iter.peek() {
if c != ']' { if escaped {
iter.next(); escaped = false;
} else { } else if c == '\\' {
escaped = true;
} else if c == ']' {
i2 = i; i2 = i;
break; break;
} }
iter.next();
} }
i2 i2
}; };
token = Some(Token::LeafContents(&self.text[i1..i2])); token = Some(Token::LeafContents((self.bytes_consumed + i1, &self.text[i1..i2])));
} }
// Finish up // Finish up
match iter.peek() { match iter.peek() {
Some(&(i, _)) => { Some(&(i, _)) => {
self.bytes_consumed += i;
self.text = &self.text[i..]; self.text = &self.text[i..];
} }
@ -218,7 +232,8 @@ impl<'a> Iterator for TokenIter<'a> {
/// /////////////////////////////////////////////////////////////
// ================================================================
/// Returns whether the given unicode character is whitespace or not. /// Returns whether the given unicode character is whitespace or not.
fn is_ws_char(c: char) -> bool { fn is_ws_char(c: char) -> bool {
@ -265,6 +280,9 @@ fn is_ident_char(c: char) -> bool {
// ================================================================
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::{token_iter, Token}; use super::{token_iter, Token};
@ -272,21 +290,47 @@ mod tests {
#[test] #[test]
fn token_iter_1() { fn token_iter_1() {
let s = r#" let s = r#"
# This is a comment and should be skipped # This is a comment and should be skipped
MyThing $ident { # This is another comment MyThing $ident { # This is another comment
MyProp [Some content] MyProp [Some content]
} }
"#; "#;
let mut ti = token_iter(s); let mut ti = token_iter(s);
assert_eq!(ti.next(), Some(Token::TypeName("MyThing"))); assert_eq!(ti.next(), Some(Token::TypeName((67, "MyThing"))));
assert_eq!(ti.next(), Some(Token::Ident("$ident"))); assert_eq!(ti.next(), Some(Token::Ident((75, "$ident"))));
assert_eq!(ti.next(), Some(Token::OpenInner)); assert_eq!(ti.next(), Some(Token::OpenInner(82)));
assert_eq!(ti.next(), Some(Token::TypeName("MyProp"))); assert_eq!(ti.next(), Some(Token::TypeName((126, "MyProp"))));
assert_eq!(ti.next(), Some(Token::OpenLeaf)); assert_eq!(ti.next(), Some(Token::OpenLeaf(133)));
assert_eq!(ti.next(), Some(Token::LeafContents("Some content"))); assert_eq!(ti.next(), Some(Token::LeafContents((134, "Some content"))));
assert_eq!(ti.next(), Some(Token::CloseLeaf)); assert_eq!(ti.next(), Some(Token::CloseLeaf(146)));
assert_eq!(ti.next(), Some(Token::CloseInner)); assert_eq!(ti.next(), Some(Token::CloseInner(160)));
assert_eq!(ti.next(), None);
}
#[test]
fn token_iter_2() {
let s = r#"MyProp [Some content\] with \escaped \\characters]"#;
let mut ti = token_iter(s);
assert_eq!(ti.next(), Some(Token::TypeName((0, "MyProp"))));
assert_eq!(ti.next(), Some(Token::OpenLeaf(7)));
assert_eq!(ti.next(),
Some(Token::LeafContents((8, r#"Some content\] with \escaped \\characters"#))));
assert_eq!(ti.next(), Some(Token::CloseLeaf(49)));
assert_eq!(ti.next(), None);
}
#[test]
fn token_iter_3() {
let s = r#"MyThing $\ an\ ident\$\ with\\\{\[\ \#escaped\ content {}"#;
let mut ti = token_iter(s);
assert_eq!(ti.next(), Some(Token::TypeName((0, "MyThing"))));
assert_eq!(ti.next(),
Some(Token::Ident((8, r#"$\ an\ ident\$\ with\\\{\[\ \#escaped\ content"#))));
assert_eq!(ti.next(), Some(Token::OpenInner(55)));
assert_eq!(ti.next(), Some(Token::CloseInner(56)));
assert_eq!(ti.next(), None); assert_eq!(ti.next(), None);
} }
} }