Escaping now works in the DataTree tokenizer.
Also, each token now stores its starting byte offset within the source text, for eventual use in parse error messages.
This commit is contained in:
parent
d769ccecee
commit
62389d42ae
124
src/datatree.rs
124
src/datatree.rs
|
@ -51,27 +51,34 @@ fn parse_node(ti: &mut TokenIter) -> Option<Node> {
|
||||||
fn token_iter<'a>(text: &'a str) -> TokenIter<'a> {
|
fn token_iter<'a>(text: &'a str) -> TokenIter<'a> {
|
||||||
TokenIter {
|
TokenIter {
|
||||||
text: text,
|
text: text,
|
||||||
|
bytes_consumed: 0,
|
||||||
after_open_leaf: false,
|
after_open_leaf: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// /////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
|
|
||||||
|
// ================================================================
|
||||||
|
|
||||||
|
/// Tokens contain their starting byte offset in the original source
|
||||||
|
/// text. Some variants also contain a string slice of the relevant
|
||||||
|
/// text.
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
enum Token<'a> {
|
enum Token<'a> {
|
||||||
TypeName(&'a str),
|
TypeName((usize, &'a str)),
|
||||||
Ident(&'a str),
|
Ident((usize, &'a str)),
|
||||||
OpenInner,
|
OpenInner(usize),
|
||||||
CloseInner,
|
CloseInner(usize),
|
||||||
OpenLeaf,
|
OpenLeaf(usize),
|
||||||
CloseLeaf,
|
CloseLeaf(usize),
|
||||||
LeafContents(&'a str),
|
LeafContents((usize, &'a str)),
|
||||||
Unknown,
|
Unknown(usize),
|
||||||
}
|
}
|
||||||
|
|
||||||
struct TokenIter<'a> {
|
struct TokenIter<'a> {
|
||||||
text: &'a str,
|
text: &'a str,
|
||||||
|
bytes_consumed: usize,
|
||||||
after_open_leaf: bool,
|
after_open_leaf: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -122,7 +129,7 @@ impl<'a> Iterator for TokenIter<'a> {
|
||||||
iter.next();
|
iter.next();
|
||||||
let i1 = i;
|
let i1 = i;
|
||||||
let i2 = {
|
let i2 = {
|
||||||
let mut i2 = 0;
|
let mut i2 = i1;
|
||||||
while let Some(&(i, c)) = iter.peek() {
|
while let Some(&(i, c)) = iter.peek() {
|
||||||
if is_ident_char(c) {
|
if is_ident_char(c) {
|
||||||
iter.next();
|
iter.next();
|
||||||
|
@ -133,78 +140,85 @@ impl<'a> Iterator for TokenIter<'a> {
|
||||||
}
|
}
|
||||||
i2
|
i2
|
||||||
};
|
};
|
||||||
token = Some(Token::TypeName(&self.text[i1..i2]));
|
token = Some(Token::TypeName((self.bytes_consumed + i1, &self.text[i1..i2])));
|
||||||
}
|
}
|
||||||
// Ident
|
// Ident
|
||||||
// TODO: handle escaping
|
|
||||||
else if c == '$' {
|
else if c == '$' {
|
||||||
iter.next();
|
iter.next();
|
||||||
let i1 = i;
|
let i1 = i;
|
||||||
let i2 = {
|
let i2 = {
|
||||||
let mut i2 = 0;
|
let mut i2 = i1;
|
||||||
|
let mut escaped = false;
|
||||||
while let Some(&(i, c)) = iter.peek() {
|
while let Some(&(i, c)) = iter.peek() {
|
||||||
if is_ident_char(c) {
|
if escaped {
|
||||||
iter.next();
|
escaped = false;
|
||||||
} else {
|
} else if c == '\\' {
|
||||||
|
escaped = true;
|
||||||
|
} else if !is_ident_char(c) {
|
||||||
i2 = i;
|
i2 = i;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
iter.next();
|
||||||
}
|
}
|
||||||
i2
|
i2
|
||||||
};
|
};
|
||||||
token = Some(Token::Ident(&self.text[i1..i2]));
|
token = Some(Token::Ident((self.bytes_consumed + i1, &self.text[i1..i2])));
|
||||||
}
|
}
|
||||||
// Structural characters
|
// Structural characters
|
||||||
else if is_reserved_char(c) {
|
else if is_reserved_char(c) {
|
||||||
iter.next();
|
iter.next();
|
||||||
match c {
|
match c {
|
||||||
'{' => {
|
'{' => {
|
||||||
token = Some(Token::OpenInner);
|
token = Some(Token::OpenInner(self.bytes_consumed + i));
|
||||||
}
|
}
|
||||||
|
|
||||||
'}' => {
|
'}' => {
|
||||||
token = Some(Token::CloseInner);
|
token = Some(Token::CloseInner(self.bytes_consumed + i));
|
||||||
}
|
}
|
||||||
|
|
||||||
'[' => {
|
'[' => {
|
||||||
self.after_open_leaf = true;
|
self.after_open_leaf = true;
|
||||||
token = Some(Token::OpenLeaf);
|
token = Some(Token::OpenLeaf(self.bytes_consumed + i));
|
||||||
}
|
}
|
||||||
|
|
||||||
']' => {
|
']' => {
|
||||||
token = Some(Token::CloseLeaf);
|
token = Some(Token::CloseLeaf(self.bytes_consumed + i));
|
||||||
}
|
}
|
||||||
|
|
||||||
_ => {
|
_ => {
|
||||||
token = Some(Token::Unknown);
|
token = Some(Token::Unknown(self.bytes_consumed + i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Leaf contents
|
// Leaf contents
|
||||||
// TODO: handle escaping
|
|
||||||
else if let Some(&(i, _)) = iter.peek() {
|
else if let Some(&(i, _)) = iter.peek() {
|
||||||
self.after_open_leaf = false;
|
self.after_open_leaf = false;
|
||||||
let i1 = i;
|
let i1 = i;
|
||||||
let i2 = {
|
let i2 = {
|
||||||
let mut i2 = 0;
|
let mut i2 = i1;
|
||||||
|
let mut escaped = false;
|
||||||
while let Some(&(i, c)) = iter.peek() {
|
while let Some(&(i, c)) = iter.peek() {
|
||||||
if c != ']' {
|
if escaped {
|
||||||
iter.next();
|
escaped = false;
|
||||||
} else {
|
} else if c == '\\' {
|
||||||
|
escaped = true;
|
||||||
|
} else if c == ']' {
|
||||||
i2 = i;
|
i2 = i;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
iter.next();
|
||||||
}
|
}
|
||||||
i2
|
i2
|
||||||
};
|
};
|
||||||
token = Some(Token::LeafContents(&self.text[i1..i2]));
|
token = Some(Token::LeafContents((self.bytes_consumed + i1, &self.text[i1..i2])));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finish up
|
// Finish up
|
||||||
match iter.peek() {
|
match iter.peek() {
|
||||||
Some(&(i, _)) => {
|
Some(&(i, _)) => {
|
||||||
|
self.bytes_consumed += i;
|
||||||
self.text = &self.text[i..];
|
self.text = &self.text[i..];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -218,7 +232,8 @@ impl<'a> Iterator for TokenIter<'a> {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// /////////////////////////////////////////////////////////////
|
|
||||||
|
// ================================================================
|
||||||
|
|
||||||
/// Returns whether the given unicode character is whitespace or not.
|
/// Returns whether the given unicode character is whitespace or not.
|
||||||
fn is_ws_char(c: char) -> bool {
|
fn is_ws_char(c: char) -> bool {
|
||||||
|
@ -265,6 +280,9 @@ fn is_ident_char(c: char) -> bool {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// ================================================================
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::{token_iter, Token};
|
use super::{token_iter, Token};
|
||||||
|
@ -272,21 +290,47 @@ mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn token_iter_1() {
|
fn token_iter_1() {
|
||||||
let s = r#"
|
let s = r#"
|
||||||
# This is a comment and should be skipped
|
# This is a comment and should be skipped
|
||||||
MyThing $ident { # This is another comment
|
MyThing $ident { # This is another comment
|
||||||
MyProp [Some content]
|
MyProp [Some content]
|
||||||
}
|
}
|
||||||
"#;
|
"#;
|
||||||
|
|
||||||
let mut ti = token_iter(s);
|
let mut ti = token_iter(s);
|
||||||
assert_eq!(ti.next(), Some(Token::TypeName("MyThing")));
|
assert_eq!(ti.next(), Some(Token::TypeName((67, "MyThing"))));
|
||||||
assert_eq!(ti.next(), Some(Token::Ident("$ident")));
|
assert_eq!(ti.next(), Some(Token::Ident((75, "$ident"))));
|
||||||
assert_eq!(ti.next(), Some(Token::OpenInner));
|
assert_eq!(ti.next(), Some(Token::OpenInner(82)));
|
||||||
assert_eq!(ti.next(), Some(Token::TypeName("MyProp")));
|
assert_eq!(ti.next(), Some(Token::TypeName((126, "MyProp"))));
|
||||||
assert_eq!(ti.next(), Some(Token::OpenLeaf));
|
assert_eq!(ti.next(), Some(Token::OpenLeaf(133)));
|
||||||
assert_eq!(ti.next(), Some(Token::LeafContents("Some content")));
|
assert_eq!(ti.next(), Some(Token::LeafContents((134, "Some content"))));
|
||||||
assert_eq!(ti.next(), Some(Token::CloseLeaf));
|
assert_eq!(ti.next(), Some(Token::CloseLeaf(146)));
|
||||||
assert_eq!(ti.next(), Some(Token::CloseInner));
|
assert_eq!(ti.next(), Some(Token::CloseInner(160)));
|
||||||
|
assert_eq!(ti.next(), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn token_iter_2() {
|
||||||
|
let s = r#"MyProp [Some content\] with \escaped \\characters]"#;
|
||||||
|
|
||||||
|
let mut ti = token_iter(s);
|
||||||
|
assert_eq!(ti.next(), Some(Token::TypeName((0, "MyProp"))));
|
||||||
|
assert_eq!(ti.next(), Some(Token::OpenLeaf(7)));
|
||||||
|
assert_eq!(ti.next(),
|
||||||
|
Some(Token::LeafContents((8, r#"Some content\] with \escaped \\characters"#))));
|
||||||
|
assert_eq!(ti.next(), Some(Token::CloseLeaf(49)));
|
||||||
|
assert_eq!(ti.next(), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn token_iter_3() {
|
||||||
|
let s = r#"MyThing $\ an\ ident\$\ with\\\{\[\ \#escaped\ content {}"#;
|
||||||
|
|
||||||
|
let mut ti = token_iter(s);
|
||||||
|
assert_eq!(ti.next(), Some(Token::TypeName((0, "MyThing"))));
|
||||||
|
assert_eq!(ti.next(),
|
||||||
|
Some(Token::Ident((8, r#"$\ an\ ident\$\ with\\\{\[\ \#escaped\ content"#))));
|
||||||
|
assert_eq!(ti.next(), Some(Token::OpenInner(55)));
|
||||||
|
assert_eq!(ti.next(), Some(Token::CloseInner(56)));
|
||||||
assert_eq!(ti.next(), None);
|
assert_eq!(ti.next(), None);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user