Take data reading out of the data tree parser.

It's now the responsibility of the caller to feed the parser
data as needed.
This commit is contained in:
Nathan Vegdahl 2020-01-07 10:09:46 +09:00
parent 9c8dd4d882
commit 3eca7eae25

View File

@ -1,19 +1,12 @@
#![allow(dead_code)]
use std::io::{self, Read};
//--------------------------------------------------------------------------
// NOTE(review): this span comes from a diff hunk whose +/- markers are not
// visible, so removed and added lines appear together (two `derive`
// attributes, `UnexpectedIdent` listed twice). Confirm against the real file.
//
// Errors reported by the parser. Wherever this file constructs one of the
// `usize`-carrying variants, the payload is a byte offset with
// `total_bytes_processed` added to it; presumably the absolute position in
// the input where the problem was detected.
#[derive(Debug)]
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Error {
ExpectedTypeNameOrInnerClose(usize),
UnexpectedIdent(usize),
ExpectedTypeNameOrClose(usize),
ExpectedOpenOrIdent(usize),
ExpectedInnerOpen(usize),
UnexpectedInnerClose(usize),
UnclosedInnerNode(usize),
UnexpectedEOF(usize),
IOError(io::Error), // Wraps an I/O error; built by the `From<io::Error>` impl.
ExpectedOpen(usize),
UnexpectedClose(usize),
UnexpectedIdent(usize),
}
// Empty impl: `Error` relies on the trait's default method implementations.
impl std::error::Error for Error {}
@ -24,12 +17,6 @@ impl std::fmt::Display for Error {
}
}
impl From<io::Error> for Error {
fn from(other: io::Error) -> Self {
Error::IOError(other)
}
}
//---------------------------------------------------------------------
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
@ -47,7 +34,9 @@ pub enum Event<'a> {
contents: &'a str,
byte_offset: usize,
},
Done,
NeedMoreInput,
ValidEnd, // All data so far is consumed, and this is a
// valid place to finish the parse.
}
impl<'a> Event<'a> {
@ -74,7 +63,8 @@ impl<'a> Event<'a> {
contents: contents,
byte_offset: byte_offset + offset,
},
Event::Done => *self,
Event::NeedMoreInput => *self,
Event::ValidEnd => *self,
}
}
}
@ -82,168 +72,89 @@ impl<'a> Event<'a> {
//---------------------------------------------------------------------
// NOTE(review): removed and added diff lines are interleaved here (two
// `pub struct Parser` headers, two `buffer` fields). Confirm against the
// real file.
#[derive(Debug)]
pub struct Parser<R: Read> {
reader: R,
buffer: Vec<u8>,
buf_fill_idx: usize,
pub struct Parser {
buffer: String,
buf_consumed_idx: usize, // Length of the buffer prefix already consumed by returned events.
total_bytes_processed: usize, // Running total of bytes consumed by successfully parsed events.
inner_opens: usize, // Number of inner nodes opened and not yet closed.
eof: bool,
}
impl<R: Read> Parser<R> {
// NOTE(review): removed and added diff lines are interleaved here (two
// `new` signatures, an `impl` header in the middle, duplicate `buffer`
// initializers). Confirm against the real file.
//
// Creates a parser with an empty, pre-allocated buffer and all counters at
// zero.
pub fn new(reader: R) -> Parser<R> {
impl Parser {
pub fn new() -> Parser {
Parser {
reader: reader,
buffer: Vec::with_capacity(1024),
buf_fill_idx: 0,
buffer: String::with_capacity(1024),
buf_consumed_idx: 0,
total_bytes_processed: 0,
inner_opens: 0,
eof: false,
}
}
/// Appends `text` to the parser's internal buffer.
///
/// Bytes at the front of the buffer that were already consumed by
/// previously returned events are dropped first, so the buffer holds only
/// the unconsumed input followed by the new data.
pub fn push_data(&mut self, text: &str) {
    let consumed = self.buf_consumed_idx;
    if consumed != 0 {
        // Dropping the drain iterator removes the consumed prefix in place.
        self.buffer.drain(..consumed);
        self.buf_consumed_idx = 0;
    }

    self.buffer.push_str(text);
}
// NOTE(review): this span is a diff hunk shown without +/- markers. Lines of
// the removed reader-driven implementation (the `loop`, `self.eof`,
// `do_read`, the `transmute`) are interleaved with the added one, so it is
// not compilable as shown. Confirm against the real file.
//
// As far as the added lines show: drops the already-consumed buffer prefix,
// then tries to parse one event from `self.buffer` via `try_parse_event`.
// - On success it updates `inner_opens` (an inner close with nothing open
//   is reported as `Error::UnexpectedClose`), advances `buf_consumed_idx`
//   and `total_bytes_processed`, and returns the event with its byte offset
//   shifted by `total_bytes_processed - buf_consumed_idx`.
// - `ReachedEnd` yields `Event::ValidEnd` when `inner_opens` is zero,
//   otherwise `Event::NeedMoreInput`.
// - `IncompleteData` yields `Event::NeedMoreInput`.
// - Parse errors are returned with `total_bytes_processed` added to their
//   byte offset.
pub fn next_event<'a>(&'a mut self) -> Result<Event<'a>, Error> {
// Remove any consumed data.
if self.buf_consumed_idx > 0 {
self.buffer
.copy_within(self.buf_consumed_idx..self.buf_fill_idx, 0);
self.buf_fill_idx -= self.buf_consumed_idx;
self.buffer.replace_range(..self.buf_consumed_idx, "");
self.buf_consumed_idx = 0;
}
loop {
// Determine how much of the buffer is valid utf8.
let valid_count = match std::str::from_utf8(&self.buffer[..self.buf_fill_idx]) {
Ok(_) => self.buf_fill_idx,
Err(e) => e.valid_up_to(),
};
// Make a str slice out of the valid prefix.
let buffer_text = std::str::from_utf8(&self.buffer[..valid_count]).unwrap();
// Try to parse an event from the valid prefix.
match try_parse_event(buffer_text) {
EventParse::Ok(event, bytes_consumed) => {
// Update internal state.
self.buf_consumed_idx += bytes_consumed;
self.total_bytes_processed += bytes_consumed;
if let Event::InnerOpen { .. } = event {
self.inner_opens += 1;
} else if let Event::InnerClose { byte_offset, .. } = event {
if self.inner_opens == 0 {
return Err(Error::UnexpectedInnerClose(
byte_offset + self.total_bytes_processed,
));
} else {
self.inner_opens -= 1;
}
}
// Hack the borrow checker, which doesn't understand
// loops apparently, and return.
return Ok(unsafe {
std::mem::transmute::<Event, Event>(
event.add_to_byte_offset(
self.total_bytes_processed - self.buf_consumed_idx,
),
)
});
}
EventParse::ReachedEnd => {
// If we're at the end, make sure we're in a valid
// state and finish. Otherwise, let things keep
// going.
if self.eof {
if self.inner_opens == 0 {
return Ok(Event::Done);
} else {
return Err(Error::UnclosedInnerNode(
self.total_bytes_processed + valid_count,
));
}
}
}
EventParse::IncompleteData => {
// If we're at the end, it's a problem.
// Otherwise, wait for more data.
if self.eof {
return Err(Error::UnexpectedEOF(
self.total_bytes_processed + valid_count,
// Try to parse an event from the valid prefix.
match try_parse_event(&self.buffer) {
EventParse::Ok(event, bytes_consumed) => {
// Update internal state.
if let Event::InnerOpen { .. } = event {
self.inner_opens += 1;
} else if let Event::InnerClose { byte_offset, .. } = event {
if self.inner_opens == 0 {
return Err(Error::UnexpectedClose(
byte_offset + self.total_bytes_processed,
));
} else {
self.inner_opens -= 1;
}
}
self.buf_consumed_idx += bytes_consumed;
self.total_bytes_processed += bytes_consumed;
// Hard errors.
EventParse::ExpectedTypeNameOrInnerClose(byte_offset) => {
return Err(Error::ExpectedTypeNameOrInnerClose(
byte_offset + self.total_bytes_processed,
));
}
EventParse::ExpectedOpenOrIdent(byte_offset) => {
return Err(Error::ExpectedOpenOrIdent(
byte_offset + self.total_bytes_processed,
));
}
EventParse::ExpectedInnerOpen(byte_offset) => {
return Err(Error::ExpectedInnerOpen(
byte_offset + self.total_bytes_processed,
));
}
EventParse::UnexpectedIdent(byte_offset) => {
return Err(Error::UnexpectedIdent(
byte_offset + self.total_bytes_processed,
));
// Hack the borrow checker, which doesn't understand
// loops apparently, and return.
Ok(event.add_to_byte_offset(self.total_bytes_processed - self.buf_consumed_idx))
}
EventParse::ReachedEnd => {
// If we consumed all data, then if all nodes are properly
// closed we're done. Otherwise we need more input.
if self.inner_opens == 0 {
Ok(Event::ValidEnd)
} else {
Ok(Event::NeedMoreInput)
}
}
EventParse::IncompleteData => Ok(Event::NeedMoreInput),
// If we couldn't parse a complete event, and if there were
// no errors, read in more data and loop back to try again.
if !self.eof {
let (read_count, _valid_count) = self.do_read()?;
if read_count == 0 {
self.eof = true;
}
}
// Hard errors.
EventParse::ExpectedTypeNameOrInnerClose(byte_offset) => Err(
Error::ExpectedTypeNameOrClose(byte_offset + self.total_bytes_processed),
),
EventParse::ExpectedOpenOrIdent(byte_offset) => Err(Error::ExpectedOpenOrIdent(
byte_offset + self.total_bytes_processed,
)),
EventParse::ExpectedInnerOpen(byte_offset) => Err(Error::ExpectedOpen(
byte_offset + self.total_bytes_processed,
)),
EventParse::UnexpectedIdent(byte_offset) => Err(Error::UnexpectedIdent(
byte_offset + self.total_bytes_processed,
)),
}
}
/// Reads more bytes from the underlying reader into the buffer.
///
/// On success returns `(read_count, valid_count)`: the number of bytes this
/// call appended to the buffer, and the length of the buffer prefix that is
/// valid UTF-8 after the read.
///
/// # Errors
///
/// Propagates any error from the reader. Returns an `InvalidData` error when
/// four or more bytes trail the valid UTF-8 prefix, or when the reader
/// yields no bytes while unvalidated bytes are still pending.
fn do_read(&mut self) -> io::Result<(usize, usize)> {
    // Grow the buffer once four or fewer free bytes remain past the fill mark.
    let current_len = self.buffer.len();
    if self.buf_fill_idx + 4 >= current_len {
        self.buffer.resize(current_len * 3 / 2 + 4, 0);
    }

    // Pull whatever the reader offers into the unfilled tail.
    let fill_start = self.buf_fill_idx;
    let read_count = self.reader.read(&mut self.buffer[fill_start..])?;
    self.buf_fill_idx = fill_start + read_count;
    let filled = self.buf_fill_idx;

    // Length of the longest valid UTF-8 prefix of the filled region.
    let valid_count = std::str::from_utf8(&self.buffer[..filled])
        .map(|_| filled)
        .unwrap_or_else(|e| e.valid_up_to());

    // A truncated code point can leave at most three bytes pending, so four
    // or more (or any leftovers once the reader is exhausted) is bad data.
    let pending = filled - valid_count;
    if pending >= 4 || (read_count == 0 && pending > 0) {
        Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "stream contained invalid UTF-8",
        ))
    } else {
        Ok((read_count, valid_count))
    }
}
}
//--------------------------------------------------------------------------
@ -816,4 +727,133 @@ mod tests {
EventParse::ReachedEnd,
);
}
// Feeds an inner node to the parser one fragment at a time and checks the
// event reported after each fragment.
#[test]
fn parser_01() {
    let mut p = Parser::new();

    // A type name on its own is not a complete event.
    p.push_data("Hello");
    assert_eq!(p.next_event(), Ok(Event::NeedMoreInput));

    // The opening brace completes the inner-open event.
    p.push_data("{");
    let opened = Event::InnerOpen {
        type_name: "Hello",
        ident: None,
        byte_offset: 0,
    };
    assert_eq!(p.next_event(), Ok(opened));

    // With the node still open, the parser asks for more input.
    assert_eq!(p.next_event(), Ok(Event::NeedMoreInput));

    // Closing the node makes this a valid place to stop.
    p.push_data("}");
    let closed = Event::InnerClose { byte_offset: 6 };
    assert_eq!(p.next_event(), Ok(closed));
    assert_eq!(p.next_event(), Ok(Event::ValidEnd));
}
// Feeds a leaf node in several fragments; only the closing bracket yields
// the leaf event.
#[test]
fn parser_02() {
    let mut p = Parser::new();

    // None of these fragments completes the leaf.
    for fragment in ["Hello", "[", "1.0 2.0 3."].iter() {
        p.push_data(fragment);
        assert_eq!(p.next_event(), Ok(Event::NeedMoreInput));
    }

    // The final fragment closes the leaf.
    p.push_data("0]");
    let leaf = Event::Leaf {
        type_name: "Hello",
        contents: "1.0 2.0 3.0",
        byte_offset: 0,
    };
    assert_eq!(p.next_event(), Ok(leaf));
    assert_eq!(p.next_event(), Ok(Event::ValidEnd));
}
// Parses a complete document pushed in one go: an identified inner node
// containing one leaf.
#[test]
fn parser_03() {
    let mut p = Parser::new();
    p.push_data("Hello $big_boy { World [1.0 2.0 3.0] }");

    let opened = Event::InnerOpen {
        type_name: "Hello",
        ident: Some("$big_boy"),
        byte_offset: 0,
    };
    assert_eq!(p.next_event(), Ok(opened));

    let leaf = Event::Leaf {
        type_name: "World",
        contents: "1.0 2.0 3.0",
        byte_offset: 17,
    };
    assert_eq!(p.next_event(), Ok(leaf));

    let closed = Event::InnerClose { byte_offset: 37 };
    assert_eq!(p.next_event(), Ok(closed));

    // Asking again after the end must keep giving the same answer.
    for _ in 0..3 {
        assert_eq!(p.next_event(), Ok(Event::ValidEnd));
    }
}
// Input that begins with `$Hello` is rejected at offset 0.
#[test]
fn parser_04() {
    let mut p = Parser::new();
    p.push_data("$Hello");

    let outcome = p.next_event();
    assert_eq!(outcome, Err(Error::ExpectedTypeNameOrClose(0)));
}
// A `]` directly after a type name is rejected at the bracket's offset.
#[test]
fn parser_05() {
    let mut p = Parser::new();
    p.push_data("Hello]");

    let outcome = p.next_event();
    assert_eq!(outcome, Err(Error::ExpectedOpenOrIdent(5)));
}
// A `}` directly after a type name is rejected at the brace's offset.
#[test]
fn parser_06() {
    let mut p = Parser::new();
    p.push_data("Hello}");

    let outcome = p.next_event();
    assert_eq!(outcome, Err(Error::ExpectedOpenOrIdent(5)));
}
// An identifier followed by `[` is rejected, reported at the identifier's
// offset.
#[test]
fn parser_07() {
    let mut p = Parser::new();
    p.push_data("Hello $yar [");

    let outcome = p.next_event();
    assert_eq!(outcome, Err(Error::UnexpectedIdent(6)));
}
// A closing brace with no open inner node is rejected at offset 0.
#[test]
fn parser_08() {
    let mut p = Parser::new();
    p.push_data("}");

    let outcome = p.next_event();
    assert_eq!(outcome, Err(Error::UnexpectedClose(0)));
}
}