WIP creating a clean frontend/backend separation.

- Started work on writing a new backend.
- Started work on writing text encoding handling.
This commit is contained in:
Nathan Vegdahl 2018-08-17 20:34:43 -07:00
parent b713b72e72
commit 0ee183aa72
11 changed files with 579 additions and 1 deletions

14
Cargo.lock generated
View File

@ -2,12 +2,14 @@
name = "Led" name = "Led"
version = "0.0.2" version = "0.0.2"
dependencies = [ dependencies = [
"backend 0.1.0",
"docopt 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", "docopt 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)",
"ropey 0.8.4 (git+https://github.com/cessen/ropey)", "ropey 0.8.4 (git+https://github.com/cessen/ropey)",
"serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)",
"smallvec 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", "smallvec 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
"termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", "termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"text_encoding 0.1.0",
"unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
@ -20,6 +22,14 @@ dependencies = [
"memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "backend"
version = "0.1.0"
dependencies = [
"ropey 0.8.4 (git+https://github.com/cessen/ropey)",
"unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]] [[package]]
name = "docopt" name = "docopt"
version = "0.8.3" version = "0.8.3"
@ -155,6 +165,10 @@ dependencies = [
"redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "text_encoding"
version = "0.1.0"
[[package]] [[package]]
name = "thread_local" name = "thread_local"
version = "0.3.5" version = "0.3.5"

View File

@ -1,3 +1,9 @@
[workspace]
members = [
"sub_crates/backend",
"sub_crates/text_encoding",
]
[package] [package]
name = "Led" name = "Led"
version = "0.0.2" version = "0.0.2"
@ -17,4 +23,11 @@ serde = "1.*"
serde_derive = "1.*" serde_derive = "1.*"
docopt = "0.8" docopt = "0.8"
smallvec = "0.6" smallvec = "0.6"
termion = "1.5" termion = "1.5"
# Local crate dependencies
[dependencies.backend]
path = "sub_crates/backend"
[dependencies.text_encoding]
path = "sub_crates/text_encoding"

View File

@ -0,0 +1,14 @@
[package]
name = "backend"
version = "0.1.0"
authors = ["Nathan Vegdahl <cessen@cessen.com>"]
license = "MIT"
[lib]
name = "backend"
path = "src/lib.rs"
[dependencies]
# ropey = "0.8"
ropey = { git = "https://github.com/cessen/ropey", branch = "master" }
unicode-segmentation = "1.2.1"

View File

@ -0,0 +1,9 @@
use ropey::Rope;
/// A text buffer: the text itself (stored in a `Rope`) plus a little
/// bookkeeping about it.
///
/// All fields are currently private and nothing in this file reads or writes
/// them yet.
#[derive(Debug, Clone)]
pub struct Buffer {
// on_disk_encoding: Encoding,
// NOTE(review): presumably a description of what kind of content the text is
// (e.g. a file type) -- confirm once it is actually used.
content_type: String,
// NOTE(review): presumably true when there are unsaved modifications -- confirm.
is_dirty: bool,
text: Rope, // The actual text content.
}

View File

@ -0,0 +1,4 @@
extern crate ropey;
extern crate unicode_segmentation;
pub mod buffer;

View File

@ -0,0 +1,9 @@
[package]
name = "text_encoding"
version = "0.1.0"
authors = ["Nathan Vegdahl <cessen@cessen.com>"]
license = "MIT"
[lib]
name = "text_encoding"
path = "src/lib.rs"

View File

@ -0,0 +1,71 @@
//! Encoding/decoding functions for ISO/IEC 8859-1 (or "latin1"), which
//! conveniently happens to map 1-to-1 to the first 256 unicode scalar values.
//!
//! Because latin1 is a single-byte encoding where all bytes are valid,
//! decoding cannot fail. However, encoding will fail with scalar values
//! greater than 255.
use std;
use {DecodeResult, EncodeError, EncodeResult};
/// Encodes utf8 text as latin1, writing one byte per character to `output`.
///
/// Encoding stops early (without error) when `output` is full, so the caller
/// can continue from the returned input offset with a fresh buffer.
///
/// Returns `Ok((bytes_consumed, bytes_written))`, where `bytes_consumed` is
/// the number of bytes of `input` that were encoded and always falls on a
/// char boundary. If nothing could be written (e.g. `output` is empty), both
/// values are zero.
///
/// # Errors
///
/// Returns an `EncodeError` when a character with a scalar value above 255 is
/// reached. Everything before that character has already been written.
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    let mut input_i = 0;
    let mut output_i = 0;

    for (offset, c) in input.char_indices() {
        if output_i >= output.len() {
            break;
        }
        if c as u32 > 255 {
            return Err(EncodeError {
                character: c,
                byte_offset: offset,
                bytes_written: output_i,
            });
        }

        // Guarded above: the scalar value fits in a single byte.
        output[output_i] = c as u8;
        output_i += 1;

        // Track the end of the last character actually encoded, so that
        // "nothing written" correctly reports "nothing consumed".
        input_i = offset + c.len_utf8();
    }

    Ok((input_i, output_i))
}
/// Decodes latin1 bytes to utf8, writing the result into `output`.
///
/// Bytes below 0x80 map to a single utf8 byte; all others expand to a
/// two-byte sequence. Decoding stops as soon as the next character would not
/// fit in `output`.
///
/// Returns `Ok((bytes_consumed, decoded_text))`. This function never returns
/// an error, since every byte is valid latin1.
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut consumed = 0;
    let mut written = 0;

    for &b in input {
        let needed = if b < 0x80 { 1 } else { 2 };
        if written + needed > output.len() {
            break;
        }

        if needed == 1 {
            output[written] = b;
        } else {
            // Top two bits go in the lead byte, low six in the continuation.
            output[written] = 0xC0 | (b >> 6);
            output[written + 1] = 0x80 | (b & 0x3F);
        }

        consumed += 1;
        written += needed;
    }

    // SAFETY: only complete, well-formed one- and two-byte utf8 sequences
    // were written to `output[..written]` above.
    let text = unsafe { std::str::from_utf8_unchecked(&output[..written]) };
    Ok((consumed, text))
}

View File

@ -0,0 +1,97 @@
//! A library for incrementally encoding/decoding between utf8 and various
//! text encodings.
mod latin1;
mod utf16_be;
mod utf16_le;
mod utf8;
/// Encodes utf8 text into `output_encoding`, writing the bytes to `output`.
///
/// This only dispatches to the matching per-encoding module and returns
/// that module's result unchanged.
///
/// # Panics
///
/// Panics (via `unimplemented!()`) for encodings that have no encoder yet:
/// `Utf32BE`, `Utf32LE`, `ShiftJIS`, `Big5` and `Windows1252`.
pub fn encode_from_utf8(output_encoding: Encoding, input: &str, output: &mut [u8]) -> EncodeResult {
    match output_encoding {
        Encoding::Latin1 => latin1::encode_from_utf8(input, output),
        Encoding::Utf8 => utf8::encode_from_utf8(input, output),
        Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
        Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
        Encoding::Utf32BE
        | Encoding::Utf32LE
        | Encoding::ShiftJIS
        | Encoding::Big5
        | Encoding::Windows1252 => unimplemented!(),
    }
}
/// Decodes bytes in `input_encoding` to utf8, writing the text to `output`.
///
/// This only dispatches to the matching per-encoding module and returns
/// that module's result unchanged.
///
/// # Panics
///
/// Panics (via `unimplemented!()`) for encodings that have no decoder yet:
/// `Utf32BE`, `Utf32LE`, `ShiftJIS`, `Big5` and `Windows1252`.
pub fn decode_to_utf8<'a>(
    input_encoding: Encoding,
    input: &[u8],
    output: &'a mut [u8],
) -> DecodeResult<'a> {
    match input_encoding {
        Encoding::Latin1 => latin1::decode_to_utf8(input, output),
        Encoding::Utf8 => utf8::decode_to_utf8(input, output),
        Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
        Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
        Encoding::Utf32BE
        | Encoding::Utf32LE
        | Encoding::ShiftJIS
        | Encoding::Big5
        | Encoding::Windows1252 => unimplemented!(),
    }
}
/// Describes a text encoding.
///
/// Note that `encode_from_utf8()` and `decode_to_utf8()` in this module
/// currently only handle `Utf8`, `Utf16BE`, `Utf16LE` and `Latin1`; passing
/// any other variant to them hits `unimplemented!()`.
#[derive(Debug, Copy, Clone)]
pub enum Encoding {
Utf8,
Utf16BE, // Big endian
Utf16LE, // Little endian
Utf32BE, // Big endian
Utf32LE, // Little endian
ShiftJIS,
Big5,
Latin1, // ISO/IEC 8859-1
Windows1252, // Windows code page 1252
}
/// Result type for encoding text from utf8 to a target encoding.
///
/// The Ok() variant provides the number of input bytes consumed and the
/// number of output bytes written, in that order.
///
/// The Err() variant carries an `EncodeError`.
pub type EncodeResult = Result<(usize, usize), EncodeError>;
/// Result type for decoding text from a source encoding to utf8.
///
/// The Ok() variant provides the number of input bytes consumed and a
/// reference to the valid decoded text, which borrows from the output
/// buffer (hence the `'a` lifetime).
///
/// The Err() variant carries a `DecodeError`.
pub type DecodeResult<'a> = Result<(usize, &'a str), DecodeError>;
/// Represents an error when encoding from utf8 to some other format.
///
/// Since valid input utf8 is statically assumed, the only possible
/// error is encountering a char that is not representable in the target
/// encoding.
///
/// The problematic character, the byte offset of that character
/// in the input utf8, and the number of bytes already written to the output
/// buffer are provided.
///
/// It is guaranteed that all input leading up to the problem character has
/// already been encoded and written to the output buffer.
#[derive(Debug, Copy, Clone)]
pub struct EncodeError {
// The character that could not be represented in the target encoding.
pub character: char,
// Byte offset of `character` within the utf8 input.
pub byte_offset: usize,
// Number of bytes written to the output buffer before the error.
pub bytes_written: usize,
}
/// Represents an error when decoding to utf8 from some other format.
///
/// All supported text encodings can be fully represented in utf8, and
/// therefore the only possible error is that we encounter bytes in the
/// input data that are invalid for the text encoding we're attempting
/// to decode from.
///
/// The byte offset of the invalid input data and the number of bytes
/// already written to the output buffer are provided.
///
/// It is guaranteed that all input leading up to the invalid data has
/// already been decoded and written to the output buffer.
#[derive(Debug, Copy, Clone)]
pub struct DecodeError {
// Byte offset of the invalid data within the input.
pub byte_offset: usize,
// Number of bytes written to the output buffer before the error.
pub bytes_written: usize,
}

View File

@ -0,0 +1,148 @@
//! Encoding/decoding functions for big-endian UTF-16.
//!
//! Because both utf8 and utf16 can represent the entirety of unicode, the
//! only possible error is when invalid utf16 is encountered when decoding
//! to utf8.
use std;
use {DecodeError, DecodeResult, EncodeResult};
/// Splits a `u16` into its two bytes in big-endian order (most significant
/// byte first).
///
/// Uses plain shifts, which give the same answer on any host endianness, so
/// no `unsafe` or `cfg!(target_endian)` branching is needed.
fn to_big_endian(n: u16) -> [u8; 2] {
    // The `as u8` casts intentionally keep only the low eight bits.
    [(n >> 8) as u8, n as u8]
}
/// Assembles a `u16` from two bytes given in big-endian order (most
/// significant byte first).
///
/// Uses plain shifts, which give the same answer on any host endianness, so
/// no `unsafe` or `cfg!(target_endian)` branching is needed.
fn from_big_endian(n: [u8; 2]) -> u16 {
    ((n[0] as u16) << 8) | (n[1] as u16)
}
/// Encodes utf8 text as big-endian UTF-16, writing the bytes to `output`.
///
/// Characters up to U+FFFF take one code unit (2 bytes); all others are
/// written as a surrogate pair (4 bytes). Encoding stops early (without
/// error) at the first character that does not fit in the remaining space of
/// `output`, so a character is never written partially.
///
/// Returns `Ok((bytes_consumed, bytes_written))`, where `bytes_consumed` is
/// the number of bytes of `input` that were encoded and always falls on a
/// char boundary. If not even the first character fits, both values are
/// zero. This function never returns an error.
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    let mut input_i = 0;
    let mut output_i = 0;

    for (offset, c) in input.char_indices() {
        let mut code = c as u32;
        if code <= 0xFFFF {
            // One code unit.
            if output_i + 2 > output.len() {
                break;
            }
            let val = to_big_endian(code as u16);
            output[output_i] = val[0];
            output[output_i + 1] = val[1];
            output_i += 2;
        } else {
            // Two code units (surrogate pair).
            if output_i + 4 > output.len() {
                break;
            }
            code -= 0x10000;
            let first = to_big_endian(0xD800 | ((code >> 10) as u16));
            let second = to_big_endian(0xDC00 | ((code as u16) & 0x3FF));
            output[output_i] = first[0];
            output[output_i + 1] = first[1];
            output[output_i + 2] = second[0];
            output[output_i + 3] = second[1];
            output_i += 4;
        }

        // Track the end of the last character actually encoded, so that
        // "nothing written" correctly reports "nothing consumed".
        input_i = offset + c.len_utf8();
    }

    Ok((input_i, output_i))
}
/// Decodes big-endian UTF-16 bytes to utf8, writing the text to `output`.
///
/// Decoding stops early (without error) when:
///
/// - the input ends in the middle of a code unit or of a surrogate pair
///   (the incomplete tail is simply not consumed), or
/// - the next character would not fit in the remaining space of `output`.
///
/// Returns `Ok((bytes_consumed, decoded_text))`.
///
/// # Errors
///
/// Returns a `DecodeError` when a low (second-half) surrogate appears
/// without a preceding high surrogate, or when a high surrogate is not
/// followed by a low surrogate. `byte_offset` points at the start of the
/// offending code unit sequence; everything before it has been decoded.
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;

    // Walk the input one 16-bit code unit at a time. A trailing odd byte is
    // left unconsumed.
    while input_i + 1 < input.len() {
        let code_1 = from_big_endian([input[input_i], input[input_i + 1]]);

        // Decode to a scalar value, remembering how many input bytes it took.
        let (code, consumed) = if code_1 < 0xD800 || code_1 > 0xDFFF {
            // Single code unit.
            // SAFETY: values outside the surrogate range 0xD800..=0xDFFF
            // that fit in a u16 are always valid unicode scalar values.
            (unsafe { std::char::from_u32_unchecked(code_1 as u32) }, 2)
        } else if (code_1 & 0xFC00) == 0xDC00 {
            // Error: orphaned second half of a surrogate pair.
            return Err(DecodeError {
                byte_offset: input_i,
                bytes_written: output_i,
            });
        } else {
            // High surrogate: needs a second code unit. If the input ends
            // before the pair is complete, stop and leave it unconsumed.
            if input_i + 3 >= input.len() {
                break;
            }
            let code_2 = from_big_endian([input[input_i + 2], input[input_i + 3]]);
            if (code_2 & 0xFC00) != 0xDC00 {
                // Error: second half is not a valid low surrogate.
                return Err(DecodeError {
                    byte_offset: input_i,
                    bytes_written: output_i,
                });
            }
            let scalar =
                (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000;
            // SAFETY: code_1 is in 0xD800..=0xDBFF and code_2 is in
            // 0xDC00..=0xDFFF (both checked above), so `scalar` is in
            // 0x10000..=0x10FFFF, which is always a valid scalar value.
            (unsafe { std::char::from_u32_unchecked(scalar) }, 4)
        };

        // Encode to utf8, stopping if the character doesn't fit.
        let mut buf = [0u8; 4];
        let s = code.encode_utf8(&mut buf);
        if output_i + s.len() > output.len() {
            break;
        }
        output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes());

        // Update our counters.
        input_i += consumed;
        output_i += s.len();
    }

    // SAFETY: only complete utf8 encodings of valid chars were written to
    // `output[..output_i]`.
    Ok((input_i, unsafe {
        std::str::from_utf8_unchecked(&output[..output_i])
    }))
}

View File

@ -0,0 +1,148 @@
//! Encoding/decoding functions for little-endian UTF-16.
//!
//! Because both utf8 and utf16 can represent the entirety of unicode, the
//! only possible error is when invalid utf16 is encountered when decoding
//! to utf8.
use std;
use {DecodeError, DecodeResult, EncodeResult};
/// Splits a `u16` into its two bytes in little-endian order (least
/// significant byte first).
///
/// Uses plain shifts, which give the same answer on any host endianness, so
/// no `unsafe` or `cfg!(target_endian)` branching is needed.
fn to_little_endian(n: u16) -> [u8; 2] {
    // The `as u8` casts intentionally keep only the low eight bits.
    [n as u8, (n >> 8) as u8]
}
/// Assembles a `u16` from two bytes given in little-endian order (least
/// significant byte first).
///
/// Uses plain shifts, which give the same answer on any host endianness, so
/// no `unsafe` or `cfg!(target_endian)` branching is needed.
fn from_little_endian(n: [u8; 2]) -> u16 {
    (n[0] as u16) | ((n[1] as u16) << 8)
}
/// Encodes utf8 text as little-endian UTF-16, writing the bytes to `output`.
///
/// Characters up to U+FFFF take one code unit (2 bytes); all others are
/// written as a surrogate pair (4 bytes). Encoding stops early (without
/// error) at the first character that does not fit in the remaining space of
/// `output`, so a character is never written partially.
///
/// Returns `Ok((bytes_consumed, bytes_written))`, where `bytes_consumed` is
/// the number of bytes of `input` that were encoded and always falls on a
/// char boundary. If not even the first character fits, both values are
/// zero. This function never returns an error.
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    let mut input_i = 0;
    let mut output_i = 0;

    for (offset, c) in input.char_indices() {
        let mut code = c as u32;
        if code <= 0xFFFF {
            // One code unit.
            if output_i + 2 > output.len() {
                break;
            }
            let val = to_little_endian(code as u16);
            output[output_i] = val[0];
            output[output_i + 1] = val[1];
            output_i += 2;
        } else {
            // Two code units (surrogate pair).
            if output_i + 4 > output.len() {
                break;
            }
            code -= 0x10000;
            let first = to_little_endian(0xD800 | ((code >> 10) as u16));
            let second = to_little_endian(0xDC00 | ((code as u16) & 0x3FF));
            output[output_i] = first[0];
            output[output_i + 1] = first[1];
            output[output_i + 2] = second[0];
            output[output_i + 3] = second[1];
            output_i += 4;
        }

        // Track the end of the last character actually encoded, so that
        // "nothing written" correctly reports "nothing consumed".
        input_i = offset + c.len_utf8();
    }

    Ok((input_i, output_i))
}
/// Decodes little-endian UTF-16 bytes to utf8, writing the text to `output`.
///
/// Decoding stops early (without error) when:
///
/// - the input ends in the middle of a code unit or of a surrogate pair
///   (the incomplete tail is simply not consumed), or
/// - the next character would not fit in the remaining space of `output`.
///
/// Returns `Ok((bytes_consumed, decoded_text))`.
///
/// # Errors
///
/// Returns a `DecodeError` when a low (second-half) surrogate appears
/// without a preceding high surrogate, or when a high surrogate is not
/// followed by a low surrogate. `byte_offset` points at the start of the
/// offending code unit sequence; everything before it has been decoded.
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;

    // Walk the input one 16-bit code unit at a time. A trailing odd byte is
    // left unconsumed.
    while input_i + 1 < input.len() {
        let code_1 = from_little_endian([input[input_i], input[input_i + 1]]);

        // Decode to a scalar value, remembering how many input bytes it took.
        let (code, consumed) = if code_1 < 0xD800 || code_1 > 0xDFFF {
            // Single code unit.
            // SAFETY: values outside the surrogate range 0xD800..=0xDFFF
            // that fit in a u16 are always valid unicode scalar values.
            (unsafe { std::char::from_u32_unchecked(code_1 as u32) }, 2)
        } else if (code_1 & 0xFC00) == 0xDC00 {
            // Error: orphaned second half of a surrogate pair.
            return Err(DecodeError {
                byte_offset: input_i,
                bytes_written: output_i,
            });
        } else {
            // High surrogate: needs a second code unit. If the input ends
            // before the pair is complete, stop and leave it unconsumed.
            if input_i + 3 >= input.len() {
                break;
            }
            let code_2 = from_little_endian([input[input_i + 2], input[input_i + 3]]);
            if (code_2 & 0xFC00) != 0xDC00 {
                // Error: second half is not a valid low surrogate.
                return Err(DecodeError {
                    byte_offset: input_i,
                    bytes_written: output_i,
                });
            }
            let scalar =
                (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000;
            // SAFETY: code_1 is in 0xD800..=0xDBFF and code_2 is in
            // 0xDC00..=0xDFFF (both checked above), so `scalar` is in
            // 0x10000..=0x10FFFF, which is always a valid scalar value.
            (unsafe { std::char::from_u32_unchecked(scalar) }, 4)
        };

        // Encode to utf8, stopping if the character doesn't fit.
        let mut buf = [0u8; 4];
        let s = code.encode_utf8(&mut buf);
        if output_i + s.len() > output.len() {
            break;
        }
        output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes());

        // Update our counters.
        input_i += consumed;
        output_i += s.len();
    }

    // SAFETY: only complete utf8 encodings of valid chars were written to
    // `output[..output_i]`.
    Ok((input_i, unsafe {
        std::str::from_utf8_unchecked(&output[..output_i])
    }))
}

View File

@ -0,0 +1,51 @@
//! These functions are essentially redundant, since they're supposedly
//! encoding/decoding between utf8 and... utf8. However, `decode_to_utf8()`
//! is still useful for validating unknown input. And they allow a uniform
//! API for all encodings.
use std;
use {DecodeError, DecodeResult, EncodeResult};
/// "Encodes" utf8 as utf8: copies as much of `input` into `output` as fits
/// without splitting a character.
///
/// Returns `Ok((bytes_consumed, bytes_written))`; the two values are always
/// equal. This function never returns an error.
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    // Start from the most bytes that could possibly fit, then back off to
    // the nearest char boundary. Offset 0 is always a boundary, so this
    // terminates.
    let mut n = std::cmp::min(input.len(), output.len());
    while !input.is_char_boundary(n) {
        n -= 1;
    }

    let src = &input.as_bytes()[..n];
    output[..n].copy_from_slice(src);
    Ok((n, n))
}
/// Validates `input` as utf8 and copies as much of the valid prefix into
/// `output` as fits without splitting a character.
///
/// Returns `Ok((bytes_consumed, decoded_text))`. If the input merely ends in
/// the middle of a character (as can happen when feeding data in chunks),
/// the incomplete tail is left unconsumed rather than treated as an error --
/// even when that leaves nothing to consume at all.
///
/// # Errors
///
/// Returns a `DecodeError` (with both offsets zero) when the very first
/// bytes of `input` are definitely invalid utf8. Invalid bytes later in the
/// input are not reported by this call: the valid prefix is returned, and
/// the error surfaces on the next call, which then starts at the bad bytes.
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let valid_up_to = match std::str::from_utf8(input) {
        Ok(text) => text.len(),
        Err(e) => {
            // `error_len()` is `None` when the input simply ended mid-character,
            // which is not an error for an incremental decoder.
            if e.valid_up_to() == 0 && e.error_len().is_some() {
                return Err(DecodeError {
                    byte_offset: 0,
                    bytes_written: 0,
                });
            }
            e.valid_up_to()
        }
    };

    // SAFETY: `input[..valid_up_to]` was just validated by `from_utf8`.
    let valid_text = unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) };
    let (in_consumed, out_written) =
        encode_from_utf8(valid_text, output).expect("utf8-to-utf8 encoding cannot fail");

    // SAFETY: `encode_from_utf8` copies only whole characters of valid utf8
    // into `output[..out_written]`.
    Ok((in_consumed, unsafe {
        std::str::from_utf8_unchecked(&output[..out_written])
    }))
}