diff --git a/Cargo.lock b/Cargo.lock index 7742c74..2a4b4bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,12 +2,14 @@ name = "Led" version = "0.0.2" dependencies = [ + "backend 0.1.0", "docopt 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", "ropey 0.8.4 (git+https://github.com/cessen/ropey)", "serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", "smallvec 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", "termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "text_encoding 0.1.0", "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -20,6 +22,14 @@ dependencies = [ "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "backend" +version = "0.1.0" +dependencies = [ + "ropey 0.8.4 (git+https://github.com/cessen/ropey)", + "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "docopt" version = "0.8.3" @@ -155,6 +165,10 @@ dependencies = [ "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "text_encoding" +version = "0.1.0" + [[package]] name = "thread_local" version = "0.3.5" diff --git a/Cargo.toml b/Cargo.toml index b2d8e55..dfb8db6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,9 @@ +[workspace] +members = [ + "sub_crates/backend", + "sub_crates/text_encoding", +] + [package] name = "Led" version = "0.0.2" @@ -17,4 +23,11 @@ serde = "1.*" serde_derive = "1.*" docopt = "0.8" smallvec = "0.6" -termion = "1.5" \ No newline at end of file +termion = "1.5" + +# Local crate dependencies +[dependencies.backend] +path = "sub_crates/backend" + +[dependencies.text_encoding] +path = "sub_crates/text_encoding" \ No newline at end of file diff --git 
a/sub_crates/backend/Cargo.toml b/sub_crates/backend/Cargo.toml
new file mode 100644
index 0000000..4982a9b
--- /dev/null
+++ b/sub_crates/backend/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "backend"
+version = "0.1.0"
+authors = ["Nathan Vegdahl "]
+license = "MIT"
+
+[lib]
+name = "backend"
+path = "src/lib.rs"
+
+[dependencies]
+# ropey = "0.8"
+ropey = { git = "https://github.com/cessen/ropey", branch = "master" }
+unicode-segmentation = "1.2.1"
\ No newline at end of file
diff --git a/sub_crates/backend/src/buffer.rs b/sub_crates/backend/src/buffer.rs
new file mode 100644
index 0000000..bc844e3
--- /dev/null
+++ b/sub_crates/backend/src/buffer.rs
@@ -0,0 +1,9 @@
+use ropey::Rope;
+
+#[derive(Debug, Clone)]
+pub struct Buffer {
+    // on_disk_encoding: Encoding,
+    content_type: String, // NOTE(review): meaning not shown here (file type?) - confirm
+    is_dirty: bool, // NOTE(review): presumably "has unsaved edits" - confirm
+    text: Rope, // The actual text content.
+}
diff --git a/sub_crates/backend/src/lib.rs b/sub_crates/backend/src/lib.rs
new file mode 100644
index 0000000..e34177e
--- /dev/null
+++ b/sub_crates/backend/src/lib.rs
@@ -0,0 +1,4 @@
+extern crate ropey;
+extern crate unicode_segmentation;
+
+pub mod buffer;
diff --git a/sub_crates/text_encoding/Cargo.toml b/sub_crates/text_encoding/Cargo.toml
new file mode 100644
index 0000000..30ccc23
--- /dev/null
+++ b/sub_crates/text_encoding/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "text_encoding"
+version = "0.1.0"
+authors = ["Nathan Vegdahl "]
+license = "MIT"
+
+[lib]
+name = "text_encoding"
+path = "src/lib.rs"
\ No newline at end of file
diff --git a/sub_crates/text_encoding/src/latin1.rs b/sub_crates/text_encoding/src/latin1.rs
new file mode 100644
index 0000000..d3c8a0d
--- /dev/null
+++ b/sub_crates/text_encoding/src/latin1.rs
@@ -0,0 +1,71 @@
+//! Encoding/decoding functions for ISO/IEC 8859-1 (or "latin1"), which
+//! conveniently happens to map 1-to-1 to the first 256 unicode scalar values.
+//!
+//! Because latin1 is a single-byte encoding where all bytes are valid,
+//! decoding cannot fail.
However, encoding will fail with scalar values
+//! greater than 255.
+
+use std;
+use {DecodeResult, EncodeError, EncodeResult};
+
+/// Encodes `input` as latin1 into `output`, one byte per char.
+///
+/// Returns `Ok((bytes_consumed, bytes_written))`, stopping early once
+/// `output` is full. Returns `Err` at the first char above U+00FF.
+pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
+    let mut input_i = 0;
+    let mut output_i = 0;
+    for (offset, c) in input.char_indices() {
+        if output_i >= output.len() {
+            break;
+        }
+        if c as u32 > 255 {
+            return Err(EncodeError {
+                character: c,
+                byte_offset: offset,
+                bytes_written: output_i,
+            });
+        }
+        output[output_i] = c as u8;
+        output_i += 1;
+        // Count a char as consumed only once it has been written, so that
+        // nothing is reported as consumed when `output` has no room at all.
+        input_i = offset + c.len_utf8();
+    }
+
+    Ok((input_i, output_i))
+}
+
+/// Decodes latin1 `input` into utf8 in `output`.
+///
+/// Never fails. Stops early once the next char no longer fits in `output`.
+pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
+    let mut input_i = 0;
+    let mut output_i = 0;
+    for &byte in input.iter() {
+        if byte <= 127 {
+            // 1-byte case
+            if output_i >= output.len() {
+                break;
+            }
+            output[output_i] = byte;
+            input_i += 1;
+            output_i += 1;
+        } else {
+            // 2-byte case
+            if (output_i + 1) >= output.len() {
+                break;
+            }
+            output[output_i] = 0b11000000 | (byte >> 6);
+            output[output_i + 1] = 0b10000000 | (byte & 0b00111111);
+            input_i += 1;
+            output_i += 2;
+        }
+    }
+
+    // SAFETY: only complete 1- and 2-byte utf8 sequences were written to
+    // `output[..output_i]` by the loop above.
+    Ok((input_i, unsafe {
+        std::str::from_utf8_unchecked(&output[..output_i])
+    }))
+}
diff --git a/sub_crates/text_encoding/src/lib.rs b/sub_crates/text_encoding/src/lib.rs
new file mode 100644
index 0000000..da17fe9
--- /dev/null
+++ b/sub_crates/text_encoding/src/lib.rs
@@ -0,0 +1,97 @@
+//! A library for incrementally encoding/decoding between utf8 and various
+//! text encodings.
+
+mod latin1;
+mod utf16_be;
+mod utf16_le;
+mod utf8;
+
+/// Encodes text from utf8 to a destination encoding.
+pub fn encode_from_utf8(output_encoding: Encoding, input: &str, output: &mut [u8]) -> EncodeResult {
+    match output_encoding {
+        Encoding::Utf8 => utf8::encode_from_utf8(input, output),
+        Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
+        Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
+        Encoding::Latin1 => latin1::encode_from_utf8(input, output),
+        _ => unimplemented!(), // Panics: the remaining encodings have no encoder yet.
+    }
+}
+
+/// Decodes text from a source encoding to utf8.
+pub fn decode_to_utf8<'a>(
+    input_encoding: Encoding,
+    input: &[u8],
+    output: &'a mut [u8],
+) -> DecodeResult<'a> {
+    match input_encoding {
+        Encoding::Utf8 => utf8::decode_to_utf8(input, output),
+        Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
+        Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
+        Encoding::Latin1 => latin1::decode_to_utf8(input, output),
+        _ => unimplemented!(), // Panics: the remaining encodings have no decoder yet.
+    }
+}
+
+/// Describes a text encoding.
+#[derive(Debug, Copy, Clone)]
+pub enum Encoding {
+    Utf8,
+    Utf16BE, // Big endian
+    Utf16LE, // Little endian
+    Utf32BE, // Big endian
+    Utf32LE, // Little endian
+    ShiftJIS,
+    Big5,
+    Latin1,      // ISO/IEC 8859-1
+    Windows1252, // Windows code page 1252
+}
+
+/// Result type for encoding text from utf8 to a target encoding.
+///
+/// The Ok() variant provides the number of bytes consumed and the
+/// number of bytes written, in that order.
+pub type EncodeResult = Result<(usize, usize), EncodeError>;
+
+/// Result type for decoding text from a target encoding to utf8.
+///
+/// The Ok() variant provides the number of bytes consumed and a reference
+/// to the valid decoded text.
+pub type DecodeResult<'a> = Result<(usize, &'a str), DecodeError>;
+
+/// Represents an error when encoding from utf8 to some other format.
+///
+/// Since valid input utf8 is statically assumed, the only possible
+/// error is encountering a char that is not representable in the target
+/// encoding.
+///
+/// The problematic character, the byte offset of that character
+/// in the input utf8, and the number of bytes already written to the output
+/// buffer are provided.
+///
+/// It is guaranteed that all input leading up to the problem character has
+/// already been encoded and written to the output buffer.
+#[derive(Debug, Copy, Clone)]
+pub struct EncodeError {
+    pub character: char,
+    pub byte_offset: usize,
+    pub bytes_written: usize,
+}
+
+/// Represents an error when decoding to utf8 from some other format.
+///
+/// All supported text encodings can be fully represented in utf8, and
+/// therefore the only possible error is that we encounter bytes in the
+/// input data that are invalid for the text encoding we're attempting
+/// to decode from.
+///
+/// The byte offset of the invalid data in the input, and the number of
+/// bytes already written to the output buffer when it was encountered,
+/// are provided.
+///
+/// It is guaranteed that all input leading up to the invalid data has
+/// already been decoded and written to the output buffer.
+#[derive(Debug, Copy, Clone)]
+pub struct DecodeError {
+    pub byte_offset: usize,
+    pub bytes_written: usize,
+}
diff --git a/sub_crates/text_encoding/src/utf16_be.rs b/sub_crates/text_encoding/src/utf16_be.rs
new file mode 100644
index 0000000..439beb6
--- /dev/null
+++ b/sub_crates/text_encoding/src/utf16_be.rs
@@ -0,0 +1,148 @@
+//! Encoding/decoding functions for big-endian UTF-16.
+//!
+//! Because both utf8 and utf16 can represent the entirety of unicode, the
+//! only possible error is when invalid utf16 is encountered when decoding
+//! to utf8.
+
+use std;
+use {DecodeError, DecodeResult, EncodeResult};
+
+/// Splits `n` into its two bytes, most significant byte first.
+fn to_big_endian(n: u16) -> [u8; 2] {
+    // Shifts operate on the value, so this is the same on any host.
+    [(n >> 8) as u8, n as u8]
+}
+
+/// Joins two bytes, most significant byte first, into a `u16`.
+fn from_big_endian(n: [u8; 2]) -> u16 {
+    (u16::from(n[0]) << 8) | u16::from(n[1])
+}
+
+/// Encodes `input` as big-endian utf16 into `output`.
+///
+/// Returns `Ok((bytes_consumed, bytes_written))`. Encoding stops early,
+/// still returning `Ok`, once the next char no longer fits in `output`, so
+/// only whole chars are ever written.
+///
+/// This function never returns `Err`.
+pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
+    let mut input_i = 0;
+    let mut output_i = 0;
+    for (offset, c) in input.char_indices() {
+        let mut code = c as u32;
+        if code <= 0xFFFF {
+            // One code unit
+            if (output_i + 1) < output.len() {
+                let val = to_big_endian(code as u16);
+                output[output_i] = val[0];
+                output[output_i + 1] = val[1];
+                output_i += 2;
+            } else {
+                break;
+            }
+        } else if (output_i + 3) < output.len() {
+            // Two code units
+            code -= 0x10000;
+            let first = to_big_endian(0xD800 | ((code >> 10) as u16));
+            let second = to_big_endian(0xDC00 | ((code as u16) & 0x3FF));
+            output[output_i] = first[0];
+            output[output_i + 1] = first[1];
+            output[output_i + 2] = second[0];
+            output[output_i + 3] = second[1];
+            output_i += 4;
+        } else {
+            break;
+        }
+
+        // Only reached after a successful write, so nothing is reported as
+        // consumed when not even the first char fits in `output`.
+        input_i = offset + c.len_utf8();
+    }
+
+    Ok((input_i, output_i))
+}
+
+/// Decodes big-endian utf16 `input` into utf8 in `output`.
+///
+/// Returns `Ok((bytes_consumed, decoded_text))`. Decoding stops early, still
+/// returning `Ok`, at a trailing incomplete code unit or surrogate pair, or
+/// once the next char no longer fits in `output`.
+///
+/// Returns `Err` for a low surrogate that has no preceding high surrogate,
+/// and for a high surrogate that is not followed by a low surrogate. In
+/// both cases `byte_offset` is the offset of the first code unit involved.
+pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
+    let mut input_i = 0;
+    let mut output_i = 0;
+
+    // Loop through the input, getting 2 bytes at a time.
+    let mut itr = input.chunks(2);
+    while let Some(bytes) = itr.next() {
+        if bytes.len() < 2 {
+            break;
+        }
+
+        // Decode to scalar value.
+        let code = {
+            let code_1 = from_big_endian([bytes[0], bytes[1]]);
+            if code_1 < 0xD800 || code_1 > 0xDFFF {
+                // Single code unit.
+                // SAFETY: every u16 outside 0xD800..=0xDFFF is a valid
+                // unicode scalar value.
+                unsafe { std::char::from_u32_unchecked(code_1 as u32) }
+            } else if (code_1 & 0xFC00) == 0xDC00 {
+                // Error: orphaned second half of a surrogate pair.
+                return Err(DecodeError {
+                    byte_offset: input_i,
+                    bytes_written: output_i,
+                });
+            } else {
+                // Two code units.
+
+                // Get the second code unit, if possible. It occupies bytes
+                // `input_i + 2` and `input_i + 3`; if those aren't available,
+                // stop here and leave the first half unconsumed.
+                if (input_i + 3) >= input.len() {
+                    break;
+                }
+                // Cannot panic: the check above guarantees another full chunk.
+                let bytes_2 = itr.next().unwrap();
+                let code_2 = from_big_endian([bytes_2[0], bytes_2[1]]);
+                if (code_2 & 0xFC00) != 0xDC00 {
+                    // Error: second half is not valid surrogate.
+                    return Err(DecodeError {
+                        byte_offset: input_i,
+                        bytes_written: output_i,
+                    });
+                }
+
+                // SAFETY: a high surrogate (0xD800..=0xDBFF) combined with a
+                // low surrogate (0xDC00..=0xDFFF) always yields a scalar value
+                // in 0x10000..=0x10FFFF.
+                unsafe {
+                    std::char::from_u32_unchecked(
+                        (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000,
+                    )
+                }
+            }
+        };
+
+        // Encode to utf8.
+        let mut buf = [0u8; 4];
+        let s = code.encode_utf8(&mut buf);
+        if (output_i + s.len()) > output.len() {
+            break;
+        }
+        output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes());
+
+        // Update our counters.
+        input_i += code.len_utf16() * 2;
+        output_i += s.len();
+    }
+
+    // SAFETY: `output[..output_i]` holds only whole chars that were produced
+    // by `char::encode_utf8()` above.
+    Ok((input_i, unsafe {
+        std::str::from_utf8_unchecked(&output[..output_i])
+    }))
+}
diff --git a/sub_crates/text_encoding/src/utf16_le.rs b/sub_crates/text_encoding/src/utf16_le.rs
new file mode 100644
index 0000000..9e235c3
--- /dev/null
+++ b/sub_crates/text_encoding/src/utf16_le.rs
@@ -0,0 +1,148 @@
+//! Encoding/decoding functions for little-endian UTF-16.
+//!
+//! Because both utf8 and utf16 can represent the entirety of unicode, the
+//! only possible error is when invalid utf16 is encountered when decoding
+//! to utf8.
+
+use std;
+use {DecodeError, DecodeResult, EncodeResult};
+
+/// Splits `n` into its two bytes, least significant byte first.
+fn to_little_endian(n: u16) -> [u8; 2] {
+    // Shifts operate on the value, so this is the same on any host.
+    [n as u8, (n >> 8) as u8]
+}
+
+/// Joins two bytes, least significant byte first, into a `u16`.
+fn from_little_endian(n: [u8; 2]) -> u16 {
+    u16::from(n[0]) | (u16::from(n[1]) << 8)
+}
+
+/// Encodes `input` as little-endian utf16 into `output`.
+///
+/// Returns `Ok((bytes_consumed, bytes_written))`. Encoding stops early,
+/// still returning `Ok`, once the next char no longer fits in `output`, so
+/// only whole chars are ever written.
+///
+/// This function never returns `Err`.
+pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
+    let mut input_i = 0;
+    let mut output_i = 0;
+    for (offset, c) in input.char_indices() {
+        let mut code = c as u32;
+        if code <= 0xFFFF {
+            // One code unit
+            if (output_i + 1) < output.len() {
+                let val = to_little_endian(code as u16);
+                output[output_i] = val[0];
+                output[output_i + 1] = val[1];
+                output_i += 2;
+            } else {
+                break;
+            }
+        } else if (output_i + 3) < output.len() {
+            // Two code units
+            code -= 0x10000;
+            let first = to_little_endian(0xD800 | ((code >> 10) as u16));
+            let second = to_little_endian(0xDC00 | ((code as u16) & 0x3FF));
+            output[output_i] = first[0];
+            output[output_i + 1] = first[1];
+            output[output_i + 2] = second[0];
+            output[output_i + 3] = second[1];
+            output_i += 4;
+        } else {
+            break;
+        }
+
+        // Only reached after a successful write, so nothing is reported as
+        // consumed when not even the first char fits in `output`.
+        input_i = offset + c.len_utf8();
+    }
+
+    Ok((input_i, output_i))
+}
+
+/// Decodes little-endian utf16 `input` into utf8 in `output`.
+///
+/// Returns `Ok((bytes_consumed, decoded_text))`. Decoding stops early, still
+/// returning `Ok`, at a trailing incomplete code unit or surrogate pair, or
+/// once the next char no longer fits in `output`.
+///
+/// Returns `Err` for a low surrogate that has no preceding high surrogate,
+/// and for a high surrogate that is not followed by a low surrogate. In
+/// both cases `byte_offset` is the offset of the first code unit involved.
+pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
+    let mut input_i = 0;
+    let mut output_i = 0;
+
+    // Loop through the input, getting 2 bytes at a time.
+    let mut itr = input.chunks(2);
+    while let Some(bytes) = itr.next() {
+        if bytes.len() < 2 {
+            break;
+        }
+
+        // Decode to scalar value.
+        let code = {
+            let code_1 = from_little_endian([bytes[0], bytes[1]]);
+            if code_1 < 0xD800 || code_1 > 0xDFFF {
+                // Single code unit.
+                // SAFETY: every u16 outside 0xD800..=0xDFFF is a valid
+                // unicode scalar value.
+                unsafe { std::char::from_u32_unchecked(code_1 as u32) }
+            } else if (code_1 & 0xFC00) == 0xDC00 {
+                // Error: orphaned second half of a surrogate pair.
+                return Err(DecodeError {
+                    byte_offset: input_i,
+                    bytes_written: output_i,
+                });
+            } else {
+                // Two code units.
+
+                // Get the second code unit, if possible. It occupies bytes
+                // `input_i + 2` and `input_i + 3`; if those aren't available,
+                // stop here and leave the first half unconsumed.
+                if (input_i + 3) >= input.len() {
+                    break;
+                }
+                // Cannot panic: the check above guarantees another full chunk.
+                let bytes_2 = itr.next().unwrap();
+                let code_2 = from_little_endian([bytes_2[0], bytes_2[1]]);
+                if (code_2 & 0xFC00) != 0xDC00 {
+                    // Error: second half is not valid surrogate.
+                    return Err(DecodeError {
+                        byte_offset: input_i,
+                        bytes_written: output_i,
+                    });
+                }
+
+                // SAFETY: a high surrogate (0xD800..=0xDBFF) combined with a
+                // low surrogate (0xDC00..=0xDFFF) always yields a scalar value
+                // in 0x10000..=0x10FFFF.
+                unsafe {
+                    std::char::from_u32_unchecked(
+                        (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000,
+                    )
+                }
+            }
+        };
+
+        // Encode to utf8.
+        let mut buf = [0u8; 4];
+        let s = code.encode_utf8(&mut buf);
+        if (output_i + s.len()) > output.len() {
+            break;
+        }
+        output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes());
+
+        // Update our counters.
+        input_i += code.len_utf16() * 2;
+        output_i += s.len();
+    }
+
+    // SAFETY: `output[..output_i]` holds only whole chars that were produced
+    // by `char::encode_utf8()` above.
+    Ok((input_i, unsafe {
+        std::str::from_utf8_unchecked(&output[..output_i])
+    }))
+}
diff --git a/sub_crates/text_encoding/src/utf8.rs b/sub_crates/text_encoding/src/utf8.rs
new file mode 100644
index 0000000..404edf8
--- /dev/null
+++ b/sub_crates/text_encoding/src/utf8.rs
@@ -0,0 +1,51 @@
+//! These functions are essentially redundant, since they're supposedly
+//! encoding/decoding between utf8 and... utf8. However, `decode_to_utf8()`
+//! is still useful for validating unknown input. And they allow a uniform
+//! API for all encodings.
+
+use std;
+use {DecodeError, DecodeResult, EncodeResult};
+
+/// Copies as much of `input` as fits into `output`, in whole chars.
+///
+/// Always returns `Ok((n, n))`, where `n` is the number of bytes copied.
+pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
+    // Back up to a char boundary so that no char is split.
+    let mut copy_len = std::cmp::min(input.len(), output.len());
+    while !input.is_char_boundary(copy_len) {
+        copy_len -= 1;
+    }
+
+    output[..copy_len].copy_from_slice(input[..copy_len].as_bytes());
+
+    Ok((copy_len, copy_len))
+}
+
+/// Validates `input` as utf8 and copies its valid prefix into `output`.
+///
+/// Returns `Err` only if `input` starts with bytes that are invalid utf8; a
+/// char merely cut off by the end of `input` is left unconsumed instead.
+pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
+    let valid_up_to = match std::str::from_utf8(input) {
+        Ok(text) => text.len(),
+        Err(e) => {
+            // `error_len()` is `None` when the input just ends mid-char.
+            if e.valid_up_to() == 0 && e.error_len().is_some() {
+                return Err(DecodeError {
+                    byte_offset: 0,
+                    bytes_written: 0,
+                });
+            }
+            e.valid_up_to()
+        }
+    };
+
+    // SAFETY: `from_utf8()` verified `input[..valid_up_to]` just above.
+    let valid = unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) };
+    let (in_consumed, out_written) = encode_from_utf8(valid, output).unwrap();
+
+    // SAFETY: `encode_from_utf8()` copied only whole chars into `output`.
+    Ok((in_consumed, unsafe {
+        std::str::from_utf8_unchecked(&output[..out_written])
+    }))
+}