WIP creating a clean frontend/backend separation.

- Started work on writing a new backend. - Started work on writing text encoding handling.
2018-08-17 20:34:43 -07:00 · 2018-08-17 20:34:43 -07:00 · 0ee183aa72
commit 0ee183aa72
parent b713b72e72
11 changed files with 579 additions and 1 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2,12 +2,14 @@
 name = "Led"
 version = "0.0.2"
 dependencies = [
 "backend 0.1.0",
 "docopt 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)",
 "ropey 0.8.4 (git+https://github.com/cessen/ropey)",
 "serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)",
 "serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)",
 "smallvec 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
 "termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "text_encoding 0.1.0",
 "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
@ -20,6 +22,14 @@ dependencies = [
 "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "backend"
 version = "0.1.0"
 dependencies = [
 "ropey 0.8.4 (git+https://github.com/cessen/ropey)",
 "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "docopt"
 version = "0.8.3"
@ -155,6 +165,10 @@ dependencies = [
 "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "text_encoding"
 version = "0.1.0"
 [[package]]
 name = "thread_local"
 version = "0.3.5"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,3 +1,9 @@
 [workspace]
 members = [
    "sub_crates/backend",
    "sub_crates/text_encoding",
 ]
 [package]
 name = "Led"
 version = "0.0.2"
@ -17,4 +23,11 @@ serde = "1.*"
 serde_derive = "1.*"
 docopt = "0.8"
 smallvec = "0.6"
-termion = "1.5"
+termion = "1.5"
 # Local crate dependencies
 [dependencies.backend]
 path = "sub_crates/backend"
 [dependencies.text_encoding]
 path = "sub_crates/text_encoding"
--- a/sub_crates/backend/Cargo.toml
+++ b/sub_crates/backend/Cargo.toml
@ -0,0 +1,14 @@
 [package]
 name = "backend"
 version = "0.1.0"
 authors = ["Nathan Vegdahl <cessen@cessen.com>"]
 license = "MIT"
 [lib]
 name = "backend"
 path = "src/lib.rs"
 [dependencies]
 # ropey = "0.8"
 ropey = { git = "https://github.com/cessen/ropey", branch = "master" }
 unicode-segmentation = "1.2.1"
--- a/sub_crates/backend/src/buffer.rs
+++ b/sub_crates/backend/src/buffer.rs
@ -0,0 +1,9 @@
 use ropey::Rope;
 /// A text buffer: the text itself plus a small amount of metadata about it.
 #[derive(Debug, Clone)]
 pub struct Buffer {
    // Not yet wired up: the encoding type lives in the `text_encoding` crate.
    // on_disk_encoding: Encoding,
    // NOTE(review): presumably identifies the kind of content held (e.g. a
    // file type) — confirm once something reads this field.
    content_type: String,
    // NOTE(review): presumably set when the text has unsaved modifications —
    // confirm against the code that will eventually toggle it.
    is_dirty: bool,
    text: Rope, // The actual text content.
 }
--- a/sub_crates/backend/src/lib.rs
+++ b/sub_crates/backend/src/lib.rs
@ -0,0 +1,4 @@
 extern crate ropey;
 extern crate unicode_segmentation;
 pub mod buffer;
--- a/sub_crates/text_encoding/Cargo.toml
+++ b/sub_crates/text_encoding/Cargo.toml
@ -0,0 +1,9 @@
 [package]
 name = "text_encoding"
 version = "0.1.0"
 authors = ["Nathan Vegdahl <cessen@cessen.com>"]
 license = "MIT"
 [lib]
 name = "text_encoding"
 path = "src/lib.rs"
--- a/sub_crates/text_encoding/src/latin1.rs
+++ b/sub_crates/text_encoding/src/latin1.rs
@ -0,0 +1,71 @@
 //! Encoding/decoding functions for ISO/IEC 8859-1 (or "latin1"), which
 //! conveniently happens to map 1-to-1 to the first 256 unicode scalar values.
 //!
 //! Because latin1 is a single-byte encoding where all bytes are valid,
 //! decoding cannot fail.  However, encoding will fail with scalar values
 //! greater than 255.
 use std;
 use {DecodeResult, EncodeError, EncodeResult};
 /// Encodes utf8 text into latin1, writing the result into `output`.
 ///
 /// Encoding proceeds one character at a time and stops when either the
 /// input is exhausted or `output` is full.
 ///
 /// Returns `Ok((bytes_consumed, bytes_written))`, where `bytes_consumed` is
 /// the number of bytes of `input` that were actually encoded (always on a
 /// char boundary) and `bytes_written` is the number of bytes stored in
 /// `output`.
 ///
 /// # Errors
 ///
 /// Returns an `EncodeError` if a character with a scalar value greater
 /// than 255 is reached while there is still room in `output`.  Everything
 /// before that character has already been written.
 pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    let mut input_i = 0;
    let mut output_i = 0;
    for (offset, c) in input.char_indices() {
        if output_i >= output.len() {
            break;
        }
        if c as u32 > 255 {
            return Err(EncodeError {
                character: c,
                byte_offset: offset,
                bytes_written: output_i,
            });
        }
        output[output_i] = c as u8;
        output_i += 1;
        // Record the end of the character just encoded, so that the count
        // stays at zero when nothing could be written at all.
        input_i = offset + c.len_utf8();
    }

    Ok((input_i, output_i))
 }
 /// Decodes latin1 bytes into utf8, writing the result into `output`.
 ///
 /// Every latin1 byte is valid, so this never returns an error.  Bytes below
 /// 0x80 become one utf8 byte; all others become a two-byte sequence.
 /// Decoding stops early if the next character would not fit in `output`.
 ///
 /// Returns the number of input bytes consumed and the decoded text.
 pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut consumed = 0;
    let mut written = 0;

    while consumed < input.len() {
        let b = input[consumed];
        let needed = if b < 0x80 { 1 } else { 2 };

        // Stop as soon as the next character would overflow the output.
        if written + needed > output.len() {
            break;
        }

        if needed == 1 {
            output[written] = b;
        } else {
            // Two-byte utf8 form: 110xxxxx 10xxxxxx.
            output[written] = 0xC0 | (b >> 6);
            output[written + 1] = 0x80 | (b & 0x3F);
        }

        consumed += 1;
        written += needed;
    }

    // SAFETY: only complete, well-formed one- and two-byte utf8 sequences
    // were written to `output[..written]` above.
    let text = unsafe { std::str::from_utf8_unchecked(&output[..written]) };
    Ok((consumed, text))
 }
--- a/sub_crates/text_encoding/src/lib.rs
+++ b/sub_crates/text_encoding/src/lib.rs
@ -0,0 +1,97 @@
 //! A library for incrementally encoding/decoding between utf8 and various
 //! text encodings.
 mod latin1;
 mod utf16_be;
 mod utf16_le;
 mod utf8;
 /// Encodes utf8 `input` into `output` using `output_encoding`.
 ///
 /// This simply forwards to the encoder module for the requested encoding.
 ///
 /// # Panics
 ///
 /// Panics (via `unimplemented!()`) for encodings that have no encoder yet.
 pub fn encode_from_utf8(output_encoding: Encoding, input: &str, output: &mut [u8]) -> EncodeResult {
    match output_encoding {
        Encoding::Latin1 => latin1::encode_from_utf8(input, output),
        Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
        Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
        Encoding::Utf8 => utf8::encode_from_utf8(input, output),

        // No encoder exists for these yet.
        Encoding::Utf32BE
        | Encoding::Utf32LE
        | Encoding::ShiftJIS
        | Encoding::Big5
        | Encoding::Windows1252 => unimplemented!(),
    }
 }
 /// Decodes `input`, interpreted as `input_encoding`, into utf8 in `output`.
 ///
 /// This simply forwards to the decoder module for the requested encoding.
 ///
 /// # Panics
 ///
 /// Panics (via `unimplemented!()`) for encodings that have no decoder yet.
 pub fn decode_to_utf8<'a>(
    input_encoding: Encoding,
    input: &[u8],
    output: &'a mut [u8],
 ) -> DecodeResult<'a> {
    match input_encoding {
        Encoding::Latin1 => latin1::decode_to_utf8(input, output),
        Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
        Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
        Encoding::Utf8 => utf8::decode_to_utf8(input, output),

        // No decoder exists for these yet.
        Encoding::Utf32BE
        | Encoding::Utf32LE
        | Encoding::ShiftJIS
        | Encoding::Big5
        | Encoding::Windows1252 => unimplemented!(),
    }
 }
 /// Describes a text encoding.
 ///
 /// Note that `encode_from_utf8()` and `decode_to_utf8()` currently only
 /// handle `Utf8`, `Utf16BE`, `Utf16LE`, and `Latin1`; passing any other
 /// variant to them hits `unimplemented!()`.
 #[derive(Debug, Copy, Clone)]
 pub enum Encoding {
    Utf8,
    Utf16BE, // Big endian
    Utf16LE, // Little endian
    Utf32BE, // Big endian
    Utf32LE, // Little endian
    ShiftJIS,
    Big5,
    Latin1,      // ISO/IEC 8859-1
    Windows1252, // Windows code page 1252
 }
 /// Result type for encoding text from utf8 to a target encoding.
 ///
 /// The Ok() variant provides the number of input (utf8) bytes consumed and
 /// the number of output bytes written, in that order.
 pub type EncodeResult = Result<(usize, usize), EncodeError>;
 /// Result type for decoding text from a source encoding to utf8.
 ///
 /// The Ok() variant provides the number of input bytes consumed and a
 /// reference to the valid decoded text, which borrows from the output
 /// buffer that was passed in.
 pub type DecodeResult<'a> = Result<(usize, &'a str), DecodeError>;
 /// Represents an error when encoding from utf8 to some other format.
 ///
 /// Since valid input utf8 is statically assumed, the only possible
 /// error is encountering a char that is not representable in the target
 /// encoding.
 ///
 /// The problematic character, the byte offset of that character
 /// in the input utf8, and the number of bytes already written to the output
 /// buffer are provided.
 ///
 /// It is guaranteed that all input leading up to the problem character has
 /// already been encoded and written to the output buffer.
 #[derive(Debug, Copy, Clone)]
 pub struct EncodeError {
    /// The character that could not be represented in the target encoding.
    pub character: char,
    /// Byte offset of `character` within the utf8 input.
    pub byte_offset: usize,
    /// Number of bytes written to the output buffer before the error.
    pub bytes_written: usize,
 }
 /// Represents an error when decoding to utf8 from some other format.
 ///
 /// All supported text encodings can be fully represented in utf8, and
 /// therefore the only possible error is that we encounter bytes in the
 /// input data that are invalid for the text encoding we're attempting
 /// to decode from.
 ///
 /// The byte offset of the invalid input data and the number of bytes
 /// already written to the output buffer are provided.
 ///
 /// It is guaranteed that all input leading up to the invalid data has
 /// already been decoded and written to the output buffer.
 #[derive(Debug, Copy, Clone)]
 pub struct DecodeError {
    /// Byte offset of the invalid data within the input.
    pub byte_offset: usize,
    /// Number of utf8 bytes written to the output buffer before the error.
    pub bytes_written: usize,
 }
--- a/sub_crates/text_encoding/src/utf16_be.rs
+++ b/sub_crates/text_encoding/src/utf16_be.rs
@ -0,0 +1,148 @@
 //! Encoding/decoding functions for big-endian UTF-16.
 //!
 //! Because both utf8 and utf16 can represent the entirety of unicode, the
 //! only possible error is when invalid utf16 is encountered when decoding
 //! to utf8.
 use std;
 use {DecodeError, DecodeResult, EncodeResult};
 /// Splits `n` into its two bytes in big-endian order (most significant
 /// byte first).
 ///
 /// Plain shifts are used so the result is independent of the host's
 /// endianness and no `unsafe` code is needed.
 fn to_big_endian(n: u16) -> [u8; 2] {
    [(n >> 8) as u8, n as u8]
 }
 /// Builds a `u16` from two bytes given in big-endian order (most
 /// significant byte first).
 ///
 /// Plain shifts are used so the result is independent of the host's
 /// endianness and no `unsafe` code is needed.
 fn from_big_endian(n: [u8; 2]) -> u16 {
    ((n[0] as u16) << 8) | (n[1] as u16)
 }
 /// Encodes utf8 text into big-endian UTF-16, writing the result into
 /// `output`.
 ///
 /// Characters are encoded one at a time; encoding stops when the input is
 /// exhausted or the next character's code unit(s) would not fit in the
 /// remaining space of `output`.
 ///
 /// Returns `Ok((bytes_consumed, bytes_written))`, where `bytes_consumed` is
 /// the number of bytes of `input` that were actually encoded (always on a
 /// char boundary).  This function never returns an error, since every
 /// `char` is representable in UTF-16.
 pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    let mut input_i = 0;
    let mut output_i = 0;
    for (offset, c) in input.char_indices() {
        let mut code = c as u32;
        if code <= 0xFFFF {
            // One code unit.
            if output_i + 2 > output.len() {
                break;
            }
            let val = to_big_endian(code as u16);
            output[output_i..output_i + 2].copy_from_slice(&val);
            output_i += 2;
        } else {
            // Two code units (a surrogate pair).
            if output_i + 4 > output.len() {
                break;
            }
            code -= 0x10000;
            let first = to_big_endian(0xD800 | ((code >> 10) as u16));
            let second = to_big_endian(0xDC00 | ((code as u16) & 0x3FF));
            output[output_i..output_i + 2].copy_from_slice(&first);
            output[output_i + 2..output_i + 4].copy_from_slice(&second);
            output_i += 4;
        }
        // Record the end of the character just encoded, so that the count
        // stays at zero when not even the first character fit.
        input_i = offset + c.len_utf8();
    }

    Ok((input_i, output_i))
 }
 /// Decodes big-endian UTF-16 bytes into utf8, writing the result into
 /// `output`.
 ///
 /// Decoding proceeds one character at a time and stops when the input is
 /// exhausted, when the remaining input holds only part of a character (an
 /// odd trailing byte, or a high surrogate without its partner yet), or when
 /// the next character would not fit in `output`.
 ///
 /// Returns the number of input bytes consumed and the decoded text.
 ///
 /// # Errors
 ///
 /// Returns a `DecodeError` pointing at the offending code unit if a low
 /// surrogate appears without a preceding high surrogate, or if a high
 /// surrogate is followed by something other than a low surrogate.
 pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;

    // Walk the input one code unit (2 bytes) at a time.
    while input_i + 1 < input.len() {
        let code_1 = from_big_endian([input[input_i], input[input_i + 1]]);

        // Decode to a scalar value, noting how many input bytes it spans.
        let (code, consumed) = if code_1 < 0xD800 || code_1 > 0xDFFF {
            // Single code unit.
            // SAFETY: values outside the surrogate range that fit in 16 bits
            // are always valid unicode scalar values.
            (unsafe { std::char::from_u32_unchecked(code_1 as u32) }, 2)
        } else if (code_1 & 0xFC00) == 0xDC00 {
            // Error: orphaned second half of a surrogate pair.
            return Err(DecodeError {
                byte_offset: input_i,
                bytes_written: output_i,
            });
        } else {
            // Two code units.  If the second one hasn't arrived yet, stop
            // and let the caller supply more input.
            if input_i + 3 >= input.len() {
                break;
            }
            let code_2 = from_big_endian([input[input_i + 2], input[input_i + 3]]);
            if (code_2 & 0xFC00) != 0xDC00 {
                // Error: second half is not a valid low surrogate.
                return Err(DecodeError {
                    byte_offset: input_i,
                    bytes_written: output_i,
                });
            }
            let scalar =
                (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000;
            // SAFETY: a valid high/low surrogate pair always combines into
            // a scalar value in the range 0x10000..=0x10FFFF.
            (unsafe { std::char::from_u32_unchecked(scalar) }, 4)
        };

        // Encode to utf8.
        let mut buf = [0u8; 4];
        let s = code.encode_utf8(&mut buf);
        if output_i + s.len() > output.len() {
            break;
        }
        output[output_i..output_i + s.len()].copy_from_slice(s.as_bytes());

        // Only advance once the character has actually been written.
        input_i += consumed;
        output_i += s.len();
    }

    // SAFETY: only complete utf8 encodings of valid chars were written to
    // `output[..output_i]`.
    Ok((input_i, unsafe {
        std::str::from_utf8_unchecked(&output[..output_i])
    }))
 }
--- a/sub_crates/text_encoding/src/utf16_le.rs
+++ b/sub_crates/text_encoding/src/utf16_le.rs
@ -0,0 +1,148 @@
 //! Encoding/decoding functions for little-endian UTF-16.
 //!
 //! Because both utf8 and utf16 can represent the entirety of unicode, the
 //! only possible error is when invalid utf16 is encountered when decoding
 //! to utf8.
 use std;
 use {DecodeError, DecodeResult, EncodeResult};
 /// Splits `n` into its two bytes in little-endian order (least significant
 /// byte first).
 ///
 /// Plain shifts are used so the result is independent of the host's
 /// endianness and no `unsafe` code is needed.
 fn to_little_endian(n: u16) -> [u8; 2] {
    [n as u8, (n >> 8) as u8]
 }
 /// Builds a `u16` from two bytes given in little-endian order (least
 /// significant byte first).
 ///
 /// Plain shifts are used so the result is independent of the host's
 /// endianness and no `unsafe` code is needed.
 fn from_little_endian(n: [u8; 2]) -> u16 {
    (n[0] as u16) | ((n[1] as u16) << 8)
 }
 /// Encodes utf8 text into little-endian UTF-16, writing the result into
 /// `output`.
 ///
 /// Characters are encoded one at a time; encoding stops when the input is
 /// exhausted or the next character's code unit(s) would not fit in the
 /// remaining space of `output`.
 ///
 /// Returns `Ok((bytes_consumed, bytes_written))`, where `bytes_consumed` is
 /// the number of bytes of `input` that were actually encoded (always on a
 /// char boundary).  This function never returns an error, since every
 /// `char` is representable in UTF-16.
 pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    let mut input_i = 0;
    let mut output_i = 0;
    for (offset, c) in input.char_indices() {
        let mut code = c as u32;
        if code <= 0xFFFF {
            // One code unit.
            if output_i + 2 > output.len() {
                break;
            }
            let val = to_little_endian(code as u16);
            output[output_i..output_i + 2].copy_from_slice(&val);
            output_i += 2;
        } else {
            // Two code units (a surrogate pair).
            if output_i + 4 > output.len() {
                break;
            }
            code -= 0x10000;
            let first = to_little_endian(0xD800 | ((code >> 10) as u16));
            let second = to_little_endian(0xDC00 | ((code as u16) & 0x3FF));
            output[output_i..output_i + 2].copy_from_slice(&first);
            output[output_i + 2..output_i + 4].copy_from_slice(&second);
            output_i += 4;
        }
        // Record the end of the character just encoded, so that the count
        // stays at zero when not even the first character fit.
        input_i = offset + c.len_utf8();
    }

    Ok((input_i, output_i))
 }
 /// Decodes little-endian UTF-16 bytes into utf8, writing the result into
 /// `output`.
 ///
 /// Decoding proceeds one character at a time and stops when the input is
 /// exhausted, when the remaining input holds only part of a character (an
 /// odd trailing byte, or a high surrogate without its partner yet), or when
 /// the next character would not fit in `output`.
 ///
 /// Returns the number of input bytes consumed and the decoded text.
 ///
 /// # Errors
 ///
 /// Returns a `DecodeError` pointing at the offending code unit if a low
 /// surrogate appears without a preceding high surrogate, or if a high
 /// surrogate is followed by something other than a low surrogate.
 pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;

    // Walk the input one code unit (2 bytes) at a time.
    while input_i + 1 < input.len() {
        let code_1 = from_little_endian([input[input_i], input[input_i + 1]]);

        // Decode to a scalar value, noting how many input bytes it spans.
        let (code, consumed) = if code_1 < 0xD800 || code_1 > 0xDFFF {
            // Single code unit.
            // SAFETY: values outside the surrogate range that fit in 16 bits
            // are always valid unicode scalar values.
            (unsafe { std::char::from_u32_unchecked(code_1 as u32) }, 2)
        } else if (code_1 & 0xFC00) == 0xDC00 {
            // Error: orphaned second half of a surrogate pair.
            return Err(DecodeError {
                byte_offset: input_i,
                bytes_written: output_i,
            });
        } else {
            // Two code units.  If the second one hasn't arrived yet, stop
            // and let the caller supply more input.
            if input_i + 3 >= input.len() {
                break;
            }
            let code_2 = from_little_endian([input[input_i + 2], input[input_i + 3]]);
            if (code_2 & 0xFC00) != 0xDC00 {
                // Error: second half is not a valid low surrogate.
                return Err(DecodeError {
                    byte_offset: input_i,
                    bytes_written: output_i,
                });
            }
            let scalar =
                (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000;
            // SAFETY: a valid high/low surrogate pair always combines into
            // a scalar value in the range 0x10000..=0x10FFFF.
            (unsafe { std::char::from_u32_unchecked(scalar) }, 4)
        };

        // Encode to utf8.
        let mut buf = [0u8; 4];
        let s = code.encode_utf8(&mut buf);
        if output_i + s.len() > output.len() {
            break;
        }
        output[output_i..output_i + s.len()].copy_from_slice(s.as_bytes());

        // Only advance once the character has actually been written.
        input_i += consumed;
        output_i += s.len();
    }

    // SAFETY: only complete utf8 encodings of valid chars were written to
    // `output[..output_i]`.
    Ok((input_i, unsafe {
        std::str::from_utf8_unchecked(&output[..output_i])
    }))
 }
--- a/sub_crates/text_encoding/src/utf8.rs
+++ b/sub_crates/text_encoding/src/utf8.rs
@ -0,0 +1,51 @@
 //! These functions are essentially redundant, since they're supposedly
 //! encoding/decoding between utf8 and... utf8.  However, `decode_to_utf8()`
 //! is still useful for validating unknown input.  And they allow a uniform
 //! API for all encodings.
 use std;
 use {DecodeError, DecodeResult, EncodeResult};
 /// "Encodes" utf8 to utf8: copies as much of `input` into `output` as fits
 /// without splitting a character.
 ///
 /// Returns `Ok((n, n))`, where `n` is the number of bytes copied.  This
 /// never returns an error.
 pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    // Start from the largest length that fits, then back off to the nearest
    // char boundary.  Offset 0 is always a boundary, so this terminates.
    let mut n = input.len().min(output.len());
    while !input.is_char_boundary(n) {
        n -= 1;
    }

    output[..n].copy_from_slice(&input.as_bytes()[..n]);
    Ok((n, n))
 }
 /// Validates `input` as utf8 and copies the valid leading portion into
 /// `output`.
 ///
 /// As much of the valid prefix as fits in `output` (without splitting a
 /// character) is copied.  If the input merely ends part-way through a
 /// multi-byte sequence, the incomplete tail is left unconsumed so the
 /// caller can retry once more input is available; this is not treated as
 /// an error.
 ///
 /// Returns the number of input bytes consumed and the decoded text.
 ///
 /// # Errors
 ///
 /// Returns a `DecodeError` (with both offsets zero) if the very first
 /// bytes of `input` are definitely invalid utf8.  Invalid data later in
 /// the input is reported on a subsequent call, once the valid prefix
 /// before it has been consumed.
 pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let valid_up_to = match std::str::from_utf8(input) {
        Ok(text) => text.len(),
        Err(e) => {
            // `error_len()` is `None` when the input simply ended in the
            // middle of a sequence, which is expected during incremental
            // decoding and must not be reported as invalid data.
            if e.valid_up_to() == 0 && e.error_len().is_some() {
                return Err(DecodeError {
                    byte_offset: 0,
                    bytes_written: 0,
                });
            }
            e.valid_up_to()
        }
    };

    // SAFETY: `from_utf8` verified that `input[..valid_up_to]` is valid utf8.
    let valid = unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) };
    let (in_consumed, out_written) =
        encode_from_utf8(valid, output).expect("utf8 to utf8 encoding cannot fail");

    // SAFETY: `encode_from_utf8` copies whole characters only, so
    // `output[..out_written]` is valid utf8.
    Ok((in_consumed, unsafe {
        std::str::from_utf8_unchecked(&output[..out_written])
    }))
 }