WIP creating a clean frontend/backend separation.

- Started work on writing a new backend.
- Started work on writing text encoding handling.
parent b713b72e72
commit 0ee183aa72

Cargo.lock  (generated, 14 lines changed)

@@ -2,12 +2,14 @@
name = "Led"
version = "0.0.2"
dependencies = [
 "backend 0.1.0",
 "docopt 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)",
 "ropey 0.8.4 (git+https://github.com/cessen/ropey)",
 "serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)",
 "serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)",
 "smallvec 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
 "termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "text_encoding 0.1.0",
 "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]

@@ -20,6 +22,14 @@ dependencies = [
 "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "backend"
version = "0.1.0"
dependencies = [
 "ropey 0.8.4 (git+https://github.com/cessen/ropey)",
 "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "docopt"
version = "0.8.3"

@@ -155,6 +165,10 @@
 "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "text_encoding"
version = "0.1.0"

[[package]]
name = "thread_local"
version = "0.3.5"

Cargo.toml  (15 lines changed)

@@ -1,3 +1,9 @@
[workspace]
members = [
    "sub_crates/backend",
    "sub_crates/text_encoding",
]

[package]
name = "Led"
version = "0.0.2"

@@ -17,4 +23,11 @@ serde = "1.*"
serde_derive = "1.*"
docopt = "0.8"
smallvec = "0.6"
termion = "1.5"
termion = "1.5"

# Local crate dependencies
[dependencies.backend]
path = "sub_crates/backend"

[dependencies.text_encoding]
path = "sub_crates/text_encoding"

sub_crates/backend/Cargo.toml  (new file, 14 lines)

@@ -0,0 +1,14 @@
[package]
name = "backend"
version = "0.1.0"
authors = ["Nathan Vegdahl <cessen@cessen.com>"]
license = "MIT"

[lib]
name = "backend"
path = "src/lib.rs"

[dependencies]
# ropey = "0.8"
ropey = { git = "https://github.com/cessen/ropey", branch = "master" }
unicode-segmentation = "1.2.1"

sub_crates/backend/src/buffer.rs  (new file, 9 lines)

@@ -0,0 +1,9 @@
use ropey::Rope;

#[derive(Debug, Clone)]
pub struct Buffer {
    // on_disk_encoding: Encoding,
    content_type: String,
    is_dirty: bool,
    text: Rope, // The actual text content.
}

sub_crates/backend/src/lib.rs  (new file, 4 lines)

@@ -0,0 +1,4 @@
extern crate ropey;
extern crate unicode_segmentation;

pub mod buffer;

sub_crates/text_encoding/Cargo.toml  (new file, 9 lines)

@@ -0,0 +1,9 @@
[package]
name = "text_encoding"
version = "0.1.0"
authors = ["Nathan Vegdahl <cessen@cessen.com>"]
license = "MIT"

[lib]
name = "text_encoding"
path = "src/lib.rs"

sub_crates/text_encoding/src/latin1.rs  (new file, 71 lines)

@@ -0,0 +1,71 @@
//! Encoding/decoding functions for ISO/IEC 8859-1 (or "latin1"), which
//! conveniently happens to map 1-to-1 to the first 256 unicode scalar values.
//!
//! Because latin1 is a single-byte encoding where all bytes are valid,
//! decoding cannot fail. However, encoding will fail with scalar values
//! greater than 255.

use std;
use {DecodeResult, EncodeError, EncodeResult};

pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    // Do the encode.
    let mut input_i = 0;
    let mut output_i = 0;
    for (offset, c) in input.char_indices() {
        if output_i >= output.len() {
            break;
        }
        if c as u32 > 255 {
            return Err(EncodeError {
                character: c,
                byte_offset: offset,
                bytes_written: output_i,
            });
        }
        output[output_i] = c as u8;
        output_i += 1;
        input_i = offset;
    }

    // Calculate how much of the input was consumed.
    input_i += 1;
    if input_i > input.len() {
        input_i = input.len();
    } else {
        while !input.is_char_boundary(input_i) {
            input_i += 1;
        }
    }

    Ok((input_i, output_i))
}

pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;
    for &byte in input.iter() {
        if byte <= 127 {
            // 1-byte case
            if output_i >= output.len() {
                break;
            }
            output[output_i] = byte;
            input_i += 1;
            output_i += 1;
        } else {
            // 2-byte case
            if (output_i + 1) >= output.len() {
                break;
            }
            output[output_i] = 0b11000000 | (byte >> 6);
            output[output_i + 1] = 0b10000000 | (byte & 0b00111111);
            input_i += 1;
            output_i += 2;
        }
    }

    Ok((input_i, unsafe {
        std::str::from_utf8_unchecked(&output[..output_i])
    }))
}
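Not part of the commit: a small test-style sketch of the failure mode described in the module docs above, assuming it is placed inside this module so it can call encode_from_utf8 directly. The euro sign (U+20AC) is above 255, so encoding stops at it and reports what was already written.

#[test]
fn latin1_encode_rejects_non_latin1_chars() {
    let mut buf = [0u8; 8];
    // "ab€" encodes 'a' and 'b', then fails on '€' (U+20AC > 255).
    let err = encode_from_utf8("ab\u{20AC}", &mut buf).unwrap_err();
    assert_eq!(err.character, '\u{20AC}');
    assert_eq!(err.byte_offset, 2); // '€' starts at utf8 byte 2
    assert_eq!(err.bytes_written, 2); // 'a' and 'b' were written
    assert_eq!(&buf[..2], b"ab");
}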

sub_crates/text_encoding/src/lib.rs  (new file, 97 lines)

@@ -0,0 +1,97 @@
//! A library for incrementally encoding/decoding between utf8 and various
//! text encodings.

mod latin1;
mod utf16_be;
mod utf16_le;
mod utf8;

/// Encodes text from utf8 to a destination encoding.
pub fn encode_from_utf8(output_encoding: Encoding, input: &str, output: &mut [u8]) -> EncodeResult {
    match output_encoding {
        Encoding::Utf8 => utf8::encode_from_utf8(input, output),
        Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
        Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
        Encoding::Latin1 => latin1::encode_from_utf8(input, output),
        _ => unimplemented!(),
    }
}

/// Decodes text from a source encoding to utf8.
pub fn decode_to_utf8<'a>(
    input_encoding: Encoding,
    input: &[u8],
    output: &'a mut [u8],
) -> DecodeResult<'a> {
    match input_encoding {
        Encoding::Utf8 => utf8::decode_to_utf8(input, output),
        Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
        Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
        Encoding::Latin1 => latin1::decode_to_utf8(input, output),
        _ => unimplemented!(),
    }
}

/// Describes a text encoding.
#[derive(Debug, Copy, Clone)]
pub enum Encoding {
    Utf8,
    Utf16BE, // Big endian
    Utf16LE, // Little endian
    Utf32BE, // Big endian
    Utf32LE, // Little endian
    ShiftJIS,
    Big5,
    Latin1,      // ISO/IEC 8859-1
    Windows1252, // Windows code page 1252
}

/// Result type for encoding text from utf8 to a target encoding.
///
/// The Ok() variant provides the number of bytes consumed and the
/// number of bytes written, in that order.
pub type EncodeResult = Result<(usize, usize), EncodeError>;

/// Result type for decoding text from a target encoding to utf8.
///
/// The Ok() variant provides the number of bytes consumed and a reference
/// to the valid decoded text.
pub type DecodeResult<'a> = Result<(usize, &'a str), DecodeError>;

/// Represents an error when encoding from utf8 to some other format.
///
/// Since valid input utf8 is statically assumed, the only possible
/// error is encountering a char that is not representable in the target
/// encoding.
///
/// The problematic character, the byte offset of that character in the
/// input utf8, and the number of bytes already written to the output
/// buffer are provided.
///
/// It is guaranteed that all input leading up to the problem character has
/// already been encoded and written to the output buffer.
#[derive(Debug, Copy, Clone)]
pub struct EncodeError {
    pub character: char,
    pub byte_offset: usize,
    pub bytes_written: usize,
}

/// Represents an error when decoding to utf8 from some other format.
///
/// All supported text encodings can be fully represented in utf8, and
/// therefore the only possible error is that we encounter bytes in the
/// input data that are invalid for the text encoding we're attempting
/// to decode from.
///
/// The byte offset of the invalid input data and the number of bytes
/// already written to the output buffer are provided.
///
/// It is guaranteed that all input leading up to the invalid data has
/// already been decoded and written to the output buffer.
#[derive(Debug, Copy, Clone)]
pub struct DecodeError {
    pub byte_offset: usize,
    pub bytes_written: usize,
}
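Not part of the commit: a minimal usage sketch of the public API above (buffer sizes and input bytes are arbitrary, chosen for illustration). It decodes latin1 bytes to utf8 and then re-encodes the result as big-endian utf16, consuming the (bytes consumed, output produced) pairs that the result types describe.

extern crate text_encoding;

use text_encoding::{decode_to_utf8, encode_from_utf8, Encoding};

fn main() {
    // "café" in latin1: 'é' is the single byte 0xE9.
    let latin1_bytes = [0x63, 0x61, 0x66, 0xE9];
    let mut utf8_buf = [0u8; 8];
    let (consumed, text) =
        decode_to_utf8(Encoding::Latin1, &latin1_bytes, &mut utf8_buf).unwrap();
    assert_eq!(consumed, 4); // all four latin1 bytes consumed
    assert_eq!(text, "café"); // 'é' is now two utf8 bytes

    // Re-encode the decoded text as big-endian utf16.
    let mut utf16_buf = [0u8; 16];
    let (in_consumed, out_written) =
        encode_from_utf8(Encoding::Utf16BE, text, &mut utf16_buf).unwrap();
    assert_eq!(in_consumed, text.len());
    assert_eq!(out_written, 8); // four BMP chars, two bytes each
}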

sub_crates/text_encoding/src/utf16_be.rs  (new file, 148 lines)

@@ -0,0 +1,148 @@
//! Encoding/decoding functions for big-endian UTF-16.
//!
//! Because both utf8 and utf16 can represent the entirety of unicode, the
//! only possible error is when invalid utf16 is encountered when decoding
//! to utf8.

use std;
use {DecodeError, DecodeResult, EncodeResult};

fn to_big_endian(n: u16) -> [u8; 2] {
    use std::mem::transmute;
    let ptr = unsafe { transmute::<*const u16, *const u8>(&n as *const u16) };
    if cfg!(target_endian = "little") {
        unsafe { [*ptr.offset(1), *ptr] }
    } else {
        unsafe { [*ptr, *ptr.offset(1)] }
    }
}

fn from_big_endian(n: [u8; 2]) -> u16 {
    use std::mem::transmute;
    let mut x: u16 = 0;
    let ptr = unsafe { transmute::<*mut u16, *mut u8>(&mut x as *mut u16) };
    if cfg!(target_endian = "little") {
        unsafe {
            *ptr = n[1];
            *ptr.offset(1) = n[0];
        }
    } else {
        unsafe {
            *ptr = n[0];
            *ptr.offset(1) = n[1];
        }
    }
    x
}

pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    // Do the encode.
    let mut input_i = 0;
    let mut output_i = 0;
    for (offset, c) in input.char_indices() {
        let mut code = c as u32;
        if code <= 0xFFFF {
            // One code unit
            if (output_i + 1) < output.len() {
                let val = to_big_endian(code as u16);
                output[output_i] = val[0];
                output[output_i + 1] = val[1];
                output_i += 2;
                input_i = offset;
            } else {
                break;
            }
        } else if (output_i + 3) < output.len() {
            // Two code units
            code -= 0x10000;
            let first = to_big_endian(0xD800 | ((code >> 10) as u16));
            let second = to_big_endian(0xDC00 | ((code as u16) & 0x3FF));
            output[output_i] = first[0];
            output[output_i + 1] = first[1];
            output[output_i + 2] = second[0];
            output[output_i + 3] = second[1];
            output_i += 4;
            input_i = offset;
        } else {
            break;
        }
    }

    // Calculate how much of the input was consumed.
    input_i += 1;
    if input_i > input.len() {
        input_i = input.len();
    } else {
        while !input.is_char_boundary(input_i) {
            input_i += 1;
        }
    }

    Ok((input_i, output_i))
}

pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;

    // Loop through the input, getting 2 bytes at a time.
    let mut itr = input.chunks(2);
    while let Some(bytes) = itr.next() {
        if bytes.len() < 2 {
            break;
        }

        // Decode to scalar value.
        let code = {
            let code_1 = from_big_endian([bytes[0], bytes[1]]);
            if code_1 < 0xD800 || code_1 > 0xDFFF {
                // Single code unit.
                unsafe { std::char::from_u32_unchecked(code_1 as u32) }
            } else if (code_1 & 0xFC00) == 0xDC00 {
                // Error: orphaned second half of a surrogate pair.
                return Err(DecodeError {
                    byte_offset: input_i,
                    bytes_written: output_i,
                });
            } else {
                // Two code units.

                // Get the second code unit, if possible.
                if (input_i + 3) >= input.len() {
                    break;
                }
                let bytes_2 = itr.next().unwrap();
                let code_2 = from_big_endian([bytes_2[0], bytes_2[1]]);
                if (code_2 & 0xFC00) != 0xDC00 {
                    // Error: second half is not a valid surrogate.
                    return Err(DecodeError {
                        byte_offset: input_i,
                        bytes_written: output_i,
                    });
                }

                unsafe {
                    std::char::from_u32_unchecked(
                        (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000,
                    )
                }
            }
        };

        // Encode to utf8.
        let mut buf = [0u8; 4];
        let s = code.encode_utf8(&mut buf);
        if (output_i + s.len()) > output.len() {
            break;
        }
        output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes());

        // Update our counters.
        input_i += code.len_utf16() * 2;
        output_i += s.len();
    }

    Ok((input_i, unsafe {
        std::str::from_utf8_unchecked(&output[..output_i])
    }))
}
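Not part of the commit: a test-style sketch of the surrogate-pair math above, assuming it is placed inside this module. U+10437 encodes to the utf16 pair 0xD801 0xDC37, i.e. the big-endian bytes D8 01 DC 37.

#[test]
fn utf16_be_surrogate_pair_round_trip() {
    // Encode U+10437 (a char outside the BMP) to big-endian utf16.
    let mut encoded = [0u8; 4];
    let (consumed, written) = encode_from_utf8("\u{10437}", &mut encoded).unwrap();
    assert_eq!(consumed, 4); // the char is 4 bytes of utf8
    assert_eq!(written, 4); // two code units, two bytes each
    assert_eq!(encoded, [0xD8, 0x01, 0xDC, 0x37]);

    // Decode it back to utf8.
    let mut decoded = [0u8; 4];
    let (_, text) = decode_to_utf8(&encoded, &mut decoded).unwrap();
    assert_eq!(text, "\u{10437}");
}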

sub_crates/text_encoding/src/utf16_le.rs  (new file, 148 lines)

@@ -0,0 +1,148 @@
//! Encoding/decoding functions for little-endian UTF-16.
//!
//! Because both utf8 and utf16 can represent the entirety of unicode, the
//! only possible error is when invalid utf16 is encountered when decoding
//! to utf8.

use std;
use {DecodeError, DecodeResult, EncodeResult};

fn to_little_endian(n: u16) -> [u8; 2] {
    use std::mem::transmute;
    let ptr = unsafe { transmute::<*const u16, *const u8>(&n as *const u16) };
    if cfg!(target_endian = "little") {
        unsafe { [*ptr, *ptr.offset(1)] }
    } else {
        unsafe { [*ptr.offset(1), *ptr] }
    }
}

fn from_little_endian(n: [u8; 2]) -> u16 {
    use std::mem::transmute;
    let mut x: u16 = 0;
    let ptr = unsafe { transmute::<*mut u16, *mut u8>(&mut x as *mut u16) };
    if cfg!(target_endian = "little") {
        unsafe {
            *ptr = n[0];
            *ptr.offset(1) = n[1];
        }
    } else {
        unsafe {
            *ptr = n[1];
            *ptr.offset(1) = n[0];
        }
    }
    x
}

pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    // Do the encode.
    let mut input_i = 0;
    let mut output_i = 0;
    for (offset, c) in input.char_indices() {
        let mut code = c as u32;
        if code <= 0xFFFF {
            // One code unit
            if (output_i + 1) < output.len() {
                let val = to_little_endian(code as u16);
                output[output_i] = val[0];
                output[output_i + 1] = val[1];
                output_i += 2;
                input_i = offset;
            } else {
                break;
            }
        } else if (output_i + 3) < output.len() {
            // Two code units
            code -= 0x10000;
            let first = to_little_endian(0xD800 | ((code >> 10) as u16));
            let second = to_little_endian(0xDC00 | ((code as u16) & 0x3FF));
            output[output_i] = first[0];
            output[output_i + 1] = first[1];
            output[output_i + 2] = second[0];
            output[output_i + 3] = second[1];
            output_i += 4;
            input_i = offset;
        } else {
            break;
        }
    }

    // Calculate how much of the input was consumed.
    input_i += 1;
    if input_i > input.len() {
        input_i = input.len();
    } else {
        while !input.is_char_boundary(input_i) {
            input_i += 1;
        }
    }

    Ok((input_i, output_i))
}

pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;

    // Loop through the input, getting 2 bytes at a time.
    let mut itr = input.chunks(2);
    while let Some(bytes) = itr.next() {
        if bytes.len() < 2 {
            break;
        }

        // Decode to scalar value.
        let code = {
            let code_1 = from_little_endian([bytes[0], bytes[1]]);
            if code_1 < 0xD800 || code_1 > 0xDFFF {
                // Single code unit.
                unsafe { std::char::from_u32_unchecked(code_1 as u32) }
            } else if (code_1 & 0xFC00) == 0xDC00 {
                // Error: orphaned second half of a surrogate pair.
                return Err(DecodeError {
                    byte_offset: input_i,
                    bytes_written: output_i,
                });
            } else {
                // Two code units.

                // Get the second code unit, if possible.
                if (input_i + 3) >= input.len() {
                    break;
                }
                let bytes_2 = itr.next().unwrap();
                let code_2 = from_little_endian([bytes_2[0], bytes_2[1]]);
                if (code_2 & 0xFC00) != 0xDC00 {
                    // Error: second half is not a valid surrogate.
                    return Err(DecodeError {
                        byte_offset: input_i,
                        bytes_written: output_i,
                    });
                }

                unsafe {
                    std::char::from_u32_unchecked(
                        (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000,
                    )
                }
            }
        };

        // Encode to utf8.
        let mut buf = [0u8; 4];
        let s = code.encode_utf8(&mut buf);
        if (output_i + s.len()) > output.len() {
            break;
        }
        output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes());

        // Update our counters.
        input_i += code.len_utf16() * 2;
        output_i += s.len();
    }

    Ok((input_i, unsafe {
        std::str::from_utf8_unchecked(&output[..output_i])
    }))
}

sub_crates/text_encoding/src/utf8.rs  (new file, 51 lines)

@@ -0,0 +1,51 @@
//! These functions are essentially redundant, since they're supposedly
//! encoding/decoding between utf8 and... utf8. However, `decode_to_utf8()`
//! is still useful for validating unknown input. And they allow a uniform
//! API for all encodings.

use std;
use {DecodeError, DecodeResult, EncodeResult};

// Encode from utf8
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
    let copy_len = {
        if output.len() >= input.len() {
            input.len()
        } else {
            let mut i = output.len();
            while !input.is_char_boundary(i) {
                i -= 1;
            }
            i
        }
    };

    output[..copy_len].copy_from_slice(input[..copy_len].as_bytes());

    Ok((copy_len, copy_len))
}

pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let valid_up_to = match std::str::from_utf8(input) {
        Ok(text) => text.len(),
        Err(e) => {
            if e.valid_up_to() > 0 {
                e.valid_up_to()
            } else {
                return Err(DecodeError {
                    byte_offset: 0,
                    bytes_written: 0,
                });
            }
        }
    };

    let (in_consumed, out_written) = encode_from_utf8(
        unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) },
        output,
    ).unwrap();

    Ok((in_consumed, unsafe {
        std::str::from_utf8_unchecked(&output[..out_written])
    }))
}
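Not part of the commit: a short sketch of the validation use mentioned in the module docs, assuming it lives inside this module. Valid leading bytes are passed through; input that is invalid from the very first byte is reported as a DecodeError.

#[test]
fn utf8_decode_validates_input() {
    let mut buf = [0u8; 8];

    // Valid prefix followed by an invalid byte: the prefix is passed through.
    let (consumed, text) = decode_to_utf8(b"hi\xFF", &mut buf).unwrap();
    assert_eq!((consumed, text), (2, "hi"));

    // Invalid from the first byte: reported as an error instead.
    assert!(decode_to_utf8(b"\xFF", &mut buf).is_err());
}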