Added utf32 encoders/decoders to the text_encoding sub-crate.

2018-08-21 05:25:53 -07:00 · 2018-08-21 05:25:53 -07:00 · 3a17ca9e8c
commit 3a17ca9e8c
parent 173837b827
5 changed files with 292 additions and 3 deletions
--- a/sub_crates/text_encoding/src/lib.rs
+++ b/sub_crates/text_encoding/src/lib.rs
@ -4,6 +4,8 @@
 mod latin1;
 mod utf16_be;
 mod utf16_le;
 mod utf32_be;
 mod utf32_le;
 mod utf8;
 mod windows1252;
@ -17,6 +19,8 @@ pub fn encode_from_utf8<'a>(
        Encoding::Utf8 => utf8::encode_from_utf8(input, output),
        Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
        Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
        Encoding::Utf32BE => utf32_be::encode_from_utf8(input, output),
        Encoding::Utf32LE => utf32_le::encode_from_utf8(input, output),
        Encoding::Latin1 => latin1::encode_from_utf8(input, output),
        Encoding::Windows1252 => windows1252::encode_from_utf8(input, output),
    }
@ -32,6 +36,8 @@ pub fn decode_to_utf8<'a>(
        Encoding::Utf8 => utf8::decode_to_utf8(input, output),
        Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
        Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
        Encoding::Utf32BE => utf32_be::decode_to_utf8(input, output),
        Encoding::Utf32LE => utf32_le::decode_to_utf8(input, output),
        Encoding::Latin1 => latin1::decode_to_utf8(input, output),
        Encoding::Windows1252 => windows1252::decode_to_utf8(input, output),
    }
@ -43,8 +49,8 @@ pub enum Encoding {
    Utf8,
    Utf16BE, // Big endian
    Utf16LE, // Little endian
-    // Utf32BE, // Big endian
+    Utf32BE, // Big endian
-    // Utf32LE, // Little endian
+    Utf32LE, // Little endian
    // ShiftJIS,
    // EUC_JP,
    // Big5,
--- a/sub_crates/text_encoding/src/utf32_be.rs
+++ b/sub_crates/text_encoding/src/utf32_be.rs
@ -0,0 +1,111 @@
 //! Encoding/decoding functions for big-endian UTF-32.
 //!
 //! Because both utf8 and utf32 can represent the entirety of unicode, the
 //! only possible error is when invalid utf32 is encountered when decoding
 //! to utf8.
 use std;
 use {DecodeError, DecodeResult, EncodeResult};
 /// Splits `n` into its four bytes in big-endian order, i.e. most
 /// significant byte first.
 ///
 /// The bytes are extracted with shifts rather than by reading the integer's
 /// in-memory representation, so the result does not depend on the host's
 /// endianness and no `unsafe` code is needed.
 fn to_big_endian(n: u32) -> [u8; 4] {
    // `as u8` keeps only the low 8 bits, which is exactly the byte we
    // shifted into place.
    [(n >> 24) as u8, (n >> 16) as u8, (n >> 8) as u8, n as u8]
 }
 /// Assembles a `u32` from four bytes given in big-endian order, i.e.
 /// `n[0]` is the most significant byte and `n[3]` the least significant.
 ///
 /// The value is built with shifts rather than by writing into the
 /// integer's in-memory representation, so the result does not depend on the
 /// host's endianness and no `unsafe` code is needed.
 fn from_big_endian(n: [u8; 4]) -> u32 {
    (u32::from(n[0]) << 24) | (u32::from(n[1]) << 16) | (u32::from(n[2]) << 8) | u32::from(n[3])
 }
 /// Encodes `input` as big-endian UTF-32 into `output`.
 ///
 /// Every `char` of `input` becomes exactly four output bytes. Encoding
 /// stops at the first char for which fewer than four bytes of `output`
 /// remain.
 ///
 /// Always returns `Ok`; the payload is the number of bytes of `input` that
 /// were consumed (always a char boundary) and the prefix of `output` that
 /// was written. If `output` cannot hold even one encoded char, zero bytes
 /// are reported as consumed and the returned slice is empty, so no input is
 /// ever reported as consumed without having been written.
 pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;
    for (offset, c) in input.char_indices() {
        // `output_i` never exceeds `output.len()`, so this cannot underflow.
        if output.len() - output_i < 4 {
            break;
        }
        output[output_i..output_i + 4].copy_from_slice(&to_big_endian(c as u32));
        output_i += 4;
        // Only advance past a char once it has actually been written.
        input_i = offset + c.len_utf8();
    }
    Ok((input_i, &output[..output_i]))
 }
 /// Decodes big-endian UTF-32 from `input` into UTF-8 text in `output`.
 ///
 /// The input is read four bytes at a time. Decoding stops early, without
 /// error, when fewer than four input bytes remain or when `output` has no
 /// room for the UTF-8 form of the next char.
 ///
 /// On success returns the number of input bytes consumed together with the
 /// decoded text, which borrows from `output`.
 ///
 /// # Errors
 ///
 /// Returns a `DecodeError` when a four-byte unit is not a valid `char`.
 /// `error_range` covers the four offending input bytes and
 /// `output_bytes_written` is the number of bytes written to `output`
 /// before the failure.
 pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut consumed = 0;
    let mut written = 0;

    for unit in input.chunks(4) {
        // A short trailing chunk is an incomplete code unit: leave it unread.
        if unit.len() < 4 {
            break;
        }

        let raw = from_big_endian([unit[0], unit[1], unit[2], unit[3]]);
        let ch = match std::char::from_u32(raw) {
            Some(ch) => ch,
            None => {
                // Error: invalid codepoint.
                return Err(DecodeError {
                    error_range: (consumed, consumed + 4),
                    output_bytes_written: written,
                });
            }
        };

        // Re-encode as utf8 in a scratch buffer so we know the exact length
        // before touching `output`.
        let mut scratch = [0u8; 4];
        let utf8 = ch.encode_utf8(&mut scratch);
        let end = written + utf8.len();
        if end > output.len() {
            break;
        }
        output[written..end].copy_from_slice(utf8.as_bytes());

        consumed += 4;
        written = end;
    }

    // SAFETY: `output[..written]` holds only the concatenated results of
    // `char::encode_utf8`, so it is valid utf8.
    let text = unsafe { std::str::from_utf8_unchecked(&output[..written]) };
    Ok((consumed, text))
 }
--- a/sub_crates/text_encoding/src/utf32_le.rs
+++ b/sub_crates/text_encoding/src/utf32_le.rs
@ -0,0 +1,111 @@
 //! Encoding/decoding functions for little-endian UTF-32.
 //!
 //! Because both utf8 and utf32 can represent the entirety of unicode, the
 //! only possible error is when invalid utf32 is encountered when decoding
 //! to utf8.
 use std;
 use {DecodeError, DecodeResult, EncodeResult};
 /// Splits `n` into its four bytes in little-endian order, i.e. least
 /// significant byte first.
 ///
 /// The bytes are extracted with shifts rather than by reading the integer's
 /// in-memory representation, so the result does not depend on the host's
 /// endianness and no `unsafe` code is needed.
 fn to_little_endian(n: u32) -> [u8; 4] {
    // `as u8` keeps only the low 8 bits, which is exactly the byte we
    // shifted into place.
    [n as u8, (n >> 8) as u8, (n >> 16) as u8, (n >> 24) as u8]
 }
 /// Assembles a `u32` from four bytes given in little-endian order, i.e.
 /// `n[0]` is the least significant byte and `n[3]` the most significant.
 ///
 /// The value is built with shifts rather than by writing into the
 /// integer's in-memory representation, so the result does not depend on the
 /// host's endianness and no `unsafe` code is needed.
 fn from_little_endian(n: [u8; 4]) -> u32 {
    u32::from(n[0]) | (u32::from(n[1]) << 8) | (u32::from(n[2]) << 16) | (u32::from(n[3]) << 24)
 }
 /// Encodes `input` as little-endian UTF-32 into `output`.
 ///
 /// Every `char` of `input` becomes exactly four output bytes. Encoding
 /// stops at the first char for which fewer than four bytes of `output`
 /// remain.
 ///
 /// Always returns `Ok`; the payload is the number of bytes of `input` that
 /// were consumed (always a char boundary) and the prefix of `output` that
 /// was written. If `output` cannot hold even one encoded char, zero bytes
 /// are reported as consumed and the returned slice is empty, so no input is
 /// ever reported as consumed without having been written.
 pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;
    for (offset, c) in input.char_indices() {
        // `output_i` never exceeds `output.len()`, so this cannot underflow.
        if output.len() - output_i < 4 {
            break;
        }
        output[output_i..output_i + 4].copy_from_slice(&to_little_endian(c as u32));
        output_i += 4;
        // Only advance past a char once it has actually been written.
        input_i = offset + c.len_utf8();
    }
    Ok((input_i, &output[..output_i]))
 }
 /// Decodes little-endian UTF-32 from `input` into UTF-8 text in `output`.
 ///
 /// The input is read four bytes at a time. Decoding stops early, without
 /// error, when fewer than four input bytes remain or when `output` has no
 /// room for the UTF-8 form of the next char.
 ///
 /// On success returns the number of input bytes consumed together with the
 /// decoded text, which borrows from `output`.
 ///
 /// # Errors
 ///
 /// Returns a `DecodeError` when a four-byte unit is not a valid `char`.
 /// `error_range` covers the four offending input bytes and
 /// `output_bytes_written` is the number of bytes written to `output`
 /// before the failure.
 pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut consumed = 0;
    let mut written = 0;

    for unit in input.chunks(4) {
        // A short trailing chunk is an incomplete code unit: leave it unread.
        if unit.len() < 4 {
            break;
        }

        let raw = from_little_endian([unit[0], unit[1], unit[2], unit[3]]);
        let ch = match std::char::from_u32(raw) {
            Some(ch) => ch,
            None => {
                // Error: invalid codepoint.
                return Err(DecodeError {
                    error_range: (consumed, consumed + 4),
                    output_bytes_written: written,
                });
            }
        };

        // Re-encode as utf8 in a scratch buffer so we know the exact length
        // before touching `output`.
        let mut scratch = [0u8; 4];
        let utf8 = ch.encode_utf8(&mut scratch);
        let end = written + utf8.len();
        if end > output.len() {
            break;
        }
        output[written..end].copy_from_slice(utf8.as_bytes());

        consumed += 4;
        written = end;
    }

    // SAFETY: `output[..written]` holds only the concatenated results of
    // `char::encode_utf8`, so it is valid utf8.
    let text = unsafe { std::str::from_utf8_unchecked(&output[..written]) };
    Ok((consumed, text))
 }
--- a/sub_crates/text_encoding/src/utf8.rs
+++ b/sub_crates/text_encoding/src/utf8.rs
@ -6,7 +6,6 @@
 use std;
 use {DecodeError, DecodeResult, EncodeResult};
 // Encode from utf8
 pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    let copy_len = {
        if output.len() >= input.len() {
--- a/sub_crates/text_encoding/tests/property_tests.rs
+++ b/sub_crates/text_encoding/tests/property_tests.rs
@ -104,6 +104,68 @@ proptest! {
        assert_eq!(&text[..], &utf8[..]);
    }
    // Round-trip property: encoding arbitrary text to big-endian utf32 and
    // decoding it again must reproduce the text exactly. Both directions go
    // through a small fixed-size buffer so the incremental (chunked) paths
    // of the encoder and decoder are exercised.
    #[test]
    fn pt_utf32be_roundtrip(ref text in "\\PC*\\PC*\\PC*") {
        let mut buf = [0u8; 32];
        let mut utf32: Vec<u8> = Vec::new();
        let mut utf8 = String::new();

        // Encode to utf32 big endian, one buffer-full at a time.
        let mut remaining_text = &text[..];
        while !remaining_text.is_empty() {
            match encode_from_utf8(Encoding::Utf32BE, remaining_text, &mut buf) {
                Ok((n, encoded)) => {
                    remaining_text = &remaining_text[n..];
                    utf32.extend_from_slice(encoded);
                }
                Err(_) => panic!("Error when encoding."),
            }
        }

        // Decode back from utf32 big endian, one buffer-full at a time.
        let mut remaining_bytes = &utf32[..];
        while !remaining_bytes.is_empty() {
            match decode_to_utf8(Encoding::Utf32BE, remaining_bytes, &mut buf) {
                Ok((n, decoded)) => {
                    remaining_bytes = &remaining_bytes[n..];
                    utf8.push_str(decoded);
                }
                Err(_) => panic!("Error when decoding."),
            }
        }

        assert_eq!(&text[..], &utf8[..]);
    }
    // Round-trip property: encoding arbitrary text to little-endian utf32
    // and decoding it again must reproduce the text exactly. Both directions
    // go through a small fixed-size buffer so the incremental (chunked)
    // paths of the encoder and decoder are exercised.
    #[test]
    fn pt_utf32le_roundtrip(ref text in "\\PC*\\PC*\\PC*") {
        let mut buf = [0u8; 32];
        let mut utf32: Vec<u8> = Vec::new();
        let mut utf8 = String::new();

        // Encode to utf32 little endian, one buffer-full at a time.
        let mut remaining_text = &text[..];
        while !remaining_text.is_empty() {
            match encode_from_utf8(Encoding::Utf32LE, remaining_text, &mut buf) {
                Ok((n, encoded)) => {
                    remaining_text = &remaining_text[n..];
                    utf32.extend_from_slice(encoded);
                }
                Err(_) => panic!("Error when encoding."),
            }
        }

        // Decode back from utf32 little endian, one buffer-full at a time.
        let mut remaining_bytes = &utf32[..];
        while !remaining_bytes.is_empty() {
            match decode_to_utf8(Encoding::Utf32LE, remaining_bytes, &mut buf) {
                Ok((n, decoded)) => {
                    remaining_bytes = &remaining_bytes[n..];
                    utf8.push_str(decoded);
                }
                Err(_) => panic!("Error when decoding."),
            }
        }

        assert_eq!(&text[..], &utf8[..]);
    }
    #[test]
    fn pt_latin1_roundtrip(ref data in vec(0u8..=255, 0..1000)) {
        let mut buf = [0u8; 32];