Added utf32 encoders/decoders to the text_encoding sub-crate.

2018-08-21 05:25:53 -07:00 · 2018-08-21 05:25:53 -07:00 · 3a17ca9e8c
commit 3a17ca9e8c
parent 173837b827
5 changed files with 292 additions and 3 deletions
--- a/sub_crates/text_encoding/src/lib.rs
+++ b/sub_crates/text_encoding/src/lib.rs
@ -4,6 +4,8 @@
 mod latin1;
 mod utf16_be;
 mod utf16_le;
+mod utf32_be;
+mod utf32_le;
 mod utf8;
 mod windows1252;

@ -17,6 +19,8 @@ pub fn encode_from_utf8<'a>(
        Encoding::Utf8 => utf8::encode_from_utf8(input, output),
        Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
        Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
+        Encoding::Utf32BE => utf32_be::encode_from_utf8(input, output),
+        Encoding::Utf32LE => utf32_le::encode_from_utf8(input, output),
        Encoding::Latin1 => latin1::encode_from_utf8(input, output),
        Encoding::Windows1252 => windows1252::encode_from_utf8(input, output),
    }
@ -32,6 +36,8 @@ pub fn decode_to_utf8<'a>(
        Encoding::Utf8 => utf8::decode_to_utf8(input, output),
        Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
        Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
+        Encoding::Utf32BE => utf32_be::decode_to_utf8(input, output),
+        Encoding::Utf32LE => utf32_le::decode_to_utf8(input, output),
        Encoding::Latin1 => latin1::decode_to_utf8(input, output),
        Encoding::Windows1252 => windows1252::decode_to_utf8(input, output),
    }
@ -43,8 +49,8 @@ pub enum Encoding {
    Utf8,
    Utf16BE, // Big endian
    Utf16LE, // Little endian
-    // Utf32BE, // Big endian
-    // Utf32LE, // Little endian
+    Utf32BE, // Big endian
+    Utf32LE, // Little endian
    // ShiftJIS,
    // EUC_JP,
    // Big5,
--- a/sub_crates/text_encoding/src/utf32_be.rs
+++ b/sub_crates/text_encoding/src/utf32_be.rs
@ -0,0 +1,111 @@
+//! Encoding/decoding functions for big-endian UTF-32.
+//!
+//! Because both utf8 and utf32 can represent the entirety of unicode, the
+//! only possible error is when invalid utf32 is encountered when decoding
+//! to utf8.
+
+use std;
+use {DecodeError, DecodeResult, EncodeResult};
+
+/// Splits `n` into its four bytes, most-significant byte first.
+///
+/// The bytes are extracted with shifts rather than by reading `n`'s in-memory
+/// representation, so the result does not depend on the target's endianness
+/// and no `unsafe` is needed.
+fn to_big_endian(n: u32) -> [u8; 4] {
+    // Each `as u8` cast intentionally keeps only the low eight bits.
+    [(n >> 24) as u8, (n >> 16) as u8, (n >> 8) as u8, n as u8]
+}
+
+/// Reassembles a `u32` from four bytes given most-significant byte first.
+///
+/// The value is built with shifts rather than by writing into the integer's
+/// in-memory representation, so the result does not depend on the target's
+/// endianness and no `unsafe` is needed.
+fn from_big_endian(n: [u8; 4]) -> u32 {
+    (u32::from(n[0]) << 24)
+        | (u32::from(n[1]) << 16)
+        | (u32::from(n[2]) << 8)
+        | u32::from(n[3])
+}
+
+/// Encodes `input` as big-endian UTF-32 into `output`.
+///
+/// Encoding stops at the end of `input`, or as soon as `output` has fewer
+/// than four bytes of space left, whichever comes first.
+///
+/// Returns the number of bytes of `input` that were consumed (always on a
+/// char boundary) together with the written prefix of `output`.  If `output`
+/// cannot hold even one code point, zero bytes are reported as consumed.
+pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
+    let mut input_i = 0;
+    let mut output_i = 0;
+    for (offset, c) in input.char_indices() {
+        // Every code point takes exactly four bytes in utf32.
+        if (output_i + 4) > output.len() {
+            break;
+        }
+        output[output_i..(output_i + 4)].copy_from_slice(&to_big_endian(c as u32));
+        output_i += 4;
+
+        // Count a char as consumed only once it has actually been written,
+        // so a char that did not fit is never reported as consumed.
+        input_i = offset + c.len_utf8();
+    }
+
+    Ok((input_i, &output[..output_i]))
+}
+
+/// Decodes big-endian UTF-32 `input` into UTF-8, writing into `output`.
+///
+/// On success, yields the count of input bytes consumed alongside the decoded
+/// text.  Decoding halts early, without error, when fewer than four input
+/// bytes remain or when the next char would not fit in `output`.
+///
+/// A four-byte unit that is not a valid `char` produces a `DecodeError`
+/// holding that unit's input byte range and the number of output bytes
+/// written before it.
+pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
+    let mut consumed = 0;
+    let mut written = 0;
+
+    for unit in input.chunks(4) {
+        // A short chunk can only be the last one: an incomplete code unit.
+        if unit.len() < 4 {
+            break;
+        }
+
+        let value = from_big_endian([unit[0], unit[1], unit[2], unit[3]]);
+        let c = match std::char::from_u32(value) {
+            Some(c) => c,
+            None => {
+                // Error: invalid codepoint.
+                return Err(DecodeError {
+                    error_range: (consumed, consumed + 4),
+                    output_bytes_written: written,
+                });
+            }
+        };
+
+        // Re-encode as utf8 into scratch space first, so the encoded length
+        // is known before anything is written to `output`.
+        let mut scratch = [0u8; 4];
+        let encoded = c.encode_utf8(&mut scratch);
+        let end = written + encoded.len();
+        if end > output.len() {
+            break;
+        }
+        output[written..end].copy_from_slice(encoded.as_bytes());
+
+        consumed += 4;
+        written = end;
+    }
+
+    // SAFETY: `output[..written]` holds only whole chars that were produced
+    // by `char::encode_utf8` above, so it is valid utf8.
+    Ok((consumed, unsafe {
+        std::str::from_utf8_unchecked(&output[..written])
+    }))
+}
--- a/sub_crates/text_encoding/src/utf32_le.rs
+++ b/sub_crates/text_encoding/src/utf32_le.rs
@ -0,0 +1,111 @@
+//! Encoding/decoding functions for little-endian UTF-32.
+//!
+//! Because both utf8 and utf32 can represent the entirety of unicode, the
+//! only possible error is when invalid utf32 is encountered when decoding
+//! to utf8.
+
+use std;
+use {DecodeError, DecodeResult, EncodeResult};
+
+/// Splits `n` into its four bytes, least-significant byte first.
+///
+/// The bytes are extracted with shifts rather than by reading `n`'s in-memory
+/// representation, so the result does not depend on the target's endianness
+/// and no `unsafe` is needed.
+fn to_little_endian(n: u32) -> [u8; 4] {
+    // Each `as u8` cast intentionally keeps only the low eight bits.
+    [n as u8, (n >> 8) as u8, (n >> 16) as u8, (n >> 24) as u8]
+}
+
+/// Reassembles a `u32` from four bytes given least-significant byte first.
+///
+/// The value is built with shifts rather than by writing into the integer's
+/// in-memory representation, so the result does not depend on the target's
+/// endianness and no `unsafe` is needed.
+fn from_little_endian(n: [u8; 4]) -> u32 {
+    u32::from(n[0])
+        | (u32::from(n[1]) << 8)
+        | (u32::from(n[2]) << 16)
+        | (u32::from(n[3]) << 24)
+}
+
+/// Encodes `input` as little-endian UTF-32 into `output`.
+///
+/// Encoding stops at the end of `input`, or as soon as `output` has fewer
+/// than four bytes of space left, whichever comes first.
+///
+/// Returns the number of bytes of `input` that were consumed (always on a
+/// char boundary) together with the written prefix of `output`.  If `output`
+/// cannot hold even one code point, zero bytes are reported as consumed.
+pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
+    let mut input_i = 0;
+    let mut output_i = 0;
+    for (offset, c) in input.char_indices() {
+        // Every code point takes exactly four bytes in utf32.
+        if (output_i + 4) > output.len() {
+            break;
+        }
+        output[output_i..(output_i + 4)].copy_from_slice(&to_little_endian(c as u32));
+        output_i += 4;
+
+        // Count a char as consumed only once it has actually been written,
+        // so a char that did not fit is never reported as consumed.
+        input_i = offset + c.len_utf8();
+    }
+
+    Ok((input_i, &output[..output_i]))
+}
+
+/// Decodes little-endian UTF-32 `input` into UTF-8, writing into `output`.
+///
+/// On success, yields the count of input bytes consumed alongside the decoded
+/// text.  Decoding halts early, without error, when fewer than four input
+/// bytes remain or when the next char would not fit in `output`.
+///
+/// A four-byte unit that is not a valid `char` produces a `DecodeError`
+/// holding that unit's input byte range and the number of output bytes
+/// written before it.
+pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
+    let mut consumed = 0;
+    let mut written = 0;
+
+    for unit in input.chunks(4) {
+        // A short chunk can only be the last one: an incomplete code unit.
+        if unit.len() < 4 {
+            break;
+        }
+
+        let value = from_little_endian([unit[0], unit[1], unit[2], unit[3]]);
+        let c = match std::char::from_u32(value) {
+            Some(c) => c,
+            None => {
+                // Error: invalid codepoint.
+                return Err(DecodeError {
+                    error_range: (consumed, consumed + 4),
+                    output_bytes_written: written,
+                });
+            }
+        };
+
+        // Re-encode as utf8 into scratch space first, so the encoded length
+        // is known before anything is written to `output`.
+        let mut scratch = [0u8; 4];
+        let encoded = c.encode_utf8(&mut scratch);
+        let end = written + encoded.len();
+        if end > output.len() {
+            break;
+        }
+        output[written..end].copy_from_slice(encoded.as_bytes());
+
+        consumed += 4;
+        written = end;
+    }
+
+    // SAFETY: `output[..written]` holds only whole chars that were produced
+    // by `char::encode_utf8` above, so it is valid utf8.
+    Ok((consumed, unsafe {
+        std::str::from_utf8_unchecked(&output[..written])
+    }))
+}
--- a/sub_crates/text_encoding/src/utf8.rs
+++ b/sub_crates/text_encoding/src/utf8.rs
@ -6,7 +6,6 @@
 use std;
 use {DecodeError, DecodeResult, EncodeResult};

-// Encode from utf8
 pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    let copy_len = {
        if output.len() >= input.len() {
--- a/sub_crates/text_encoding/tests/property_tests.rs
+++ b/sub_crates/text_encoding/tests/property_tests.rs
@ -104,6 +104,68 @@ proptest! {
        assert_eq!(&text[..], &utf8[..]);
    }

+    #[test]
+    fn pt_utf32be_roundtrip(ref text in "\\PC*\\PC*\\PC*") {
+        // Scratch buffer shared by both phases.  At 32 bytes it holds at
+        // most eight utf32 code points, so longer texts take several passes.
+        let mut buf = [0u8; 32];
+        let mut utf32: Vec<u8> = Vec::new();
+        let mut utf8 = String::new();
+
+        // Encode to utf32 big endian, one buffer-full at a time.
+        let mut remaining = &text[..];
+        while !remaining.is_empty() {
+            match encode_from_utf8(Encoding::Utf32BE, remaining, &mut buf) {
+                Ok((n, encoded)) => {
+                    remaining = &remaining[n..];
+                    utf32.extend_from_slice(encoded);
+                }
+                Err(_) => panic!("Error when encoding."),
+            }
+        }
+
+        // Decode back from utf32 big endian, again one buffer-full at a time.
+        let mut remaining = &utf32[..];
+        while !remaining.is_empty() {
+            match decode_to_utf8(Encoding::Utf32BE, remaining, &mut buf) {
+                Ok((n, decoded)) => {
+                    remaining = &remaining[n..];
+                    utf8.push_str(decoded);
+                }
+                Err(_) => panic!("Error when decoding."),
+            }
+        }
+
+        // The round trip must reproduce the original text exactly.
+        assert_eq!(&text[..], &utf8[..]);
+    }
+
+    #[test]
+    fn pt_utf32le_roundtrip(ref text in "\\PC*\\PC*\\PC*") {
+        // Scratch buffer shared by both phases.  At 32 bytes it holds at
+        // most eight utf32 code points, so longer texts take several passes.
+        let mut buf = [0u8; 32];
+        let mut utf32: Vec<u8> = Vec::new();
+        let mut utf8 = String::new();
+
+        // Encode to utf32 little endian, one buffer-full at a time.
+        let mut remaining = &text[..];
+        while !remaining.is_empty() {
+            match encode_from_utf8(Encoding::Utf32LE, remaining, &mut buf) {
+                Ok((n, encoded)) => {
+                    remaining = &remaining[n..];
+                    utf32.extend_from_slice(encoded);
+                }
+                Err(_) => panic!("Error when encoding."),
+            }
+        }
+
+        // Decode back from utf32 little endian, again one buffer-full at a time.
+        let mut remaining = &utf32[..];
+        while !remaining.is_empty() {
+            match decode_to_utf8(Encoding::Utf32LE, remaining, &mut buf) {
+                Ok((n, decoded)) => {
+                    remaining = &remaining[n..];
+                    utf8.push_str(decoded);
+                }
+                Err(_) => panic!("Error when decoding."),
+            }
+        }
+
+        // The round trip must reproduce the original text exactly.
+        assert_eq!(&text[..], &utf8[..]);
+    }
+
    #[test]
    fn pt_latin1_roundtrip(ref data in vec(0u8..=255, 0..1000)) {
        let mut buf = [0u8; 32];