From 3a17ca9e8c0ca008829768f4549c188b99d9cce4 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Tue, 21 Aug 2018 05:25:53 -0700
Subject: [PATCH] Added utf32 encoders/decoders to the text_encoding sub-crate.

---
 sub_crates/text_encoding/src/lib.rs           |  10 +-
 sub_crates/text_encoding/src/utf32_be.rs      | 111 ++++++++++++++++++
 sub_crates/text_encoding/src/utf32_le.rs      | 111 ++++++++++++++++++
 sub_crates/text_encoding/src/utf8.rs          |   1 -
 .../text_encoding/tests/property_tests.rs     |  62 ++++++++++
 5 files changed, 292 insertions(+), 3 deletions(-)
 create mode 100644 sub_crates/text_encoding/src/utf32_be.rs
 create mode 100644 sub_crates/text_encoding/src/utf32_le.rs

diff --git a/sub_crates/text_encoding/src/lib.rs b/sub_crates/text_encoding/src/lib.rs
index 60cb7a3..2013bb3 100644
--- a/sub_crates/text_encoding/src/lib.rs
+++ b/sub_crates/text_encoding/src/lib.rs
@@ -4,6 +4,8 @@
 mod latin1;
 mod utf16_be;
 mod utf16_le;
+mod utf32_be;
+mod utf32_le;
 mod utf8;
 mod windows1252;
 
@@ -17,6 +19,8 @@ pub fn encode_from_utf8<'a>(
         Encoding::Utf8 => utf8::encode_from_utf8(input, output),
         Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
         Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
+        Encoding::Utf32BE => utf32_be::encode_from_utf8(input, output),
+        Encoding::Utf32LE => utf32_le::encode_from_utf8(input, output),
         Encoding::Latin1 => latin1::encode_from_utf8(input, output),
         Encoding::Windows1252 => windows1252::encode_from_utf8(input, output),
     }
@@ -32,6 +36,8 @@ pub fn decode_to_utf8<'a>(
         Encoding::Utf8 => utf8::decode_to_utf8(input, output),
         Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
         Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
+        Encoding::Utf32BE => utf32_be::decode_to_utf8(input, output),
+        Encoding::Utf32LE => utf32_le::decode_to_utf8(input, output),
         Encoding::Latin1 => latin1::decode_to_utf8(input, output),
         Encoding::Windows1252 => windows1252::decode_to_utf8(input, output),
     }
@@ -43,8 +49,8 @@ pub enum Encoding {
     Utf8,
     Utf16BE, // Big endian
     Utf16LE, // Little endian
-    // Utf32BE, // Big endian
-    // Utf32LE, // Little endian
+    Utf32BE, // Big endian
+    Utf32LE, // Little endian
     // ShiftJIS,
     // EUC_JP,
     // Big5,
diff --git a/sub_crates/text_encoding/src/utf32_be.rs b/sub_crates/text_encoding/src/utf32_be.rs
new file mode 100644
index 0000000..69e9b7a
--- /dev/null
+++ b/sub_crates/text_encoding/src/utf32_be.rs
@@ -0,0 +1,111 @@
+//! Encoding/decoding functions for big-endian UTF-32.
+//!
+//! Because both utf8 and utf32 can represent the entirety of unicode, the
+//! only possible error is when invalid utf32 is encountered when decoding
+//! to utf8.
+
+use std;
+use {DecodeError, DecodeResult, EncodeResult};
+
+/// Splits `n` into its four bytes, most significant byte first.
+///
+/// Uses plain shifts rather than pointer reads, so it needs no `unsafe` and
+/// does not depend on the target's native byte order.
+fn to_big_endian(n: u32) -> [u8; 4] {
+    // `as u8` truncates, keeping only the low 8 bits of each shifted value.
+    let (b0, b1, b2, b3) = (n >> 24, n >> 16, n >> 8, n);
+    [b0 as u8, b1 as u8, b2 as u8, b3 as u8]
+}
+
+/// Assembles a `u32` from four bytes given most-significant byte first.
+///
+/// This is the inverse of `to_big_endian()`.  The value is built with shifts
+/// and bitwise ORs instead of writes through a raw pointer, so the function
+/// needs no `unsafe` and behaves the same on every target, regardless of the
+/// machine's native byte order.
+///
+/// # Parameters
+///
+/// - `n`: the four bytes of the integer, with `n[0]` the most significant
+///   and `n[3]` the least significant.
+///
+/// # Returns
+///
+/// The 32-bit unsigned integer those bytes encode.
+fn from_big_endian(n: [u8; 4]) -> u32 {
+    // Widen each byte to 32 bits before shifting it into position.
+    let high = (u32::from(n[0]) << 24) | (u32::from(n[1]) << 16);
+    let low = (u32::from(n[2]) << 8) | u32::from(n[3]);
+    high | low
+}
+
+/// Encodes UTF-8 text as big-endian UTF-32, writing into `output`.
+///
+/// Every `char` of `input` becomes exactly four output bytes.  Encoding stops
+/// at the end of `input`, or earlier if `output` has fewer than four bytes of
+/// room left for the next `char`.
+///
+/// Returns the number of bytes of `input` that were consumed, together with
+/// the slice of `output` that was filled.  The consumed count always lies on
+/// a `char` boundary and only covers `char`s that were actually written, so
+/// a caller can resume with `&input[consumed..]` without losing text.  When
+/// `output` cannot hold even one `char`, the count is zero and the slice is
+/// empty.
+pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
+    let mut input_i = 0;
+    let mut output_i = 0;
+
+    for (offset, c) in input.char_indices() {
+        // Stop as soon as the next 4-byte code unit no longer fits.
+        if (output_i + 4) > output.len() {
+            break;
+        }
+
+        output[output_i..(output_i + 4)].copy_from_slice(&to_big_endian(c as u32));
+        output_i += 4;
+
+        // Consumed input now ends just past this `char`.
+        input_i = offset + c.len_utf8();
+    }
+    Ok((input_i, &output[..output_i]))
+}
+
+/// Decodes big-endian UTF-32 bytes into UTF-8 text stored in `output`.
+///
+/// Input is read in 4-byte groups; a trailing partial group is left unread,
+/// and decoding stops early once `output` cannot hold the next character.
+/// Yields the input bytes consumed plus the decoded `&str`, or a `DecodeError`
+/// locating the first group that is not a valid `char`.
+pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
+    let mut consumed = 0;
+    let mut written = 0;
+
+    for unit in input.chunks(4) {
+        // A short final chunk is an incomplete code unit: leave it unread.
+        if unit.len() < 4 {
+            break;
+        }
+        let code = from_big_endian([unit[0], unit[1], unit[2], unit[3]]);
+        let c = match std::char::from_u32(code) {
+            Some(c) => c,
+            None => {
+                return Err(DecodeError {
+                    error_range: (consumed, consumed + 4),
+                    output_bytes_written: written,
+                })
+            }
+        };
+        // Re-encode as UTF-8 in scratch space, then copy over if it fits.
+        let mut scratch = [0u8; 4];
+        let encoded = c.encode_utf8(&mut scratch);
+        let end = written + encoded.len();
+        if end > output.len() {
+            break;
+        }
+        output[written..end].copy_from_slice(encoded.as_bytes());
+        consumed += 4;
+        written = end;
+    }
+    // SAFETY: only complete `char::encode_utf8` results were copied in.
+    Ok((consumed, unsafe { std::str::from_utf8_unchecked(&output[..written]) }))
+}
diff --git a/sub_crates/text_encoding/src/utf32_le.rs b/sub_crates/text_encoding/src/utf32_le.rs
new file mode 100644
index 0000000..fd34435
--- /dev/null
+++ b/sub_crates/text_encoding/src/utf32_le.rs
@@ -0,0 +1,111 @@
+//! Encoding/decoding functions for little-endian UTF-32.
+//!
+//! Because both utf8 and utf32 can represent the entirety of unicode, the
+//! only possible error is when invalid utf32 is encountered when decoding
+//! to utf8.
+
+use std;
+use {DecodeError, DecodeResult, EncodeResult};
+
+/// Splits `n` into its four bytes, least significant byte first.
+///
+/// Uses plain shifts rather than pointer reads, so it needs no `unsafe` and
+/// does not depend on the target's native byte order.
+fn to_little_endian(n: u32) -> [u8; 4] {
+    // `as u8` truncates, keeping only the low 8 bits of each shifted value.
+    let (b0, b1, b2, b3) = (n, n >> 8, n >> 16, n >> 24);
+    [b0 as u8, b1 as u8, b2 as u8, b3 as u8]
+}
+
+/// Assembles a `u32` from four bytes given least-significant byte first.
+///
+/// This is the inverse of `to_little_endian()`.  The value is built with
+/// shifts and bitwise ORs instead of writes through a raw pointer, so the
+/// function needs no `unsafe` and behaves the same on every target,
+/// regardless of the machine's native byte order.
+///
+/// # Parameters
+///
+/// - `n`: the four bytes of the integer, with `n[0]` the least significant
+///   and `n[3]` the most significant.
+///
+/// # Returns
+///
+/// The 32-bit unsigned integer those bytes encode.
+fn from_little_endian(n: [u8; 4]) -> u32 {
+    // Widen each byte to 32 bits before shifting it into position.
+    let low = u32::from(n[0]) | (u32::from(n[1]) << 8);
+    let high = (u32::from(n[2]) << 16) | (u32::from(n[3]) << 24);
+    high | low
+}
+
+/// Encodes UTF-8 text as little-endian UTF-32, writing into `output`.
+///
+/// Every `char` of `input` becomes exactly four output bytes.  Encoding stops
+/// at the end of `input`, or earlier if `output` has fewer than four bytes of
+/// room left for the next `char`.
+///
+/// Returns the number of bytes of `input` that were consumed, together with
+/// the slice of `output` that was filled.  The consumed count always lies on
+/// a `char` boundary and only covers `char`s that were actually written, so
+/// a caller can resume with `&input[consumed..]` without losing text.  When
+/// `output` cannot hold even one `char`, the count is zero and the slice is
+/// empty.
+pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
+    let mut input_i = 0;
+    let mut output_i = 0;
+
+    for (offset, c) in input.char_indices() {
+        // Stop as soon as the next 4-byte code unit no longer fits.
+        if (output_i + 4) > output.len() {
+            break;
+        }
+
+        output[output_i..(output_i + 4)].copy_from_slice(&to_little_endian(c as u32));
+        output_i += 4;
+
+        // Consumed input now ends just past this `char`.
+        input_i = offset + c.len_utf8();
+    }
+    Ok((input_i, &output[..output_i]))
+}
+
+/// Decodes little-endian UTF-32 bytes into UTF-8 text stored in `output`.
+///
+/// Input is read in 4-byte groups; a trailing partial group is left unread,
+/// and decoding stops early once `output` cannot hold the next character.
+/// Yields the input bytes consumed plus the decoded `&str`, or a `DecodeError`
+/// locating the first group that is not a valid `char`.
+pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
+    let mut consumed = 0;
+    let mut written = 0;
+
+    for unit in input.chunks(4) {
+        // A short final chunk is an incomplete code unit: leave it unread.
+        if unit.len() < 4 {
+            break;
+        }
+        let code = from_little_endian([unit[0], unit[1], unit[2], unit[3]]);
+        let c = match std::char::from_u32(code) {
+            Some(c) => c,
+            None => {
+                return Err(DecodeError {
+                    error_range: (consumed, consumed + 4),
+                    output_bytes_written: written,
+                })
+            }
+        };
+        // Re-encode as UTF-8 in scratch space, then copy over if it fits.
+        let mut scratch = [0u8; 4];
+        let encoded = c.encode_utf8(&mut scratch);
+        let end = written + encoded.len();
+        if end > output.len() {
+            break;
+        }
+        output[written..end].copy_from_slice(encoded.as_bytes());
+        consumed += 4;
+        written = end;
+    }
+    // SAFETY: only complete `char::encode_utf8` results were copied in.
+    Ok((consumed, unsafe { std::str::from_utf8_unchecked(&output[..written]) }))
+}
diff --git a/sub_crates/text_encoding/src/utf8.rs b/sub_crates/text_encoding/src/utf8.rs
index 830b03c..3cd65e9 100644
--- a/sub_crates/text_encoding/src/utf8.rs
+++ b/sub_crates/text_encoding/src/utf8.rs
@@ -6,7 +6,6 @@
 use std;
 use {DecodeError, DecodeResult, EncodeResult};
 
-// Encode from utf8
 pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
     let copy_len = {
         if output.len() >= input.len() {
diff --git a/sub_crates/text_encoding/tests/property_tests.rs b/sub_crates/text_encoding/tests/property_tests.rs
index 340e6f2..576f574 100644
--- a/sub_crates/text_encoding/tests/property_tests.rs
+++ b/sub_crates/text_encoding/tests/property_tests.rs
@@ -104,6 +104,68 @@ proptest! {
         assert_eq!(&text[..], &utf8[..]);
     }
 
+    #[test]
+    fn pt_utf32be_roundtrip(ref text in "\\PC*\\PC*\\PC*") {
+        let mut buf = [0u8; 32];
+        let mut utf32: Vec<u8> = Vec::new();
+        let mut utf8 = String::new();
+
+        // Encode to utf32 big endian, one buffer-full at a time.
+        let mut rest = &text[..];
+        while !rest.is_empty() {
+            let (n, encoded) = match encode_from_utf8(Encoding::Utf32BE, rest, &mut buf) {
+                Ok(pair) => pair,
+                Err(_) => panic!("Error when encoding."),
+            };
+            rest = &rest[n..];
+            utf32.extend_from_slice(encoded);
+        }
+
+        // Decode back from utf32 big endian, again through the small buffer.
+        let mut rest = &utf32[..];
+        while !rest.is_empty() {
+            let (n, decoded) = match decode_to_utf8(Encoding::Utf32BE, rest, &mut buf) {
+                Ok(pair) => pair,
+                Err(_) => panic!("Error when decoding."),
+            };
+            rest = &rest[n..];
+            utf8.push_str(decoded);
+        }
+
+        assert_eq!(&text[..], &utf8[..]);
+    }
+
+    #[test]
+    fn pt_utf32le_roundtrip(ref text in "\\PC*\\PC*\\PC*") {
+        let mut buf = [0u8; 32];
+        let mut utf32: Vec<u8> = Vec::new();
+        let mut utf8 = String::new();
+
+        // Encode to utf32 little endian, one buffer-full at a time.
+        let mut rest = &text[..];
+        while !rest.is_empty() {
+            let (n, encoded) = match encode_from_utf8(Encoding::Utf32LE, rest, &mut buf) {
+                Ok(pair) => pair,
+                Err(_) => panic!("Error when encoding."),
+            };
+            rest = &rest[n..];
+            utf32.extend_from_slice(encoded);
+        }
+
+        // Decode back from utf32 little endian, again through the small buffer.
+        let mut rest = &utf32[..];
+        while !rest.is_empty() {
+            let (n, decoded) = match decode_to_utf8(Encoding::Utf32LE, rest, &mut buf) {
+                Ok(pair) => pair,
+                Err(_) => panic!("Error when decoding."),
+            };
+            rest = &rest[n..];
+            utf8.push_str(decoded);
+        }
+
+        assert_eq!(&text[..], &utf8[..]);
+    }
+
     #[test]
     fn pt_latin1_roundtrip(ref data in vec(0u8..=255, 0..1000)) {
         let mut buf = [0u8; 32];