From 3a17ca9e8c0ca008829768f4549c188b99d9cce4 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Tue, 21 Aug 2018 05:25:53 -0700 Subject: [PATCH] Added utf32 encoders/decoders to the text_encoding sub-crate. --- sub_crates/text_encoding/src/lib.rs | 10 +- sub_crates/text_encoding/src/utf32_be.rs | 111 ++++++++++++++++++ sub_crates/text_encoding/src/utf32_le.rs | 111 ++++++++++++++++++ sub_crates/text_encoding/src/utf8.rs | 1 - .../text_encoding/tests/property_tests.rs | 62 ++++++++++ 5 files changed, 292 insertions(+), 3 deletions(-) create mode 100644 sub_crates/text_encoding/src/utf32_be.rs create mode 100644 sub_crates/text_encoding/src/utf32_le.rs diff --git a/sub_crates/text_encoding/src/lib.rs b/sub_crates/text_encoding/src/lib.rs index 60cb7a3..2013bb3 100644 --- a/sub_crates/text_encoding/src/lib.rs +++ b/sub_crates/text_encoding/src/lib.rs @@ -4,6 +4,8 @@ mod latin1; mod utf16_be; mod utf16_le; +mod utf32_be; +mod utf32_le; mod utf8; mod windows1252; @@ -17,6 +19,8 @@ pub fn encode_from_utf8<'a>( Encoding::Utf8 => utf8::encode_from_utf8(input, output), Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output), Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output), + Encoding::Utf32BE => utf32_be::encode_from_utf8(input, output), + Encoding::Utf32LE => utf32_le::encode_from_utf8(input, output), Encoding::Latin1 => latin1::encode_from_utf8(input, output), Encoding::Windows1252 => windows1252::encode_from_utf8(input, output), } @@ -32,6 +36,8 @@ pub fn decode_to_utf8<'a>( Encoding::Utf8 => utf8::decode_to_utf8(input, output), Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output), Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output), + Encoding::Utf32BE => utf32_be::decode_to_utf8(input, output), + Encoding::Utf32LE => utf32_le::decode_to_utf8(input, output), Encoding::Latin1 => latin1::decode_to_utf8(input, output), Encoding::Windows1252 => windows1252::decode_to_utf8(input, output), } @@ -43,8 +49,8 @@ pub enum Encoding { Utf8, 
Utf16BE, // Big endian Utf16LE, // Little endian - // Utf32BE, // Big endian - // Utf32LE, // Little endian + Utf32BE, // Big endian + Utf32LE, // Little endian // ShiftJIS, // EUC_JP, // Big5, diff --git a/sub_crates/text_encoding/src/utf32_be.rs b/sub_crates/text_encoding/src/utf32_be.rs new file mode 100644 index 0000000..69e9b7a --- /dev/null +++ b/sub_crates/text_encoding/src/utf32_be.rs @@ -0,0 +1,111 @@ +//! Encoding/decoding functions for big-endian UTF-32. +//! +//! Because both utf8 and utf32 can represent the entirety of unicode, the +//! only possible error is when invalid utf32 is encountered when decoding +//! to utf8. + +use std; +use {DecodeError, DecodeResult, EncodeResult}; + +fn to_big_endian(n: u32) -> [u8; 4] { + use std::mem::transmute; + let ptr = unsafe { transmute::<*const u32, *const u8>(&n as *const u32) }; + if cfg!(target_endian = "little") { + unsafe { [*ptr.offset(3), *ptr.offset(2), *ptr.offset(1), *ptr] } + } else { + unsafe { [*ptr, *ptr.offset(1), *ptr.offset(2), *ptr.offset(3)] } + } +} + +fn from_big_endian(n: [u8; 4]) -> u32 { + use std::mem::transmute; + let mut x: u32 = 0; + let ptr = unsafe { transmute::<*mut u32, *mut u8>(&mut x as *mut u32) }; + if cfg!(target_endian = "little") { + unsafe { + *ptr = n[3]; + *ptr.offset(1) = n[2]; + *ptr.offset(2) = n[1]; + *ptr.offset(3) = n[0]; + } + } else { + unsafe { + *ptr = n[0]; + *ptr.offset(1) = n[1]; + *ptr.offset(2) = n[2]; + *ptr.offset(3) = n[3]; + } + } + x +} + +pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { + // Do the encode. 
+ let mut input_i = 0; + let mut output_i = 0; + for (offset, c) in input.char_indices() { + if (output_i + 3) < output.len() { + let mut code = to_big_endian(c as u32); + output[output_i] = code[0]; + output[output_i + 1] = code[1]; + output[output_i + 2] = code[2]; + output[output_i + 3] = code[3]; + output_i += 4; + input_i = offset; + } else { + break; + } + } + + // Calculate how much of the input was consumed. + input_i += 1; + if input_i > input.len() { + input_i = input.len(); + } else { + while !input.is_char_boundary(input_i) { + input_i += 1; + } + } + + Ok((input_i, &output[..output_i])) +} + +pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { + let mut input_i = 0; + let mut output_i = 0; + + // Loop through the input, getting 4 bytes at a time. + let mut itr = input.chunks(4); + while let Some(bytes) = itr.next() { + if bytes.len() < 4 { + break; + } + + // Do the decode. + if let Some(code) = + std::char::from_u32(from_big_endian([bytes[0], bytes[1], bytes[2], bytes[3]])) + { + // Encode to utf8. + let mut buf = [0u8; 4]; + let s = code.encode_utf8(&mut buf); + if (output_i + s.len()) > output.len() { + break; + } + output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes()); + + // Update our counters. + input_i += 4; + output_i += s.len(); + } else { + // Error: invalid codepoint. + return Err(DecodeError { + error_range: (input_i, input_i + 4), + output_bytes_written: output_i, + }); + } + } + + Ok((input_i, unsafe { + std::str::from_utf8_unchecked(&output[..output_i]) + })) +} diff --git a/sub_crates/text_encoding/src/utf32_le.rs b/sub_crates/text_encoding/src/utf32_le.rs new file mode 100644 index 0000000..fd34435 --- /dev/null +++ b/sub_crates/text_encoding/src/utf32_le.rs @@ -0,0 +1,111 @@ +//! Encoding/decoding functions for little-endian UTF-32. +//! +//! Because both utf8 and utf32 can represent the entirety of unicode, the +//! 
only possible error is when invalid utf32 is encountered when decoding +//! to utf8. + +use std; +use {DecodeError, DecodeResult, EncodeResult}; + +fn to_little_endian(n: u32) -> [u8; 4] { + use std::mem::transmute; + let ptr = unsafe { transmute::<*const u32, *const u8>(&n as *const u32) }; + if cfg!(target_endian = "little") { + unsafe { [*ptr, *ptr.offset(1), *ptr.offset(2), *ptr.offset(3)] } + } else { + unsafe { [*ptr.offset(3), *ptr.offset(2), *ptr.offset(1), *ptr] } + } +} + +fn from_little_endian(n: [u8; 4]) -> u32 { + use std::mem::transmute; + let mut x: u32 = 0; + let ptr = unsafe { transmute::<*mut u32, *mut u8>(&mut x as *mut u32) }; + if cfg!(target_endian = "little") { + unsafe { + *ptr = n[0]; + *ptr.offset(1) = n[1]; + *ptr.offset(2) = n[2]; + *ptr.offset(3) = n[3]; + } + } else { + unsafe { + *ptr = n[3]; + *ptr.offset(1) = n[2]; + *ptr.offset(2) = n[1]; + *ptr.offset(3) = n[0]; + } + } + x +} + +pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { + // Do the encode. + let mut input_i = 0; + let mut output_i = 0; + for (offset, c) in input.char_indices() { + if (output_i + 3) < output.len() { + let mut code = to_little_endian(c as u32); + output[output_i] = code[0]; + output[output_i + 1] = code[1]; + output[output_i + 2] = code[2]; + output[output_i + 3] = code[3]; + output_i += 4; + input_i = offset; + } else { + break; + } + } + + // Calculate how much of the input was consumed. + input_i += 1; + if input_i > input.len() { + input_i = input.len(); + } else { + while !input.is_char_boundary(input_i) { + input_i += 1; + } + } + + Ok((input_i, &output[..output_i])) +} + +pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { + let mut input_i = 0; + let mut output_i = 0; + + // Loop through the input, getting 4 bytes at a time. + let mut itr = input.chunks(4); + while let Some(bytes) = itr.next() { + if bytes.len() < 4 { + break; + } + + // Do the decode. 
+ if let Some(code) = + std::char::from_u32(from_little_endian([bytes[0], bytes[1], bytes[2], bytes[3]])) + { + // Encode to utf8. + let mut buf = [0u8; 4]; + let s = code.encode_utf8(&mut buf); + if (output_i + s.len()) > output.len() { + break; + } + output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes()); + + // Update our counters. + input_i += 4; + output_i += s.len(); + } else { + // Error: invalid codepoint. + return Err(DecodeError { + error_range: (input_i, input_i + 4), + output_bytes_written: output_i, + }); + } + } + + Ok((input_i, unsafe { + std::str::from_utf8_unchecked(&output[..output_i]) + })) +} diff --git a/sub_crates/text_encoding/src/utf8.rs b/sub_crates/text_encoding/src/utf8.rs index 830b03c..3cd65e9 100644 --- a/sub_crates/text_encoding/src/utf8.rs +++ b/sub_crates/text_encoding/src/utf8.rs @@ -6,7 +6,6 @@ use std; use {DecodeError, DecodeResult, EncodeResult}; -// Encode from utf8 pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { let copy_len = { if output.len() >= input.len() { diff --git a/sub_crates/text_encoding/tests/property_tests.rs b/sub_crates/text_encoding/tests/property_tests.rs index 340e6f2..576f574 100644 --- a/sub_crates/text_encoding/tests/property_tests.rs +++ b/sub_crates/text_encoding/tests/property_tests.rs @@ -104,6 +104,68 @@ proptest! 
{ assert_eq!(&text[..], &utf8[..]); } + #[test] + fn pt_utf32be_roundtrip(ref text in "\\PC*\\PC*\\PC*") { + let mut buf = [0u8; 32]; + let mut utf32: Vec = Vec::new(); + let mut utf8 = String::new(); + + // Encode to utf32 big endian + let mut tmp = &text[..]; + while !tmp.is_empty() { + if let Ok((n, encoded)) = encode_from_utf8(Encoding::Utf32BE, tmp, &mut buf) { + tmp = &tmp[n..]; + utf32.extend_from_slice(encoded); + } else { + panic!("Error when encoding."); + } + } + + // Decode back from utf32 big endian + let mut tmp = &utf32[..]; + while !tmp.is_empty() { + if let Ok((n, decoded)) = decode_to_utf8(Encoding::Utf32BE, tmp, &mut buf) { + tmp = &tmp[n..]; + utf8.extend(decoded.chars()); + } else { + panic!("Error when decoding."); + } + } + + assert_eq!(&text[..], &utf8[..]); + } + + #[test] + fn pt_utf32le_roundtrip(ref text in "\\PC*\\PC*\\PC*") { + let mut buf = [0u8; 32]; + let mut utf32: Vec = Vec::new(); + let mut utf8 = String::new(); + + // Encode to utf32 little endian + let mut tmp = &text[..]; + while !tmp.is_empty() { + if let Ok((n, encoded)) = encode_from_utf8(Encoding::Utf32LE, tmp, &mut buf) { + tmp = &tmp[n..]; + utf32.extend_from_slice(encoded); + } else { + panic!("Error when encoding."); + } + } + + // Decode back from utf32 little endian + let mut tmp = &utf32[..]; + while !tmp.is_empty() { + if let Ok((n, decoded)) = decode_to_utf8(Encoding::Utf32LE, tmp, &mut buf) { + tmp = &tmp[n..]; + utf8.extend(decoded.chars()); + } else { + panic!("Error when decoding."); + } + } + + assert_eq!(&text[..], &utf8[..]); + } + #[test] fn pt_latin1_roundtrip(ref data in vec(0u8..=255, 0..1000)) { let mut buf = [0u8; 32];