Added utf32 encoders/decoders to the text_encoding sub-crate.

This commit is contained in:
Nathan Vegdahl 2018-08-21 05:25:53 -07:00
parent 173837b827
commit 3a17ca9e8c
5 changed files with 292 additions and 3 deletions

View File

@ -4,6 +4,8 @@
mod latin1; mod latin1;
mod utf16_be; mod utf16_be;
mod utf16_le; mod utf16_le;
mod utf32_be;
mod utf32_le;
mod utf8; mod utf8;
mod windows1252; mod windows1252;
@ -17,6 +19,8 @@ pub fn encode_from_utf8<'a>(
Encoding::Utf8 => utf8::encode_from_utf8(input, output), Encoding::Utf8 => utf8::encode_from_utf8(input, output),
Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output), Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output), Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
Encoding::Utf32BE => utf32_be::encode_from_utf8(input, output),
Encoding::Utf32LE => utf32_le::encode_from_utf8(input, output),
Encoding::Latin1 => latin1::encode_from_utf8(input, output), Encoding::Latin1 => latin1::encode_from_utf8(input, output),
Encoding::Windows1252 => windows1252::encode_from_utf8(input, output), Encoding::Windows1252 => windows1252::encode_from_utf8(input, output),
} }
@ -32,6 +36,8 @@ pub fn decode_to_utf8<'a>(
Encoding::Utf8 => utf8::decode_to_utf8(input, output), Encoding::Utf8 => utf8::decode_to_utf8(input, output),
Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output), Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output), Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
Encoding::Utf32BE => utf32_be::decode_to_utf8(input, output),
Encoding::Utf32LE => utf32_le::decode_to_utf8(input, output),
Encoding::Latin1 => latin1::decode_to_utf8(input, output), Encoding::Latin1 => latin1::decode_to_utf8(input, output),
Encoding::Windows1252 => windows1252::decode_to_utf8(input, output), Encoding::Windows1252 => windows1252::decode_to_utf8(input, output),
} }
@ -43,8 +49,8 @@ pub enum Encoding {
Utf8, Utf8,
Utf16BE, // Big endian Utf16BE, // Big endian
Utf16LE, // Little endian Utf16LE, // Little endian
// Utf32BE, // Big endian Utf32BE, // Big endian
// Utf32LE, // Little endian Utf32LE, // Little endian
// ShiftJIS, // ShiftJIS,
// EUC_JP, // EUC_JP,
// Big5, // Big5,

View File

@ -0,0 +1,111 @@
//! Encoding/decoding functions for big-endian UTF-32.
//!
//! Because both utf8 and utf32 can represent the entirety of unicode, the
//! only possible error is when invalid utf32 is encountered when decoding
//! to utf8.
use std;
use {DecodeError, DecodeResult, EncodeResult};
/// Splits `n` into its four bytes, most-significant byte first.
///
/// Implemented with plain shifts, so the result is the same on every host
/// regardless of native endianness and no `unsafe` is needed.
fn to_big_endian(n: u32) -> [u8; 4] {
    // Each `as u8` cast intentionally keeps only the low 8 bits.
    [(n >> 24) as u8, (n >> 16) as u8, (n >> 8) as u8, n as u8]
}
/// Assembles a `u32` from four bytes given most-significant byte first.
///
/// Implemented with plain shifts, so the result is the same on every host
/// regardless of native endianness and no `unsafe` is needed.
fn from_big_endian(n: [u8; 4]) -> u32 {
    (u32::from(n[0]) << 24)
        | (u32::from(n[1]) << 16)
        | (u32::from(n[2]) << 8)
        | u32::from(n[3])
}
/// Encodes `input` as big-endian UTF-32 into `output`.
///
/// Characters are written as consecutive 4-byte units until either the input
/// is exhausted or fewer than 4 bytes of `output` remain.
///
/// Returns `Ok((consumed, encoded))`, where `consumed` is the number of bytes
/// of `input` that were actually encoded (always on a char boundary) and
/// `encoded` is the written prefix of `output`. If `output` cannot hold even
/// one unit, `consumed` is `0` and `encoded` is empty, so no input is ever
/// reported as consumed without having been written.
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;
    for (offset, c) in input.char_indices() {
        // `output_i` never exceeds `output.len()`, so this cannot underflow.
        if output.len() - output_i < 4 {
            break;
        }
        let code = to_big_endian(c as u32);
        output[output_i..output_i + 4].copy_from_slice(&code);
        output_i += 4;
        // Advance past this char only once it has actually been written.
        input_i = offset + c.len_utf8();
    }

    Ok((input_i, &output[..output_i]))
}
/// Decodes big-endian UTF-32 `input` into UTF-8 written to `output`.
///
/// The input is read one 4-byte unit at a time. Decoding stops early, without
/// error, when fewer than 4 input bytes remain or when the next character's
/// UTF-8 form does not fit in what is left of `output`.
///
/// Returns `Ok((consumed, text))`, where `consumed` is the number of input
/// bytes decoded and `text` is the written prefix of `output` as a `&str`.
///
/// # Errors
///
/// Returns a `DecodeError` when a unit is not a valid `char`. Its
/// `error_range` covers the offending 4 input bytes and
/// `output_bytes_written` is the amount of output produced before the error.
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut consumed = 0;
    let mut written = 0;

    for unit in input.chunks(4) {
        // A short trailing chunk is an incomplete unit; leave it unconsumed.
        if unit.len() < 4 {
            break;
        }

        let value = from_big_endian([unit[0], unit[1], unit[2], unit[3]]);
        let ch = match std::char::from_u32(value) {
            Some(ch) => ch,
            None => {
                // Error: invalid codepoint.
                return Err(DecodeError {
                    error_range: (consumed, consumed + 4),
                    output_bytes_written: written,
                });
            }
        };

        // Re-encode the character as UTF-8 in a scratch buffer first, so we
        // only touch `output` when the whole character fits.
        let mut scratch = [0u8; 4];
        let utf8 = ch.encode_utf8(&mut scratch);
        let end = written + utf8.len();
        if end > output.len() {
            break;
        }
        output[written..end].copy_from_slice(utf8.as_bytes());

        consumed += 4;
        written = end;
    }

    // SAFETY: `output[..written]` consists solely of the concatenated results
    // of `char::encode_utf8`, so it is valid UTF-8.
    Ok((consumed, unsafe {
        std::str::from_utf8_unchecked(&output[..written])
    }))
}

View File

@ -0,0 +1,111 @@
//! Encoding/decoding functions for little-endian UTF-32.
//!
//! Because both utf8 and utf32 can represent the entirety of unicode, the
//! only possible error is when invalid utf32 is encountered when decoding
//! to utf8.
use std;
use {DecodeError, DecodeResult, EncodeResult};
/// Splits `n` into its four bytes, least-significant byte first.
///
/// Implemented with plain shifts, so the result is the same on every host
/// regardless of native endianness and no `unsafe` is needed.
fn to_little_endian(n: u32) -> [u8; 4] {
    // Each `as u8` cast intentionally keeps only the low 8 bits.
    [n as u8, (n >> 8) as u8, (n >> 16) as u8, (n >> 24) as u8]
}
/// Assembles a `u32` from four bytes given least-significant byte first.
///
/// Implemented with plain shifts, so the result is the same on every host
/// regardless of native endianness and no `unsafe` is needed.
fn from_little_endian(n: [u8; 4]) -> u32 {
    u32::from(n[0])
        | (u32::from(n[1]) << 8)
        | (u32::from(n[2]) << 16)
        | (u32::from(n[3]) << 24)
}
/// Encodes `input` as little-endian UTF-32 into `output`.
///
/// Characters are written as consecutive 4-byte units until either the input
/// is exhausted or fewer than 4 bytes of `output` remain.
///
/// Returns `Ok((consumed, encoded))`, where `consumed` is the number of bytes
/// of `input` that were actually encoded (always on a char boundary) and
/// `encoded` is the written prefix of `output`. If `output` cannot hold even
/// one unit, `consumed` is `0` and `encoded` is empty, so no input is ever
/// reported as consumed without having been written.
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;
    for (offset, c) in input.char_indices() {
        // `output_i` never exceeds `output.len()`, so this cannot underflow.
        if output.len() - output_i < 4 {
            break;
        }
        let code = to_little_endian(c as u32);
        output[output_i..output_i + 4].copy_from_slice(&code);
        output_i += 4;
        // Advance past this char only once it has actually been written.
        input_i = offset + c.len_utf8();
    }

    Ok((input_i, &output[..output_i]))
}
/// Decodes little-endian UTF-32 `input` into UTF-8 written to `output`.
///
/// The input is read one 4-byte unit at a time. Decoding stops early, without
/// error, when fewer than 4 input bytes remain or when the next character's
/// UTF-8 form does not fit in what is left of `output`.
///
/// Returns `Ok((consumed, text))`, where `consumed` is the number of input
/// bytes decoded and `text` is the written prefix of `output` as a `&str`.
///
/// # Errors
///
/// Returns a `DecodeError` when a unit is not a valid `char`. Its
/// `error_range` covers the offending 4 input bytes and
/// `output_bytes_written` is the amount of output produced before the error.
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut consumed = 0;
    let mut written = 0;

    for unit in input.chunks(4) {
        // A short trailing chunk is an incomplete unit; leave it unconsumed.
        if unit.len() < 4 {
            break;
        }

        let value = from_little_endian([unit[0], unit[1], unit[2], unit[3]]);
        let ch = match std::char::from_u32(value) {
            Some(ch) => ch,
            None => {
                // Error: invalid codepoint.
                return Err(DecodeError {
                    error_range: (consumed, consumed + 4),
                    output_bytes_written: written,
                });
            }
        };

        // Re-encode the character as UTF-8 in a scratch buffer first, so we
        // only touch `output` when the whole character fits.
        let mut scratch = [0u8; 4];
        let utf8 = ch.encode_utf8(&mut scratch);
        let end = written + utf8.len();
        if end > output.len() {
            break;
        }
        output[written..end].copy_from_slice(utf8.as_bytes());

        consumed += 4;
        written = end;
    }

    // SAFETY: `output[..written]` consists solely of the concatenated results
    // of `char::encode_utf8`, so it is valid UTF-8.
    Ok((consumed, unsafe {
        std::str::from_utf8_unchecked(&output[..written])
    }))
}

View File

@ -6,7 +6,6 @@
use std; use std;
use {DecodeError, DecodeResult, EncodeResult}; use {DecodeError, DecodeResult, EncodeResult};
// Encode from utf8
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
let copy_len = { let copy_len = {
if output.len() >= input.len() { if output.len() >= input.len() {

View File

@ -104,6 +104,68 @@ proptest! {
assert_eq!(&text[..], &utf8[..]); assert_eq!(&text[..], &utf8[..]);
} }
#[test]
fn pt_utf32be_roundtrip(ref text in "\\PC*\\PC*\\PC*") {
    // Property: encoding generated text to big-endian UTF-32 and decoding it
    // back, in chunks through one 32-byte scratch buffer, reproduces the text.
    let mut buf = [0u8; 32];
    let mut utf32: Vec<u8> = Vec::new();
    let mut utf8 = String::new();

    // Encode to utf32 big endian, one buffer-full per call.
    let mut pending_text = &text[..];
    while !pending_text.is_empty() {
        match encode_from_utf8(Encoding::Utf32BE, pending_text, &mut buf) {
            Ok((n, encoded)) => {
                pending_text = &pending_text[n..];
                utf32.extend_from_slice(encoded);
            }
            Err(_) => panic!("Error when encoding."),
        }
    }

    // Decode back from utf32 big endian, one buffer-full per call.
    let mut pending_bytes = &utf32[..];
    while !pending_bytes.is_empty() {
        match decode_to_utf8(Encoding::Utf32BE, pending_bytes, &mut buf) {
            Ok((n, decoded)) => {
                pending_bytes = &pending_bytes[n..];
                utf8.push_str(decoded);
            }
            Err(_) => panic!("Error when decoding."),
        }
    }

    assert_eq!(&text[..], &utf8[..]);
}
#[test]
fn pt_utf32le_roundtrip(ref text in "\\PC*\\PC*\\PC*") {
    // Property: encoding generated text to little-endian UTF-32 and decoding it
    // back, in chunks through one 32-byte scratch buffer, reproduces the text.
    let mut buf = [0u8; 32];
    let mut utf32: Vec<u8> = Vec::new();
    let mut utf8 = String::new();

    // Encode to utf32 little endian, one buffer-full per call.
    let mut pending_text = &text[..];
    while !pending_text.is_empty() {
        match encode_from_utf8(Encoding::Utf32LE, pending_text, &mut buf) {
            Ok((n, encoded)) => {
                pending_text = &pending_text[n..];
                utf32.extend_from_slice(encoded);
            }
            Err(_) => panic!("Error when encoding."),
        }
    }

    // Decode back from utf32 little endian, one buffer-full per call.
    let mut pending_bytes = &utf32[..];
    while !pending_bytes.is_empty() {
        match decode_to_utf8(Encoding::Utf32LE, pending_bytes, &mut buf) {
            Ok((n, decoded)) => {
                pending_bytes = &pending_bytes[n..];
                utf8.push_str(decoded);
            }
            Err(_) => panic!("Error when decoding."),
        }
    }

    assert_eq!(&text[..], &utf8[..]);
}
#[test] #[test]
fn pt_latin1_roundtrip(ref data in vec(0u8..=255, 0..1000)) { fn pt_latin1_roundtrip(ref data in vec(0u8..=255, 0..1000)) {
let mut buf = [0u8; 32]; let mut buf = [0u8; 32];