Added utf32 encoders/decoders to the text_encoding sub-crate.
This commit is contained in:
parent
173837b827
commit
3a17ca9e8c
|
@ -4,6 +4,8 @@
|
|||
mod latin1;
|
||||
mod utf16_be;
|
||||
mod utf16_le;
|
||||
mod utf32_be;
|
||||
mod utf32_le;
|
||||
mod utf8;
|
||||
mod windows1252;
|
||||
|
||||
|
@ -17,6 +19,8 @@ pub fn encode_from_utf8<'a>(
|
|||
Encoding::Utf8 => utf8::encode_from_utf8(input, output),
|
||||
Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
|
||||
Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
|
||||
Encoding::Utf32BE => utf32_be::encode_from_utf8(input, output),
|
||||
Encoding::Utf32LE => utf32_le::encode_from_utf8(input, output),
|
||||
Encoding::Latin1 => latin1::encode_from_utf8(input, output),
|
||||
Encoding::Windows1252 => windows1252::encode_from_utf8(input, output),
|
||||
}
|
||||
|
@ -32,6 +36,8 @@ pub fn decode_to_utf8<'a>(
|
|||
Encoding::Utf8 => utf8::decode_to_utf8(input, output),
|
||||
Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
|
||||
Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
|
||||
Encoding::Utf32BE => utf32_be::decode_to_utf8(input, output),
|
||||
Encoding::Utf32LE => utf32_le::decode_to_utf8(input, output),
|
||||
Encoding::Latin1 => latin1::decode_to_utf8(input, output),
|
||||
Encoding::Windows1252 => windows1252::decode_to_utf8(input, output),
|
||||
}
|
||||
|
@ -43,8 +49,8 @@ pub enum Encoding {
|
|||
Utf8,
|
||||
Utf16BE, // Big endian
|
||||
Utf16LE, // Little endian
|
||||
// Utf32BE, // Big endian
|
||||
// Utf32LE, // Little endian
|
||||
Utf32BE, // Big endian
|
||||
Utf32LE, // Little endian
|
||||
// ShiftJIS,
|
||||
// EUC_JP,
|
||||
// Big5,
|
||||
|
|
111
sub_crates/text_encoding/src/utf32_be.rs
Normal file
111
sub_crates/text_encoding/src/utf32_be.rs
Normal file
|
@ -0,0 +1,111 @@
|
|||
//! Encoding/decoding functions for big-endian UTF-32.
|
||||
//!
|
||||
//! Because both utf8 and utf32 can represent the entirety of unicode, the
|
||||
//! only possible error is when invalid utf32 is encountered when decoding
|
||||
//! to utf8.
|
||||
|
||||
use std;
|
||||
use {DecodeError, DecodeResult, EncodeResult};
|
||||
|
||||
/// Splits `n` into its four bytes, most-significant byte first.
///
/// The bytes are extracted with shifts instead of by reinterpreting the
/// integer's memory, so the result does not depend on the host's endianness
/// and no `unsafe` code is required.
fn to_big_endian(n: u32) -> [u8; 4] {
    // Each `as u8` deliberately truncates to the low eight bits of the
    // shifted value.
    [
        (n >> 24) as u8,
        (n >> 16) as u8,
        (n >> 8) as u8,
        n as u8,
    ]
}
|
||||
|
||||
/// Assembles a `u32` from four bytes given most-significant byte first.
///
/// The value is built with shifts instead of by writing through a raw
/// pointer, so the result does not depend on the host's endianness and no
/// `unsafe` code is required.
fn from_big_endian(n: [u8; 4]) -> u32 {
    (u32::from(n[0]) << 24)
        | (u32::from(n[1]) << 16)
        | (u32::from(n[2]) << 8)
        | u32::from(n[3])
}
|
||||
|
||||
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||
// Do the encode.
|
||||
let mut input_i = 0;
|
||||
let mut output_i = 0;
|
||||
for (offset, c) in input.char_indices() {
|
||||
if (output_i + 3) < output.len() {
|
||||
let mut code = to_big_endian(c as u32);
|
||||
output[output_i] = code[0];
|
||||
output[output_i + 1] = code[1];
|
||||
output[output_i + 2] = code[2];
|
||||
output[output_i + 3] = code[3];
|
||||
output_i += 4;
|
||||
input_i = offset;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate how much of the input was consumed.
|
||||
input_i += 1;
|
||||
if input_i > input.len() {
|
||||
input_i = input.len();
|
||||
} else {
|
||||
while !input.is_char_boundary(input_i) {
|
||||
input_i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((input_i, &output[..output_i]))
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
||||
let mut input_i = 0;
|
||||
let mut output_i = 0;
|
||||
|
||||
// Loop through the input, getting 4 bytes at a time.
|
||||
let mut itr = input.chunks(4);
|
||||
while let Some(bytes) = itr.next() {
|
||||
if bytes.len() < 4 {
|
||||
break;
|
||||
}
|
||||
|
||||
// Do the decode.
|
||||
if let Some(code) =
|
||||
std::char::from_u32(from_big_endian([bytes[0], bytes[1], bytes[2], bytes[3]]))
|
||||
{
|
||||
// Encode to utf8.
|
||||
let mut buf = [0u8; 4];
|
||||
let s = code.encode_utf8(&mut buf);
|
||||
if (output_i + s.len()) > output.len() {
|
||||
break;
|
||||
}
|
||||
output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes());
|
||||
|
||||
// Update our counters.
|
||||
input_i += 4;
|
||||
output_i += s.len();
|
||||
} else {
|
||||
// Error: invalid codepoint.
|
||||
return Err(DecodeError {
|
||||
error_range: (input_i, input_i + 4),
|
||||
output_bytes_written: output_i,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok((input_i, unsafe {
|
||||
std::str::from_utf8_unchecked(&output[..output_i])
|
||||
}))
|
||||
}
|
111
sub_crates/text_encoding/src/utf32_le.rs
Normal file
111
sub_crates/text_encoding/src/utf32_le.rs
Normal file
|
@ -0,0 +1,111 @@
|
|||
//! Encoding/decoding functions for little-endian UTF-32.
|
||||
//!
|
||||
//! Because both utf8 and utf32 can represent the entirety of unicode, the
|
||||
//! only possible error is when invalid utf32 is encountered when decoding
|
||||
//! to utf8.
|
||||
|
||||
use std;
|
||||
use {DecodeError, DecodeResult, EncodeResult};
|
||||
|
||||
/// Splits `n` into its four bytes, least-significant byte first.
///
/// The bytes are extracted with shifts instead of by reinterpreting the
/// integer's memory, so the result does not depend on the host's endianness
/// and no `unsafe` code is required.
fn to_little_endian(n: u32) -> [u8; 4] {
    // Each `as u8` deliberately truncates to the low eight bits of the
    // shifted value.
    [
        n as u8,
        (n >> 8) as u8,
        (n >> 16) as u8,
        (n >> 24) as u8,
    ]
}
|
||||
|
||||
/// Assembles a `u32` from four bytes given least-significant byte first.
///
/// The value is built with shifts instead of by writing through a raw
/// pointer, so the result does not depend on the host's endianness and no
/// `unsafe` code is required.
fn from_little_endian(n: [u8; 4]) -> u32 {
    u32::from(n[0])
        | (u32::from(n[1]) << 8)
        | (u32::from(n[2]) << 16)
        | (u32::from(n[3]) << 24)
}
|
||||
|
||||
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||
// Do the encode.
|
||||
let mut input_i = 0;
|
||||
let mut output_i = 0;
|
||||
for (offset, c) in input.char_indices() {
|
||||
if (output_i + 3) < output.len() {
|
||||
let mut code = to_little_endian(c as u32);
|
||||
output[output_i] = code[0];
|
||||
output[output_i + 1] = code[1];
|
||||
output[output_i + 2] = code[2];
|
||||
output[output_i + 3] = code[3];
|
||||
output_i += 4;
|
||||
input_i = offset;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate how much of the input was consumed.
|
||||
input_i += 1;
|
||||
if input_i > input.len() {
|
||||
input_i = input.len();
|
||||
} else {
|
||||
while !input.is_char_boundary(input_i) {
|
||||
input_i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((input_i, &output[..output_i]))
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
||||
let mut input_i = 0;
|
||||
let mut output_i = 0;
|
||||
|
||||
// Loop through the input, getting 4 bytes at a time.
|
||||
let mut itr = input.chunks(4);
|
||||
while let Some(bytes) = itr.next() {
|
||||
if bytes.len() < 4 {
|
||||
break;
|
||||
}
|
||||
|
||||
// Do the decode.
|
||||
if let Some(code) =
|
||||
std::char::from_u32(from_little_endian([bytes[0], bytes[1], bytes[2], bytes[3]]))
|
||||
{
|
||||
// Encode to utf8.
|
||||
let mut buf = [0u8; 4];
|
||||
let s = code.encode_utf8(&mut buf);
|
||||
if (output_i + s.len()) > output.len() {
|
||||
break;
|
||||
}
|
||||
output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes());
|
||||
|
||||
// Update our counters.
|
||||
input_i += 4;
|
||||
output_i += s.len();
|
||||
} else {
|
||||
// Error: invalid codepoint.
|
||||
return Err(DecodeError {
|
||||
error_range: (input_i, input_i + 4),
|
||||
output_bytes_written: output_i,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok((input_i, unsafe {
|
||||
std::str::from_utf8_unchecked(&output[..output_i])
|
||||
}))
|
||||
}
|
|
@ -6,7 +6,6 @@
|
|||
use std;
|
||||
use {DecodeError, DecodeResult, EncodeResult};
|
||||
|
||||
// Encode from utf8
|
||||
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||
let copy_len = {
|
||||
if output.len() >= input.len() {
|
||||
|
|
|
@ -104,6 +104,68 @@ proptest! {
|
|||
assert_eq!(&text[..], &utf8[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn pt_utf32be_roundtrip(ref text in "\\PC*\\PC*\\PC*") {
|
||||
let mut buf = [0u8; 32];
|
||||
let mut utf32: Vec<u8> = Vec::new();
|
||||
let mut utf8 = String::new();
|
||||
|
||||
// Encode to utf32 big endian
|
||||
let mut tmp = &text[..];
|
||||
while !tmp.is_empty() {
|
||||
if let Ok((n, encoded)) = encode_from_utf8(Encoding::Utf32BE, tmp, &mut buf) {
|
||||
tmp = &tmp[n..];
|
||||
utf32.extend_from_slice(encoded);
|
||||
} else {
|
||||
panic!("Error when encoding.");
|
||||
}
|
||||
}
|
||||
|
||||
// Decode back from utf32 big endian
|
||||
let mut tmp = &utf32[..];
|
||||
while !tmp.is_empty() {
|
||||
if let Ok((n, decoded)) = decode_to_utf8(Encoding::Utf32BE, tmp, &mut buf) {
|
||||
tmp = &tmp[n..];
|
||||
utf8.extend(decoded.chars());
|
||||
} else {
|
||||
panic!("Error when decoding.");
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(&text[..], &utf8[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn pt_utf32le_roundtrip(ref text in "\\PC*\\PC*\\PC*") {
|
||||
let mut buf = [0u8; 32];
|
||||
let mut utf32: Vec<u8> = Vec::new();
|
||||
let mut utf8 = String::new();
|
||||
|
||||
// Encode to utf32 little endian
|
||||
let mut tmp = &text[..];
|
||||
while !tmp.is_empty() {
|
||||
if let Ok((n, encoded)) = encode_from_utf8(Encoding::Utf32LE, tmp, &mut buf) {
|
||||
tmp = &tmp[n..];
|
||||
utf32.extend_from_slice(encoded);
|
||||
} else {
|
||||
panic!("Error when encoding.");
|
||||
}
|
||||
}
|
||||
|
||||
// Decode back from utf32 little endian
|
||||
let mut tmp = &utf32[..];
|
||||
while !tmp.is_empty() {
|
||||
if let Ok((n, decoded)) = decode_to_utf8(Encoding::Utf32LE, tmp, &mut buf) {
|
||||
tmp = &tmp[n..];
|
||||
utf8.extend(decoded.chars());
|
||||
} else {
|
||||
panic!("Error when decoding.");
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(&text[..], &utf8[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn pt_latin1_roundtrip(ref data in vec(0u8..=255, 0..1000)) {
|
||||
let mut buf = [0u8; 32];
|
||||
|
|
Loading…
Reference in New Issue
Block a user