From fc07ee344450149d74fbb8d2f6c76486c26ba83a Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Tue, 21 Aug 2018 21:13:00 -0700 Subject: [PATCH] Reorg text_encoding sub-crate a bit and make it no_std. --- sub_crates/text_encoding/src/latin1.rs | 4 +- sub_crates/text_encoding/src/lib.rs | 3 + sub_crates/text_encoding/src/utf16_be.rs | 47 ++------ sub_crates/text_encoding/src/utf16_le.rs | 47 ++------ sub_crates/text_encoding/src/utf32_be.rs | 45 ++------ sub_crates/text_encoding/src/utf32_le.rs | 45 ++------ sub_crates/text_encoding/src/utf8.rs | 8 +- sub_crates/text_encoding/src/utils.rs | 121 ++++++++++++++++++++ sub_crates/text_encoding/src/windows1252.rs | 4 +- 9 files changed, 166 insertions(+), 158 deletions(-) create mode 100644 sub_crates/text_encoding/src/utils.rs diff --git a/sub_crates/text_encoding/src/latin1.rs b/sub_crates/text_encoding/src/latin1.rs index 71df924..27bb0c0 100644 --- a/sub_crates/text_encoding/src/latin1.rs +++ b/sub_crates/text_encoding/src/latin1.rs @@ -5,7 +5,7 @@ //! decoding cannot fail. However, encoding will fail with scalar values //! greater than 255. -use std; +use core; use {DecodeResult, EncodeError, EncodeResult}; pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { @@ -66,6 +66,6 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } Ok((input_i, unsafe { - std::str::from_utf8_unchecked(&output[..output_i]) + core::str::from_utf8_unchecked(&output[..output_i]) })) } diff --git a/sub_crates/text_encoding/src/lib.rs b/sub_crates/text_encoding/src/lib.rs index 2013bb3..1561e13 100644 --- a/sub_crates/text_encoding/src/lib.rs +++ b/sub_crates/text_encoding/src/lib.rs @@ -1,3 +1,5 @@ +#![no_std] + //! A library for incrementally encoding/decoding between utf8 and various //! text encodings. @@ -7,6 +9,7 @@ mod utf16_le; mod utf32_be; mod utf32_le; mod utf8; +mod utils; mod windows1252; /// Encodes text from utf8 to a destination encoding. diff --git a/sub_crates/text_encoding/src/utf16_be.rs b/sub_crates/text_encoding/src/utf16_be.rs index 01e55ce..3b23e3f 100644 --- a/sub_crates/text_encoding/src/utf16_be.rs +++ b/sub_crates/text_encoding/src/utf16_be.rs @@ -4,37 +4,10 @@ //! only possible error is when invalid utf16 is encountered when decoding //! to utf8. -use std; +use core; +use utils::{from_big_endian_u16, to_big_endian_u16}; use {DecodeError, DecodeResult, EncodeResult}; -fn to_big_endian(n: u16) -> [u8; 2] { - use std::mem::transmute; - let ptr = unsafe { transmute::<*const u16, *const u8>(&n as *const u16) }; - if cfg!(target_endian = "little") { - unsafe { [*ptr.offset(1), *ptr] } - } else { - unsafe { [*ptr, *ptr.offset(1)] } - } -} - -fn from_big_endian(n: [u8; 2]) -> u16 { - use std::mem::transmute; - let mut x: u16 = 0; - let ptr = unsafe { transmute::<*mut u16, *mut u8>(&mut x as *mut u16) }; - if cfg!(target_endian = "little") { - unsafe { - *ptr = n[1]; - *ptr.offset(1) = n[0]; - } - } else { - unsafe { - *ptr = n[0]; - *ptr.offset(1) = n[1]; - } - } - x -} - pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { // Do the encode. let mut input_i = 0; @@ -44,7 +17,7 @@ pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<' if code <= 0xFFFF { // One code unit if (output_i + 1) < output.len() { - let val = to_big_endian(code as u16); + let val = to_big_endian_u16(code as u16); output[output_i] = val[0]; output[output_i + 1] = val[1]; output_i += 2; @@ -55,8 +28,8 @@ pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<' } else if (output_i + 3) < output.len() { // Two code units code -= 0x10000; - let first = to_big_endian(0xD800 | ((code >> 10) as u16)); - let second = to_big_endian(0xDC00 | ((code as u16) & 0x3FF)); + let first = to_big_endian_u16(0xD800 | ((code >> 10) as u16)); + let second = to_big_endian_u16(0xDC00 | ((code as u16) & 0x3FF)); output[output_i] = first[0]; output[output_i + 1] = first[1]; output[output_i + 2] = second[0]; @@ -94,10 +67,10 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a // Decode to scalar value. let code = { - let code_1 = from_big_endian([bytes[0], bytes[1]]); + let code_1 = from_big_endian_u16([bytes[0], bytes[1]]); if code_1 < 0xD800 || code_1 > 0xDFFF { // Single code unit. - unsafe { std::char::from_u32_unchecked(code_1 as u32) } + unsafe { core::char::from_u32_unchecked(code_1 as u32) } } else if (code_1 & 0xFC00) == 0xDC00 { // Error: orphaned second half of a surrogate pair. return Err(DecodeError { @@ -112,7 +85,7 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a break; } let bytes_2 = itr.next().unwrap(); - let code_2 = from_big_endian([bytes_2[0], bytes_2[1]]); + let code_2 = from_big_endian_u16([bytes_2[0], bytes_2[1]]); if (code_2 & 0xFC00) != 0xDC00 { // Error: second half is not valid surrogate. return Err(DecodeError { @@ -122,7 +95,7 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } unsafe { - std::char::from_u32_unchecked( + core::char::from_u32_unchecked( (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000, ) } @@ -143,6 +116,6 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } Ok((input_i, unsafe { - std::str::from_utf8_unchecked(&output[..output_i]) + core::str::from_utf8_unchecked(&output[..output_i]) })) } diff --git a/sub_crates/text_encoding/src/utf16_le.rs b/sub_crates/text_encoding/src/utf16_le.rs index b3dd4f5..6781e45 100644 --- a/sub_crates/text_encoding/src/utf16_le.rs +++ b/sub_crates/text_encoding/src/utf16_le.rs @@ -4,37 +4,10 @@ //! only possible error is when invalid utf16 is encountered when decoding //! to utf8. -use std; +use core; +use utils::{from_little_endian_u16, to_little_endian_u16}; use {DecodeError, DecodeResult, EncodeResult}; -fn to_little_endian(n: u16) -> [u8; 2] { - use std::mem::transmute; - let ptr = unsafe { transmute::<*const u16, *const u8>(&n as *const u16) }; - if cfg!(target_endian = "little") { - unsafe { [*ptr, *ptr.offset(1)] } - } else { - unsafe { [*ptr.offset(1), *ptr] } - } -} - -fn from_little_endian(n: [u8; 2]) -> u16 { - use std::mem::transmute; - let mut x: u16 = 0; - let ptr = unsafe { transmute::<*mut u16, *mut u8>(&mut x as *mut u16) }; - if cfg!(target_endian = "little") { - unsafe { - *ptr = n[0]; - *ptr.offset(1) = n[1]; - } - } else { - unsafe { - *ptr = n[1]; - *ptr.offset(1) = n[0]; - } - } - x -} - pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { // Do the encode. let mut input_i = 0; @@ -44,7 +17,7 @@ pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<' if code <= 0xFFFF { // One code unit if (output_i + 1) < output.len() { - let val = to_little_endian(code as u16); + let val = to_little_endian_u16(code as u16); output[output_i] = val[0]; output[output_i + 1] = val[1]; output_i += 2; @@ -55,8 +28,8 @@ pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<' } else if (output_i + 3) < output.len() { // Two code units code -= 0x10000; - let first = to_little_endian(0xD800 | ((code >> 10) as u16)); - let second = to_little_endian(0xDC00 | ((code as u16) & 0x3FF)); + let first = to_little_endian_u16(0xD800 | ((code >> 10) as u16)); + let second = to_little_endian_u16(0xDC00 | ((code as u16) & 0x3FF)); output[output_i] = first[0]; output[output_i + 1] = first[1]; output[output_i + 2] = second[0]; @@ -94,10 +67,10 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a // Decode to scalar value. let code = { - let code_1 = from_little_endian([bytes[0], bytes[1]]); + let code_1 = from_little_endian_u16([bytes[0], bytes[1]]); if code_1 < 0xD800 || code_1 > 0xDFFF { // Single code unit. - unsafe { std::char::from_u32_unchecked(code_1 as u32) } + unsafe { core::char::from_u32_unchecked(code_1 as u32) } } else if (code_1 & 0xFC00) == 0xDC00 { // Error: orphaned second half of a surrogate pair. return Err(DecodeError { @@ -112,7 +85,7 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a break; } let bytes_2 = itr.next().unwrap(); - let code_2 = from_little_endian([bytes_2[0], bytes_2[1]]); + let code_2 = from_little_endian_u16([bytes_2[0], bytes_2[1]]); if (code_2 & 0xFC00) != 0xDC00 { // Error: second half is not valid surrogate. return Err(DecodeError { @@ -122,7 +95,7 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } unsafe { - std::char::from_u32_unchecked( + core::char::from_u32_unchecked( (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000, ) } @@ -143,6 +116,6 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } Ok((input_i, unsafe { - std::str::from_utf8_unchecked(&output[..output_i]) + core::str::from_utf8_unchecked(&output[..output_i]) })) } diff --git a/sub_crates/text_encoding/src/utf32_be.rs b/sub_crates/text_encoding/src/utf32_be.rs index 69e9b7a..12e0254 100644 --- a/sub_crates/text_encoding/src/utf32_be.rs +++ b/sub_crates/text_encoding/src/utf32_be.rs @@ -4,48 +4,17 @@ //! only possible error is when invalid utf32 is encountered when decoding //! to utf8. -use std; +use core; +use utils::{from_big_endian_u32, to_big_endian_u32}; use {DecodeError, DecodeResult, EncodeResult}; -fn to_big_endian(n: u32) -> [u8; 4] { - use std::mem::transmute; - let ptr = unsafe { transmute::<*const u32, *const u8>(&n as *const u32) }; - if cfg!(target_endian = "little") { - unsafe { [*ptr.offset(3), *ptr.offset(2), *ptr.offset(1), *ptr] } - } else { - unsafe { [*ptr, *ptr.offset(1), *ptr.offset(2), *ptr.offset(3)] } - } -} - -fn from_big_endian(n: [u8; 4]) -> u32 { - use std::mem::transmute; - let mut x: u32 = 0; - let ptr = unsafe { transmute::<*mut u32, *mut u8>(&mut x as *mut u32) }; - if cfg!(target_endian = "little") { - unsafe { - *ptr = n[3]; - *ptr.offset(1) = n[2]; - *ptr.offset(2) = n[1]; - *ptr.offset(3) = n[0]; - } - } else { - unsafe { - *ptr = n[0]; - *ptr.offset(1) = n[1]; - *ptr.offset(2) = n[2]; - *ptr.offset(3) = n[3]; - } - } - x -} - pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { // Do the encode. let mut input_i = 0; let mut output_i = 0; for (offset, c) in input.char_indices() { if (output_i + 3) < output.len() { - let mut code = to_big_endian(c as u32); + let mut code = to_big_endian_u32(c as u32); output[output_i] = code[0]; output[output_i + 1] = code[1]; output[output_i + 2] = code[2]; @@ -82,9 +51,9 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } // Do the decode. - if let Some(code) = - std::char::from_u32(from_big_endian([bytes[0], bytes[1], bytes[2], bytes[3]])) - { + if let Some(code) = core::char::from_u32(from_big_endian_u32([ + bytes[0], bytes[1], bytes[2], bytes[3], + ])) { // Encode to utf8. let mut buf = [0u8; 4]; let s = code.encode_utf8(&mut buf); @@ -106,6 +75,6 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } Ok((input_i, unsafe { - std::str::from_utf8_unchecked(&output[..output_i]) + core::str::from_utf8_unchecked(&output[..output_i]) })) } diff --git a/sub_crates/text_encoding/src/utf32_le.rs b/sub_crates/text_encoding/src/utf32_le.rs index fd34435..ba5ff0a 100644 --- a/sub_crates/text_encoding/src/utf32_le.rs +++ b/sub_crates/text_encoding/src/utf32_le.rs @@ -4,48 +4,17 @@ //! only possible error is when invalid utf32 is encountered when decoding //! to utf8. -use std; +use core; +use utils::{from_little_endian_u32, to_little_endian_u32}; use {DecodeError, DecodeResult, EncodeResult}; -fn to_little_endian(n: u32) -> [u8; 4] { - use std::mem::transmute; - let ptr = unsafe { transmute::<*const u32, *const u8>(&n as *const u32) }; - if cfg!(target_endian = "little") { - unsafe { [*ptr, *ptr.offset(1), *ptr.offset(2), *ptr.offset(3)] } - } else { - unsafe { [*ptr.offset(3), *ptr.offset(2), *ptr.offset(1), *ptr] } - } -} - -fn from_little_endian(n: [u8; 4]) -> u32 { - use std::mem::transmute; - let mut x: u32 = 0; - let ptr = unsafe { transmute::<*mut u32, *mut u8>(&mut x as *mut u32) }; - if cfg!(target_endian = "little") { - unsafe { - *ptr = n[0]; - *ptr.offset(1) = n[1]; - *ptr.offset(2) = n[2]; - *ptr.offset(3) = n[3]; - } - } else { - unsafe { - *ptr = n[3]; - *ptr.offset(1) = n[2]; - *ptr.offset(2) = n[1]; - *ptr.offset(3) = n[0]; - } - } - x -} - pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { // Do the encode. let mut input_i = 0; let mut output_i = 0; for (offset, c) in input.char_indices() { if (output_i + 3) < output.len() { - let mut code = to_little_endian(c as u32); + let mut code = to_little_endian_u32(c as u32); output[output_i] = code[0]; output[output_i + 1] = code[1]; output[output_i + 2] = code[2]; @@ -82,9 +51,9 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } // Do the decode. - if let Some(code) = - std::char::from_u32(from_little_endian([bytes[0], bytes[1], bytes[2], bytes[3]])) - { + if let Some(code) = core::char::from_u32(from_little_endian_u32([ + bytes[0], bytes[1], bytes[2], bytes[3], + ])) { // Encode to utf8. let mut buf = [0u8; 4]; let s = code.encode_utf8(&mut buf); @@ -106,6 +75,6 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } Ok((input_i, unsafe { - std::str::from_utf8_unchecked(&output[..output_i]) + core::str::from_utf8_unchecked(&output[..output_i]) })) } diff --git a/sub_crates/text_encoding/src/utf8.rs b/sub_crates/text_encoding/src/utf8.rs index 3cd65e9..8757a2a 100644 --- a/sub_crates/text_encoding/src/utf8.rs +++ b/sub_crates/text_encoding/src/utf8.rs @@ -3,7 +3,7 @@ //! is still useful for validating unknown input. And they allow a uniform //! API for all encodings. -use std; +use core; use {DecodeError, DecodeResult, EncodeResult}; pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { @@ -25,7 +25,7 @@ pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<' } pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { - let valid_up_to = match std::str::from_utf8(input) { + let valid_up_to = match core::str::from_utf8(input) { Ok(text) => text.len(), Err(e) => { if e.valid_up_to() > 0 { @@ -40,11 +40,11 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a }; let (in_consumed, out_slice) = encode_from_utf8( - unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) }, + unsafe { core::str::from_utf8_unchecked(&input[..valid_up_to]) }, output, ).unwrap(); Ok((in_consumed, unsafe { - std::str::from_utf8_unchecked(out_slice) + core::str::from_utf8_unchecked(out_slice) })) } diff --git a/sub_crates/text_encoding/src/utils.rs b/sub_crates/text_encoding/src/utils.rs new file mode 100644 index 0000000..9c1efde --- /dev/null +++ b/sub_crates/text_encoding/src/utils.rs @@ -0,0 +1,121 @@ +use core::mem::transmute; + +#[inline(always)] +pub(crate) fn to_big_endian_u16(n: u16) -> [u8; 2] { + let ptr = unsafe { transmute::<*const u16, *const u8>(&n as *const u16) }; + if cfg!(target_endian = "little") { + unsafe { [*ptr.offset(1), *ptr] } + } else { + unsafe { [*ptr, *ptr.offset(1)] } + } +} + +#[inline(always)] +pub(crate) fn from_big_endian_u16(n: [u8; 2]) -> u16 { + let mut x: u16 = 0; + let ptr = unsafe { transmute::<*mut u16, *mut u8>(&mut x as *mut u16) }; + if cfg!(target_endian = "little") { + unsafe { + *ptr = n[1]; + *ptr.offset(1) = n[0]; + } + } else { + unsafe { + *ptr = n[0]; + *ptr.offset(1) = n[1]; + } + } + x +} + +#[inline(always)] +pub(crate) fn to_little_endian_u16(n: u16) -> [u8; 2] { + let ptr = unsafe { transmute::<*const u16, *const u8>(&n as *const u16) }; + if cfg!(target_endian = "little") { + unsafe { [*ptr, *ptr.offset(1)] } + } else { + unsafe { [*ptr.offset(1), *ptr] } + } +} + +#[inline(always)] +pub(crate) fn from_little_endian_u16(n: [u8; 2]) -> u16 { + let mut x: u16 = 0; + let ptr = unsafe { transmute::<*mut u16, *mut u8>(&mut x as *mut u16) }; + if cfg!(target_endian = "little") { + unsafe { + *ptr = n[0]; + *ptr.offset(1) = n[1]; + } + } else { + unsafe { + *ptr = n[1]; + *ptr.offset(1) = n[0]; + } + } + x +} + +#[inline(always)] +pub(crate) fn to_big_endian_u32(n: u32) -> [u8; 4] { + let ptr = unsafe { transmute::<*const u32, *const u8>(&n as *const u32) }; + if cfg!(target_endian = "little") { + unsafe { [*ptr.offset(3), *ptr.offset(2), *ptr.offset(1), *ptr] } + } else { + unsafe { [*ptr, *ptr.offset(1), *ptr.offset(2), *ptr.offset(3)] } + } +} + +#[inline(always)] +pub(crate) fn from_big_endian_u32(n: [u8; 4]) -> u32 { + let mut x: u32 = 0; + let ptr = unsafe { transmute::<*mut u32, *mut u8>(&mut x as *mut u32) }; + if cfg!(target_endian = "little") { + unsafe { + *ptr = n[3]; + *ptr.offset(1) = n[2]; + *ptr.offset(2) = n[1]; + *ptr.offset(3) = n[0]; + } + } else { + unsafe { + *ptr = n[0]; + *ptr.offset(1) = n[1]; + *ptr.offset(2) = n[2]; + *ptr.offset(3) = n[3]; + } + } + x +} + +#[inline(always)] +pub(crate) fn to_little_endian_u32(n: u32) -> [u8; 4] { + let ptr = unsafe { transmute::<*const u32, *const u8>(&n as *const u32) }; + if cfg!(target_endian = "little") { + unsafe { [*ptr, *ptr.offset(1), *ptr.offset(2), *ptr.offset(3)] } + } else { + unsafe { [*ptr.offset(3), *ptr.offset(2), *ptr.offset(1), *ptr] } + } +} + +#[inline(always)] +pub(crate) fn from_little_endian_u32(n: [u8; 4]) -> u32 { + let mut x: u32 = 0; + let ptr = unsafe { transmute::<*mut u32, *mut u8>(&mut x as *mut u32) }; + if cfg!(target_endian = "little") { + unsafe { + *ptr = n[0]; + *ptr.offset(1) = n[1]; + *ptr.offset(2) = n[2]; + *ptr.offset(3) = n[3]; + } + } else { + unsafe { + *ptr = n[3]; + *ptr.offset(1) = n[2]; + *ptr.offset(2) = n[1]; + *ptr.offset(3) = n[0]; + } + } + x +} diff --git a/sub_crates/text_encoding/src/windows1252.rs b/sub_crates/text_encoding/src/windows1252.rs index 3e407e7..9c4b514 100644 --- a/sub_crates/text_encoding/src/windows1252.rs +++ b/sub_crates/text_encoding/src/windows1252.rs @@ -1,6 +1,6 @@ //! Encoding/decoding functions for Windows-1252. -use std; +use core; use {DecodeError, DecodeResult, EncodeError, EncodeResult}; pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { @@ -81,7 +81,7 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } Ok((input_i, unsafe { - std::str::from_utf8_unchecked(&output[..output_i]) + core::str::from_utf8_unchecked(&output[..output_i]) })) }