Adjusting APIs for text encoding/decoding.

This commit is contained in:
Nathan Vegdahl 2018-08-19 19:14:13 -07:00
parent 2f5adfad75
commit 3ea4e25fbe
5 changed files with 39 additions and 36 deletions

View File

@ -8,7 +8,7 @@
use std; use std;
use {DecodeResult, EncodeError, EncodeResult}; use {DecodeResult, EncodeError, EncodeResult};
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
// Do the encode. // Do the encode.
let mut input_i = 0; let mut input_i = 0;
let mut output_i = 0; let mut output_i = 0;
@ -19,8 +19,8 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
if c as u32 > 255 { if c as u32 > 255 {
return Err(EncodeError { return Err(EncodeError {
character: c, character: c,
byte_offset: offset, error_range: (offset, offset + c.len_utf8()),
bytes_written: output_i, output_bytes_written: output_i,
}); });
} }
output[output_i] = c as u8; output[output_i] = c as u8;
@ -38,7 +38,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
} }
} }
Ok((input_i, output_i)) Ok((input_i, &output[..output_i]))
} }
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {

View File

@ -7,7 +7,11 @@ mod utf16_le;
mod utf8; mod utf8;
/// Encodes text from utf8 to a destination encoding. /// Encodes text from utf8 to a destination encoding.
pub fn encode_from_utf8(output_encoding: Encoding, input: &str, output: &mut [u8]) -> EncodeResult { pub fn encode_from_utf8<'a>(
output_encoding: Encoding,
input: &str,
output: &'a mut [u8],
) -> EncodeResult<'a> {
match output_encoding { match output_encoding {
Encoding::Utf8 => utf8::encode_from_utf8(input, output), Encoding::Utf8 => utf8::encode_from_utf8(input, output),
Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output), Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
@ -48,9 +52,9 @@ pub enum Encoding {
/// Result type for encoding text from utf8 to a target encoding. /// Result type for encoding text from utf8 to a target encoding.
/// ///
/// The Ok() variant provides the number of bytes consumed and the /// The Ok() variant provides the number of bytes consumed and a reference
/// number of bytes written, in that order. /// to the valid encoded text data.
pub type EncodeResult = Result<(usize, usize), EncodeError>; pub type EncodeResult<'a> = Result<(usize, &'a [u8]), EncodeError>;
/// Result type for decoding text from a target encoding to utf8. /// Result type for decoding text from a target encoding to utf8.
/// ///
@ -64,17 +68,17 @@ pub type DecodeResult<'a> = Result<(usize, &'a str), DecodeError>;
/// error is encountering a char that is not representable in the target /// error is encountering a char that is not representable in the target
/// encoding. /// encoding.
/// ///
/// The problematic character, the byte offset of that character /// The problematic character, the byte index range of that character in the
/// in the input utf8, and the number of bytes already written to the output /// input utf8, and the number of bytes already written to the output buffer
/// buffer is provided. /// are provided.
/// ///
/// It is guaranteed that all input leading up to the problem character has /// It is guaranteed that all input leading up to the problem character has
/// already been encoded and written to the output buffer. /// already been encoded and written to the output buffer.
#[derive(Debug, Copy, Clone)] #[derive(Debug, Copy, Clone)]
pub struct EncodeError { pub struct EncodeError {
pub character: char, pub character: char,
pub byte_offset: usize, pub error_range: (usize, usize),
pub bytes_written: usize, pub output_bytes_written: usize,
} }
/// Represents an error when decoding to utf8 from some other format. /// Represents an error when decoding to utf8 from some other format.
@ -84,14 +88,13 @@ pub struct EncodeError {
/// input data that are invalid for the text encoding we're attempting /// input data that are invalid for the text encoding we're attempting
/// to decode from. /// to decode from.
/// ///
/// The byte offset of the invalid input data and in the number of bytes /// The byte index range of the invalid input data and the number of bytes
/// already written to the output buffer are. /// already encoded and written to the output buffer are provided.
/// already been encoded and written to the output buffer.
/// ///
/// It is guaranteed that all input leading up to the invalid data has /// It is guaranteed that all input leading up to the invalid data has
/// already been encoded and written to the output buffer. /// already been encoded and written to the output buffer.
#[derive(Debug, Copy, Clone)] #[derive(Debug, Copy, Clone)]
pub struct DecodeError { pub struct DecodeError {
pub byte_offset: usize, pub error_range: (usize, usize),
pub bytes_written: usize, pub output_bytes_written: usize,
} }

View File

@ -35,7 +35,7 @@ fn from_big_endian(n: [u8; 2]) -> u16 {
x x
} }
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
// Do the encode. // Do the encode.
let mut input_i = 0; let mut input_i = 0;
let mut output_i = 0; let mut output_i = 0;
@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
} }
} }
Ok((input_i, output_i)) Ok((input_i, &output[..output_i]))
} }
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
} else if (code_1 & 0xFC00) == 0xDC00 { } else if (code_1 & 0xFC00) == 0xDC00 {
// Error: orphaned second half of a surrogate pair. // Error: orphaned second half of a surrogate pair.
return Err(DecodeError { return Err(DecodeError {
byte_offset: input_i, error_range: (input_i, input_i + 2),
bytes_written: output_i, output_bytes_written: output_i,
}); });
} else { } else {
// Two code units. // Two code units.
@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
if !(code_2 & 0xFC00) == 0xDC00 { if !(code_2 & 0xFC00) == 0xDC00 {
// Error: second half is not valid surrogate. // Error: second half is not valid surrogate.
return Err(DecodeError { return Err(DecodeError {
byte_offset: input_i, error_range: (input_i, input_i + 2),
bytes_written: output_i, output_bytes_written: output_i,
}); });
} }

View File

@ -35,7 +35,7 @@ fn from_little_endian(n: [u8; 2]) -> u16 {
x x
} }
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
// Do the encode. // Do the encode.
let mut input_i = 0; let mut input_i = 0;
let mut output_i = 0; let mut output_i = 0;
@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
} }
} }
Ok((input_i, output_i)) Ok((input_i, &output[..output_i]))
} }
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
} else if (code_1 & 0xFC00) == 0xDC00 { } else if (code_1 & 0xFC00) == 0xDC00 {
// Error: orphaned second half of a surrogate pair. // Error: orphaned second half of a surrogate pair.
return Err(DecodeError { return Err(DecodeError {
byte_offset: input_i, error_range: (input_i, input_i + 2),
bytes_written: output_i, output_bytes_written: output_i,
}); });
} else { } else {
// Two code units. // Two code units.
@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
if !(code_2 & 0xFC00) == 0xDC00 { if !(code_2 & 0xFC00) == 0xDC00 {
// Error: second half is not valid surrogate. // Error: second half is not valid surrogate.
return Err(DecodeError { return Err(DecodeError {
byte_offset: input_i, error_range: (input_i, input_i + 2),
bytes_written: output_i, output_bytes_written: output_i,
}); });
} }

View File

@ -7,7 +7,7 @@ use std;
use {DecodeError, DecodeResult, EncodeResult}; use {DecodeError, DecodeResult, EncodeResult};
// Encode from utf8 // Encode from utf8
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
let copy_len = { let copy_len = {
if output.len() >= input.len() { if output.len() >= input.len() {
input.len() input.len()
@ -22,7 +22,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
output[..copy_len].copy_from_slice(input[..copy_len].as_bytes()); output[..copy_len].copy_from_slice(input[..copy_len].as_bytes());
Ok((copy_len, copy_len)) Ok((copy_len, &output[..copy_len]))
} }
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
@ -33,19 +33,19 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
e.valid_up_to() e.valid_up_to()
} else { } else {
return Err(DecodeError { return Err(DecodeError {
byte_offset: 0, error_range: (0, 1), // TODO: search for the next starting byte to get the range.
bytes_written: 0, output_bytes_written: 0,
}); });
} }
} }
}; };
let (in_consumed, out_written) = encode_from_utf8( let (in_consumed, out_slice) = encode_from_utf8(
unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) }, unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) },
output, output,
).unwrap(); ).unwrap();
Ok((in_consumed, unsafe { Ok((in_consumed, unsafe {
std::str::from_utf8_unchecked(&output[..out_written]) std::str::from_utf8_unchecked(out_slice)
})) }))
} }