diff --git a/sub_crates/text_encoding/src/latin1.rs b/sub_crates/text_encoding/src/latin1.rs index d3c8a0d..71df924 100644 --- a/sub_crates/text_encoding/src/latin1.rs +++ b/sub_crates/text_encoding/src/latin1.rs @@ -8,7 +8,7 @@ use std; use {DecodeResult, EncodeError, EncodeResult}; -pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { +pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { // Do the encode. let mut input_i = 0; let mut output_i = 0; @@ -19,8 +19,8 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { if c as u32 > 255 { return Err(EncodeError { character: c, - byte_offset: offset, - bytes_written: output_i, + error_range: (offset, offset + c.len_utf8()), + output_bytes_written: output_i, }); } output[output_i] = c as u8; @@ -38,7 +38,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { } } - Ok((input_i, output_i)) + Ok((input_i, &output[..output_i])) } pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { diff --git a/sub_crates/text_encoding/src/lib.rs b/sub_crates/text_encoding/src/lib.rs index da17fe9..d4d0628 100644 --- a/sub_crates/text_encoding/src/lib.rs +++ b/sub_crates/text_encoding/src/lib.rs @@ -7,7 +7,11 @@ mod utf16_le; mod utf8; /// Encodes text from utf8 to a destination encoding. -pub fn encode_from_utf8(output_encoding: Encoding, input: &str, output: &mut [u8]) -> EncodeResult { +pub fn encode_from_utf8<'a>( + output_encoding: Encoding, + input: &str, + output: &'a mut [u8], +) -> EncodeResult<'a> { match output_encoding { Encoding::Utf8 => utf8::encode_from_utf8(input, output), Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output), @@ -48,9 +52,9 @@ pub enum Encoding { /// Result type for encoding text from utf8 to a target encoding. /// -/// The Ok() variant provides the number of bytes consumed and the -/// number of bytes written, in that order. 
-pub type EncodeResult = Result<(usize, usize), EncodeError>; +/// The Ok() variant provides the number of bytes consumed and a reference +/// to the valid encoded text data. +pub type EncodeResult<'a> = Result<(usize, &'a [u8]), EncodeError>; /// Result type for decoding text from a target encoding to utf8. /// @@ -64,17 +68,17 @@ pub type DecodeResult<'a> = Result<(usize, &'a str), DecodeError>; /// error is encountering a char that is not representable in the target /// encoding. /// -/// The problematic character, the byte offset of that character -/// in the input utf8, and the number of bytes already written to the output -/// buffer is provided. +/// The problematic character, the byte index range of that character in the +/// input utf8, and the number of bytes already written to the output buffer +/// are provided. /// /// It is guaranteed that all input leading up to the problem character has /// already been encoded and written to the output buffer. #[derive(Debug, Copy, Clone)] pub struct EncodeError { pub character: char, - pub byte_offset: usize, - pub bytes_written: usize, + pub error_range: (usize, usize), + pub output_bytes_written: usize, } /// Represents an error when decoding to utf8 from some other format. @@ -84,14 +88,13 @@ pub struct EncodeError { /// input data that are invalid for the text encoding we're attempting /// to decode from. /// -/// The byte offset of the invalid input data and in the number of bytes -/// already written to the output buffer are. -/// already been encoded and written to the output buffer. +/// The byte index range of the invalid input data and the number of bytes +/// already encoded and written to the output buffer are provided. /// /// It is guaranteed that all input leading up to the invalid data has /// already been encoded and written to the output buffer. 
#[derive(Debug, Copy, Clone)] pub struct DecodeError { - pub byte_offset: usize, - pub bytes_written: usize, + pub error_range: (usize, usize), + pub output_bytes_written: usize, } diff --git a/sub_crates/text_encoding/src/utf16_be.rs b/sub_crates/text_encoding/src/utf16_be.rs index 439beb6..152b55f 100644 --- a/sub_crates/text_encoding/src/utf16_be.rs +++ b/sub_crates/text_encoding/src/utf16_be.rs @@ -35,7 +35,7 @@ fn from_big_endian(n: [u8; 2]) -> u16 { x } -pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { +pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { // Do the encode. let mut input_i = 0; let mut output_i = 0; @@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { } } - Ok((input_i, output_i)) + Ok((input_i, &output[..output_i])) } pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { @@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } else if (code_1 & 0xFC00) == 0xDC00 { // Error: orphaned second half of a surrogate pair. return Err(DecodeError { - byte_offset: input_i, - bytes_written: output_i, + error_range: (input_i, input_i + 2), + output_bytes_written: output_i, }); } else { // Two code units. @@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a - if !(code_2 & 0xFC00) == 0xDC00 { + if (code_2 & 0xFC00) != 0xDC00 { // Error: second half is not valid surrogate. 
return Err(DecodeError { - byte_offset: input_i, - bytes_written: output_i, + error_range: (input_i, input_i + 2), + output_bytes_written: output_i, }); } diff --git a/sub_crates/text_encoding/src/utf16_le.rs b/sub_crates/text_encoding/src/utf16_le.rs index 9e235c3..f0860dd 100644 --- a/sub_crates/text_encoding/src/utf16_le.rs +++ b/sub_crates/text_encoding/src/utf16_le.rs @@ -35,7 +35,7 @@ fn from_little_endian(n: [u8; 2]) -> u16 { x } -pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { +pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { // Do the encode. let mut input_i = 0; let mut output_i = 0; @@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { } } - Ok((input_i, output_i)) + Ok((input_i, &output[..output_i])) } pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { @@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a } else if (code_1 & 0xFC00) == 0xDC00 { // Error: orphaned second half of a surrogate pair. return Err(DecodeError { - byte_offset: input_i, - bytes_written: output_i, + error_range: (input_i, input_i + 2), + output_bytes_written: output_i, }); } else { // Two code units. @@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a - if !(code_2 & 0xFC00) == 0xDC00 { + if (code_2 & 0xFC00) != 0xDC00 { // Error: second half is not valid surrogate. 
return Err(DecodeError { - byte_offset: input_i, - bytes_written: output_i, + error_range: (input_i, input_i + 2), + output_bytes_written: output_i, }); } diff --git a/sub_crates/text_encoding/src/utf8.rs b/sub_crates/text_encoding/src/utf8.rs index 404edf8..830b03c 100644 --- a/sub_crates/text_encoding/src/utf8.rs +++ b/sub_crates/text_encoding/src/utf8.rs @@ -7,7 +7,7 @@ use std; use {DecodeError, DecodeResult, EncodeResult}; // Encode from utf8 -pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { +pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { let copy_len = { if output.len() >= input.len() { input.len() @@ -22,7 +22,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult { output[..copy_len].copy_from_slice(input[..copy_len].as_bytes()); - Ok((copy_len, copy_len)) + Ok((copy_len, &output[..copy_len])) } pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { @@ -33,19 +33,19 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a e.valid_up_to() } else { return Err(DecodeError { - byte_offset: 0, - bytes_written: 0, + error_range: (0, 1), // TODO: search for the next starting byte to get the range. + output_bytes_written: 0, }); } } }; - let (in_consumed, out_written) = encode_from_utf8( + let (in_consumed, out_slice) = encode_from_utf8( unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) }, output, ).unwrap(); Ok((in_consumed, unsafe { - std::str::from_utf8_unchecked(&output[..out_written]) + std::str::from_utf8_unchecked(out_slice) })) }