Adjusting APIs for text encoding/decoding.

This commit is contained in:
Nathan Vegdahl 2018-08-19 19:14:13 -07:00
parent 2f5adfad75
commit 3ea4e25fbe
5 changed files with 39 additions and 36 deletions

View File

@ -8,7 +8,7 @@
use std;
use {DecodeResult, EncodeError, EncodeResult};
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
// Do the encode.
let mut input_i = 0;
let mut output_i = 0;
@ -19,8 +19,8 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
if c as u32 > 255 {
return Err(EncodeError {
character: c,
byte_offset: offset,
bytes_written: output_i,
error_range: (offset, offset + c.len_utf8()),
output_bytes_written: output_i,
});
}
output[output_i] = c as u8;
@ -38,7 +38,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
}
}
Ok((input_i, output_i))
Ok((input_i, &output[..output_i]))
}
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {

View File

@ -7,7 +7,11 @@ mod utf16_le;
mod utf8;
/// Encodes text from utf8 to a destination encoding.
pub fn encode_from_utf8(output_encoding: Encoding, input: &str, output: &mut [u8]) -> EncodeResult {
pub fn encode_from_utf8<'a>(
output_encoding: Encoding,
input: &str,
output: &'a mut [u8],
) -> EncodeResult<'a> {
match output_encoding {
Encoding::Utf8 => utf8::encode_from_utf8(input, output),
Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
@ -48,9 +52,9 @@ pub enum Encoding {
/// Result type for encoding text from utf8 to a target encoding.
///
/// The Ok() variant provides the number of bytes consumed and the
/// number of bytes written, in that order.
pub type EncodeResult = Result<(usize, usize), EncodeError>;
/// The Ok() variant provides the number of bytes consumed and a reference
/// to the valid encoded text data.
pub type EncodeResult<'a> = Result<(usize, &'a [u8]), EncodeError>;
/// Result type for decoding text from a target encoding to utf8.
///
@ -64,17 +68,17 @@ pub type DecodeResult<'a> = Result<(usize, &'a str), DecodeError>;
/// error is encountering a char that is not representable in the target
/// encoding.
///
/// The problematic character, the byte offset of that character
/// in the input utf8, and the number of bytes already written to the output
/// buffer is provided.
/// The problematic character, the byte index range of that character in the
/// input utf8, and the number of bytes already written to the output buffer
/// are provided.
///
/// It is guaranteed that all input leading up to the problem character has
/// already been encoded and written to the output buffer.
#[derive(Debug, Copy, Clone)]
pub struct EncodeError {
pub character: char,
pub byte_offset: usize,
pub bytes_written: usize,
pub error_range: (usize, usize),
pub output_bytes_written: usize,
}
/// Represents an error when decoding to utf8 from some other format.
@ -84,14 +88,13 @@ pub struct EncodeError {
/// input data that are invalid for the text encoding we're attempting
/// to decode from.
///
/// The byte offset of the invalid input data and in the number of bytes
/// already written to the output buffer are.
/// already been encoded and written to the output buffer.
/// The byte index range of the invalid input data and the number of bytes
/// already decoded and written to the output buffer are provided.
///
/// It is guaranteed that all input leading up to the invalid data has
/// already been decoded and written to the output buffer.
#[derive(Debug, Copy, Clone)]
pub struct DecodeError {
pub byte_offset: usize,
pub bytes_written: usize,
pub error_range: (usize, usize),
pub output_bytes_written: usize,
}

View File

@ -35,7 +35,7 @@ fn from_big_endian(n: [u8; 2]) -> u16 {
x
}
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
// Do the encode.
let mut input_i = 0;
let mut output_i = 0;
@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
}
}
Ok((input_i, output_i))
Ok((input_i, &output[..output_i]))
}
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
} else if (code_1 & 0xFC00) == 0xDC00 {
// Error: orphaned second half of a surrogate pair.
return Err(DecodeError {
byte_offset: input_i,
bytes_written: output_i,
error_range: (input_i, input_i + 2),
output_bytes_written: output_i,
});
} else {
// Two code units.
@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
if !(code_2 & 0xFC00) == 0xDC00 {
// Error: second half is not valid surrogate.
return Err(DecodeError {
byte_offset: input_i,
bytes_written: output_i,
error_range: (input_i, input_i + 2),
output_bytes_written: output_i,
});
}

View File

@ -35,7 +35,7 @@ fn from_little_endian(n: [u8; 2]) -> u16 {
x
}
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
// Do the encode.
let mut input_i = 0;
let mut output_i = 0;
@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
}
}
Ok((input_i, output_i))
Ok((input_i, &output[..output_i]))
}
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
} else if (code_1 & 0xFC00) == 0xDC00 {
// Error: orphaned second half of a surrogate pair.
return Err(DecodeError {
byte_offset: input_i,
bytes_written: output_i,
error_range: (input_i, input_i + 2),
output_bytes_written: output_i,
});
} else {
// Two code units.
@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
if !(code_2 & 0xFC00) == 0xDC00 {
// Error: second half is not valid surrogate.
return Err(DecodeError {
byte_offset: input_i,
bytes_written: output_i,
error_range: (input_i, input_i + 2),
output_bytes_written: output_i,
});
}

View File

@ -7,7 +7,7 @@ use std;
use {DecodeError, DecodeResult, EncodeResult};
// Encode from utf8
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
let copy_len = {
if output.len() >= input.len() {
input.len()
@ -22,7 +22,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
output[..copy_len].copy_from_slice(input[..copy_len].as_bytes());
Ok((copy_len, copy_len))
Ok((copy_len, &output[..copy_len]))
}
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
@ -33,19 +33,19 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
e.valid_up_to()
} else {
return Err(DecodeError {
byte_offset: 0,
bytes_written: 0,
error_range: (0, 1), // TODO: search for the next starting byte to get the range.
output_bytes_written: 0,
});
}
}
};
let (in_consumed, out_written) = encode_from_utf8(
let (in_consumed, out_slice) = encode_from_utf8(
unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) },
output,
).unwrap();
Ok((in_consumed, unsafe {
std::str::from_utf8_unchecked(&output[..out_written])
std::str::from_utf8_unchecked(out_slice)
}))
}