Adjusting APIs for text encoding/decoding.
This commit is contained in:
parent
2f5adfad75
commit
3ea4e25fbe
|
@ -8,7 +8,7 @@
|
|||
use std;
|
||||
use {DecodeResult, EncodeError, EncodeResult};
|
||||
|
||||
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
||||
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||
// Do the encode.
|
||||
let mut input_i = 0;
|
||||
let mut output_i = 0;
|
||||
|
@ -19,8 +19,8 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
|||
if c as u32 > 255 {
|
||||
return Err(EncodeError {
|
||||
character: c,
|
||||
byte_offset: offset,
|
||||
bytes_written: output_i,
|
||||
error_range: (offset, offset + c.len_utf8()),
|
||||
output_bytes_written: output_i,
|
||||
});
|
||||
}
|
||||
output[output_i] = c as u8;
|
||||
|
@ -38,7 +38,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
|||
}
|
||||
}
|
||||
|
||||
Ok((input_i, output_i))
|
||||
Ok((input_i, &output[..output_i]))
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
||||
|
|
|
@ -7,7 +7,11 @@ mod utf16_le;
|
|||
mod utf8;
|
||||
|
||||
/// Encodes text from utf8 to a destination encoding.
|
||||
pub fn encode_from_utf8(output_encoding: Encoding, input: &str, output: &mut [u8]) -> EncodeResult {
|
||||
pub fn encode_from_utf8<'a>(
|
||||
output_encoding: Encoding,
|
||||
input: &str,
|
||||
output: &'a mut [u8],
|
||||
) -> EncodeResult<'a> {
|
||||
match output_encoding {
|
||||
Encoding::Utf8 => utf8::encode_from_utf8(input, output),
|
||||
Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
|
||||
|
@ -48,9 +52,9 @@ pub enum Encoding {
|
|||
|
||||
/// Result type for encoding text from utf8 to a target encoding.
|
||||
///
|
||||
/// The Ok() variant provides the number of bytes consumed and the
|
||||
/// number of bytes written, in that order.
|
||||
pub type EncodeResult = Result<(usize, usize), EncodeError>;
|
||||
/// The Ok() variant provides the number of bytes consumed and a reference
|
||||
/// to the valid encoded text data.
|
||||
pub type EncodeResult<'a> = Result<(usize, &'a [u8]), EncodeError>;
|
||||
|
||||
/// Result type for decoding text from a target encoding to utf8.
|
||||
///
|
||||
|
@ -64,17 +68,17 @@ pub type DecodeResult<'a> = Result<(usize, &'a str), DecodeError>;
|
|||
/// error is encountering a char that is not representable in the target
|
||||
/// encoding.
|
||||
///
|
||||
/// The problematic character, the byte offset of that character
|
||||
/// in the input utf8, and the number of bytes already written to the output
|
||||
/// buffer is provided.
|
||||
/// The problematic character, the byte index range of that character in the
|
||||
/// input utf8, and the number of bytes already written to the output buffer
|
||||
/// are provided.
|
||||
///
|
||||
/// It is guaranteed that all input leading up to the problem character has
|
||||
/// already been encoded and written to the output buffer.
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct EncodeError {
|
||||
pub character: char,
|
||||
pub byte_offset: usize,
|
||||
pub bytes_written: usize,
|
||||
pub error_range: (usize, usize),
|
||||
pub output_bytes_written: usize,
|
||||
}
|
||||
|
||||
/// Represents an error when decoding to utf8 from some other format.
|
||||
|
@ -84,14 +88,13 @@ pub struct EncodeError {
|
|||
/// input data that are invalid for the text encoding we're attempting
|
||||
/// to decode from.
|
||||
///
|
||||
/// The byte offset of the invalid input data and in the number of bytes
|
||||
/// already written to the output buffer are.
|
||||
/// already been encoded and written to the output buffer.
|
||||
/// The byte index range of the invalid input data and the number of bytes
|
||||
/// already decoded and written to the output buffer are provided.
|
||||
///
|
||||
/// It is guaranteed that all input leading up to the invalid data has
|
||||
/// already been decoded and written to the output buffer.
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct DecodeError {
|
||||
pub byte_offset: usize,
|
||||
pub bytes_written: usize,
|
||||
pub error_range: (usize, usize),
|
||||
pub output_bytes_written: usize,
|
||||
}
|
||||
|
|
|
@ -35,7 +35,7 @@ fn from_big_endian(n: [u8; 2]) -> u16 {
|
|||
x
|
||||
}
|
||||
|
||||
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
||||
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||
// Do the encode.
|
||||
let mut input_i = 0;
|
||||
let mut output_i = 0;
|
||||
|
@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
|||
}
|
||||
}
|
||||
|
||||
Ok((input_i, output_i))
|
||||
Ok((input_i, &output[..output_i]))
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
||||
|
@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
|
|||
} else if (code_1 & 0xFC00) == 0xDC00 {
|
||||
// Error: orphaned second half of a surrogate pair.
|
||||
return Err(DecodeError {
|
||||
byte_offset: input_i,
|
||||
bytes_written: output_i,
|
||||
error_range: (input_i, input_i + 2),
|
||||
output_bytes_written: output_i,
|
||||
});
|
||||
} else {
|
||||
// Two code units.
|
||||
|
@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
|
|||
if !(code_2 & 0xFC00) == 0xDC00 {
|
||||
// Error: second half is not valid surrogate.
|
||||
return Err(DecodeError {
|
||||
byte_offset: input_i,
|
||||
bytes_written: output_i,
|
||||
error_range: (input_i, input_i + 2),
|
||||
output_bytes_written: output_i,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ fn from_little_endian(n: [u8; 2]) -> u16 {
|
|||
x
|
||||
}
|
||||
|
||||
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
||||
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||
// Do the encode.
|
||||
let mut input_i = 0;
|
||||
let mut output_i = 0;
|
||||
|
@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
|||
}
|
||||
}
|
||||
|
||||
Ok((input_i, output_i))
|
||||
Ok((input_i, &output[..output_i]))
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
||||
|
@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
|
|||
} else if (code_1 & 0xFC00) == 0xDC00 {
|
||||
// Error: orphaned second half of a surrogate pair.
|
||||
return Err(DecodeError {
|
||||
byte_offset: input_i,
|
||||
bytes_written: output_i,
|
||||
error_range: (input_i, input_i + 2),
|
||||
output_bytes_written: output_i,
|
||||
});
|
||||
} else {
|
||||
// Two code units.
|
||||
|
@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
|
|||
if !(code_2 & 0xFC00) == 0xDC00 {
|
||||
// Error: second half is not valid surrogate.
|
||||
return Err(DecodeError {
|
||||
byte_offset: input_i,
|
||||
bytes_written: output_i,
|
||||
error_range: (input_i, input_i + 2),
|
||||
output_bytes_written: output_i,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ use std;
|
|||
use {DecodeError, DecodeResult, EncodeResult};
|
||||
|
||||
// Encode from utf8
|
||||
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
||||
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||
let copy_len = {
|
||||
if output.len() >= input.len() {
|
||||
input.len()
|
||||
|
@ -22,7 +22,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
|||
|
||||
output[..copy_len].copy_from_slice(input[..copy_len].as_bytes());
|
||||
|
||||
Ok((copy_len, copy_len))
|
||||
Ok((copy_len, &output[..copy_len]))
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
||||
|
@ -33,19 +33,19 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
|
|||
e.valid_up_to()
|
||||
} else {
|
||||
return Err(DecodeError {
|
||||
byte_offset: 0,
|
||||
bytes_written: 0,
|
||||
error_range: (0, 1), // TODO: search for the next starting byte to get the range.
|
||||
output_bytes_written: 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let (in_consumed, out_written) = encode_from_utf8(
|
||||
let (in_consumed, out_slice) = encode_from_utf8(
|
||||
unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) },
|
||||
output,
|
||||
).unwrap();
|
||||
|
||||
Ok((in_consumed, unsafe {
|
||||
std::str::from_utf8_unchecked(&output[..out_written])
|
||||
std::str::from_utf8_unchecked(out_slice)
|
||||
}))
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user