Adjusting APIs for text encoding/decoding.
This commit is contained in:
parent
2f5adfad75
commit
3ea4e25fbe
|
@ -8,7 +8,7 @@
|
||||||
use std;
|
use std;
|
||||||
use {DecodeResult, EncodeError, EncodeResult};
|
use {DecodeResult, EncodeError, EncodeResult};
|
||||||
|
|
||||||
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||||
// Do the encode.
|
// Do the encode.
|
||||||
let mut input_i = 0;
|
let mut input_i = 0;
|
||||||
let mut output_i = 0;
|
let mut output_i = 0;
|
||||||
|
@ -19,8 +19,8 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
||||||
if c as u32 > 255 {
|
if c as u32 > 255 {
|
||||||
return Err(EncodeError {
|
return Err(EncodeError {
|
||||||
character: c,
|
character: c,
|
||||||
byte_offset: offset,
|
error_range: (offset, offset + c.len_utf8()),
|
||||||
bytes_written: output_i,
|
output_bytes_written: output_i,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
output[output_i] = c as u8;
|
output[output_i] = c as u8;
|
||||||
|
@ -38,7 +38,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((input_i, output_i))
|
Ok((input_i, &output[..output_i]))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
||||||
|
|
|
@ -7,7 +7,11 @@ mod utf16_le;
|
||||||
mod utf8;
|
mod utf8;
|
||||||
|
|
||||||
/// Encodes text from utf8 to a destination encoding.
|
/// Encodes text from utf8 to a destination encoding.
|
||||||
pub fn encode_from_utf8(output_encoding: Encoding, input: &str, output: &mut [u8]) -> EncodeResult {
|
pub fn encode_from_utf8<'a>(
|
||||||
|
output_encoding: Encoding,
|
||||||
|
input: &str,
|
||||||
|
output: &'a mut [u8],
|
||||||
|
) -> EncodeResult<'a> {
|
||||||
match output_encoding {
|
match output_encoding {
|
||||||
Encoding::Utf8 => utf8::encode_from_utf8(input, output),
|
Encoding::Utf8 => utf8::encode_from_utf8(input, output),
|
||||||
Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
|
Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
|
||||||
|
@ -48,9 +52,9 @@ pub enum Encoding {
|
||||||
|
|
||||||
/// Result type for encoding text from utf8 to a target encoding.
|
/// Result type for encoding text from utf8 to a target encoding.
|
||||||
///
|
///
|
||||||
/// The Ok() variant provides the number of bytes consumed and the
|
/// The Ok() variant provides the number of bytes consumed and a reference
|
||||||
/// number of bytes written, in that order.
|
/// to the valid encoded text data.
|
||||||
pub type EncodeResult = Result<(usize, usize), EncodeError>;
|
pub type EncodeResult<'a> = Result<(usize, &'a [u8]), EncodeError>;
|
||||||
|
|
||||||
/// Result type for decoding text from a target encoding to utf8.
|
/// Result type for decoding text from a target encoding to utf8.
|
||||||
///
|
///
|
||||||
|
@ -64,17 +68,17 @@ pub type DecodeResult<'a> = Result<(usize, &'a str), DecodeError>;
|
||||||
/// error is encountering a char that is not representable in the target
|
/// error is encountering a char that is not representable in the target
|
||||||
/// encoding.
|
/// encoding.
|
||||||
///
|
///
|
||||||
/// The problematic character, the byte offset of that character
|
/// The problematic character, the byte index range of that character in the
|
||||||
/// in the input utf8, and the number of bytes already written to the output
|
/// input utf8, and the number of bytes already written to the output buffer
|
||||||
/// buffer is provided.
|
/// are provided.
|
||||||
///
|
///
|
||||||
/// It is guaranteed that all input leading up to the problem character has
|
/// It is guaranteed that all input leading up to the problem character has
|
||||||
/// already been encoded and written to the output buffer.
|
/// already been encoded and written to the output buffer.
|
||||||
#[derive(Debug, Copy, Clone)]
|
#[derive(Debug, Copy, Clone)]
|
||||||
pub struct EncodeError {
|
pub struct EncodeError {
|
||||||
pub character: char,
|
pub character: char,
|
||||||
pub byte_offset: usize,
|
pub error_range: (usize, usize),
|
||||||
pub bytes_written: usize,
|
pub output_bytes_written: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Represents an error when decoding to utf8 from some other format.
|
/// Represents an error when decoding to utf8 from some other format.
|
||||||
|
@ -84,14 +88,13 @@ pub struct EncodeError {
|
||||||
/// input data that are invalid for the text encoding we're attempting
|
/// input data that are invalid for the text encoding we're attempting
|
||||||
/// to decode from.
|
/// to decode from.
|
||||||
///
|
///
|
||||||
/// The byte offset of the invalid input data and in the number of bytes
|
/// The byte index range of the invalid input data and the number of bytes
|
||||||
/// already written to the output buffer are.
|
/// already decoded and written to the output buffer are provided.
|
||||||
/// already been encoded and written to the output buffer.
|
|
||||||
///
|
///
|
||||||
/// It is guaranteed that all input leading up to the invalid data has
|
/// It is guaranteed that all input leading up to the invalid data has
|
||||||
/// already been encoded and written to the output buffer.
|
/// already been decoded and written to the output buffer.
|
||||||
#[derive(Debug, Copy, Clone)]
|
#[derive(Debug, Copy, Clone)]
|
||||||
pub struct DecodeError {
|
pub struct DecodeError {
|
||||||
pub byte_offset: usize,
|
pub error_range: (usize, usize),
|
||||||
pub bytes_written: usize,
|
pub output_bytes_written: usize,
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,7 +35,7 @@ fn from_big_endian(n: [u8; 2]) -> u16 {
|
||||||
x
|
x
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||||
// Do the encode.
|
// Do the encode.
|
||||||
let mut input_i = 0;
|
let mut input_i = 0;
|
||||||
let mut output_i = 0;
|
let mut output_i = 0;
|
||||||
|
@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((input_i, output_i))
|
Ok((input_i, &output[..output_i]))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
||||||
|
@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
|
||||||
} else if (code_1 & 0xFC00) == 0xDC00 {
|
} else if (code_1 & 0xFC00) == 0xDC00 {
|
||||||
// Error: orphaned second half of a surrogate pair.
|
// Error: orphaned second half of a surrogate pair.
|
||||||
return Err(DecodeError {
|
return Err(DecodeError {
|
||||||
byte_offset: input_i,
|
error_range: (input_i, input_i + 2),
|
||||||
bytes_written: output_i,
|
output_bytes_written: output_i,
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
// Two code units.
|
// Two code units.
|
||||||
|
@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
|
||||||
if !(code_2 & 0xFC00) == 0xDC00 {
|
if !(code_2 & 0xFC00) == 0xDC00 {
|
||||||
// Error: second half is not valid surrogate.
|
// Error: second half is not valid surrogate.
|
||||||
return Err(DecodeError {
|
return Err(DecodeError {
|
||||||
byte_offset: input_i,
|
error_range: (input_i, input_i + 2),
|
||||||
bytes_written: output_i,
|
output_bytes_written: output_i,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ fn from_little_endian(n: [u8; 2]) -> u16 {
|
||||||
x
|
x
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||||
// Do the encode.
|
// Do the encode.
|
||||||
let mut input_i = 0;
|
let mut input_i = 0;
|
||||||
let mut output_i = 0;
|
let mut output_i = 0;
|
||||||
|
@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((input_i, output_i))
|
Ok((input_i, &output[..output_i]))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
||||||
|
@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
|
||||||
} else if (code_1 & 0xFC00) == 0xDC00 {
|
} else if (code_1 & 0xFC00) == 0xDC00 {
|
||||||
// Error: orphaned second half of a surrogate pair.
|
// Error: orphaned second half of a surrogate pair.
|
||||||
return Err(DecodeError {
|
return Err(DecodeError {
|
||||||
byte_offset: input_i,
|
error_range: (input_i, input_i + 2),
|
||||||
bytes_written: output_i,
|
output_bytes_written: output_i,
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
// Two code units.
|
// Two code units.
|
||||||
|
@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
|
||||||
if !(code_2 & 0xFC00) == 0xDC00 {
|
if !(code_2 & 0xFC00) == 0xDC00 {
|
||||||
// Error: second half is not valid surrogate.
|
// Error: second half is not valid surrogate.
|
||||||
return Err(DecodeError {
|
return Err(DecodeError {
|
||||||
byte_offset: input_i,
|
error_range: (input_i, input_i + 2),
|
||||||
bytes_written: output_i,
|
output_bytes_written: output_i,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,7 @@ use std;
|
||||||
use {DecodeError, DecodeResult, EncodeResult};
|
use {DecodeError, DecodeResult, EncodeResult};
|
||||||
|
|
||||||
// Encode from utf8
|
// Encode from utf8
|
||||||
pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||||
let copy_len = {
|
let copy_len = {
|
||||||
if output.len() >= input.len() {
|
if output.len() >= input.len() {
|
||||||
input.len()
|
input.len()
|
||||||
|
@ -22,7 +22,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
|
||||||
|
|
||||||
output[..copy_len].copy_from_slice(input[..copy_len].as_bytes());
|
output[..copy_len].copy_from_slice(input[..copy_len].as_bytes());
|
||||||
|
|
||||||
Ok((copy_len, copy_len))
|
Ok((copy_len, &output[..copy_len]))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
||||||
|
@ -33,19 +33,19 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
|
||||||
e.valid_up_to()
|
e.valid_up_to()
|
||||||
} else {
|
} else {
|
||||||
return Err(DecodeError {
|
return Err(DecodeError {
|
||||||
byte_offset: 0,
|
error_range: (0, 1), // TODO: search for the next starting byte to get the range.
|
||||||
bytes_written: 0,
|
output_bytes_written: 0,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let (in_consumed, out_written) = encode_from_utf8(
|
let (in_consumed, out_slice) = encode_from_utf8(
|
||||||
unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) },
|
unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) },
|
||||||
output,
|
output,
|
||||||
).unwrap();
|
).unwrap();
|
||||||
|
|
||||||
Ok((in_consumed, unsafe {
|
Ok((in_consumed, unsafe {
|
||||||
std::str::from_utf8_unchecked(&output[..out_written])
|
std::str::from_utf8_unchecked(out_slice)
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user