Tests for utf8 encoding/decoding, and improved utf8 errors.

This commit is contained in:
Nathan Vegdahl 2018-08-22 17:53:07 -07:00
parent 006b578318
commit 9409bfe7fb

View File

@@ -7,44 +7,332 @@ use core;
use {DecodeError, DecodeResult, EncodeResult};
pub fn encode_from_str<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
let copy_len = {
if output.len() >= input.len() {
input.len()
} else {
let mut i = output.len();
while !input.is_char_boundary(i) {
i -= 1;
}
i
}
};
output[..copy_len].copy_from_slice(input[..copy_len].as_bytes());
Ok((copy_len, &output[..copy_len]))
let cl = copy_len(input.as_bytes(), output.len());
output[..cl].copy_from_slice(input[..cl].as_bytes());
Ok((cl, &output[..cl]))
}
pub fn decode_to_str<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
// Find how much of the data is valid utf8.
let valid_up_to = match core::str::from_utf8(input) {
Ok(text) => text.len(),
Err(e) => {
if e.valid_up_to() > 0 {
e.valid_up_to()
} else {
return Err(DecodeError {
error_range: (0, 1), // TODO: search for the next starting byte to get the range.
output_bytes_written: 0,
});
}
}
Err(e) => e.valid_up_to(),
};
let (in_consumed, out_slice) = encode_from_str(
unsafe { core::str::from_utf8_unchecked(&input[..valid_up_to]) },
output,
).unwrap();
// Copy over what we can.
let bytes_copied = copy_len(&input[..valid_up_to], output.len());
output[..bytes_copied].copy_from_slice(&input[..bytes_copied]);
Ok((in_consumed, unsafe {
core::str::from_utf8_unchecked(out_slice)
// Determine if there's an error.
if bytes_copied < output.len() && bytes_copied == valid_up_to && valid_up_to < input.len() {
let trailing_bytes = input.len() - valid_up_to;
let byte = input[valid_up_to];
// First we check if we're truncated. If we are, then don't error
// yet, because we want to provide the full byte range of the error.
let is_truncated = ((byte & 0b11100000) == 0b11000000 && trailing_bytes < 2)
|| ((byte & 0b11110000) == 0b11100000 && trailing_bytes < 3)
|| ((byte & 0b11111000) == 0b11110000 && trailing_bytes < 4);
if !is_truncated {
// Find the byte range of the error by finding the next valid
// starting byte (or reaching end of input).
let mut i = valid_up_to + 1;
while i < input.len()
&& ((input[i] & 0b11000000) == 0b10000000 || (input[i] & 0b11111000) == 0b11111000)
{
i += 1;
}
// Return the error.
return Err(DecodeError {
error_range: (valid_up_to, i),
output_bytes_written: bytes_copied,
});
}
}
// No error, return success.
Ok((bytes_copied, unsafe {
core::str::from_utf8_unchecked(&output[..bytes_copied])
}))
}
/// Returns the number of bytes of `input` that may be copied into a buffer
/// of `output_len` bytes without cutting a multi-byte code point in half.
///
/// When everything fits, that is simply `input.len()`.  Otherwise the
/// length is `output_len` rounded down to the nearest code point boundary,
/// i.e. the nearest index at or below `output_len` whose byte is not a
/// UTF-8 continuation byte (`0b10xxxxxx`), or zero if there is none.
///
/// `input` is expected to be valid, complete utf8 (something that could be
/// turned directly into a `&str`).
#[inline(always)]
fn copy_len(input: &[u8], output_len: usize) -> usize {
    // Everything fits: copy it all.
    if input.len() <= output_len {
        return input.len();
    }

    // `output_len < input.len()` here, so the inclusive slice is in bounds.
    // Search backwards from `output_len` for the start of a code point.
    input[..=output_len]
        .iter()
        .rposition(|&byte| (byte & 0b1100_0000) != 0b1000_0000)
        .unwrap_or(0)
}
#[cfg(test)]
mod tests {
    //! Tests for `encode_from_str` and `decode_to_str`.
    //!
    //! Most tests are built around the text "こんにちは!", whose UTF-8
    //! encoding consists of six 3-byte code points.  The encode/decode tests
    //! check that output buffers too small for a whole code point never
    //! receive a partial one, and the `decode_error_*` tests check the
    //! reported byte range and written-byte count for various kinds of
    //! invalid input.

    use super::*;

    /// Output buffer smaller than the first code point: nothing is encoded.
    #[test]
    fn encode_01() {
        let text = "こんにちは!";
        let mut buf = [0u8; 2];
        let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap();
        assert_eq!(consumed_count, 0);
        assert_eq!(encoded, &[]);
    }

    /// Output buffer exactly the size of the first code point.
    #[test]
    fn encode_02() {
        let text = "こんにちは!";
        let mut buf = [0u8; 3];
        let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap();
        assert_eq!(consumed_count, 3);
        assert_eq!(encoded, &[0xE3, 0x81, 0x93]);
    }

    /// Output buffer with room for one and two-thirds code points: only the
    /// first whole code point is encoded.
    #[test]
    fn encode_03() {
        let text = "こんにちは!";
        let mut buf = [0u8; 5];
        let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap();
        assert_eq!(consumed_count, 3);
        assert_eq!(encoded, &[0xE3, 0x81, 0x93]);
    }

    /// Output buffer smaller than the first code point: nothing is decoded.
    #[test]
    fn decode_01() {
        let data = [
            0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81,
            0xAF, 0xEF, 0xBC, 0x81,
        ]; // "こんにちは!"
        let mut buf = [0u8; 2];
        let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap();
        assert_eq!(consumed_count, 0);
        assert_eq!(decoded, "");
    }

    /// Output buffer exactly the size of the first code point: that code
    /// point is consumed and returned.
    #[test]
    fn decode_02() {
        let data = [
            0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81,
            0xAF, 0xEF, 0xBC, 0x81,
        ]; // "こんにちは!"
        let mut buf = [0u8; 3];
        let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap();
        assert_eq!(consumed_count, 3);
        assert_eq!(decoded, "こ");
    }

    /// Output buffer with room for one and two-thirds code points: only the
    /// first whole code point is consumed and returned.
    #[test]
    fn decode_03() {
        let data = [
            0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81,
            0xAF, 0xEF, 0xBC, 0x81,
        ]; // "こんにちは!"
        let mut buf = [0u8; 5];
        let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap();
        assert_eq!(consumed_count, 3);
        assert_eq!(decoded, "こ");
    }

    /// Input truncated mid code point (one byte missing) is not an error;
    /// everything before the incomplete code point is decoded.
    #[test]
    fn decode_04() {
        let data = [
            0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81,
            0xAF, 0xEF, 0xBC,
        ]; // "こんにちは!" with last byte chopped off.
        let mut buf = [0u8; 64];
        let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap();
        assert_eq!(consumed_count, 15);
        assert_eq!(decoded, "こんにちは");
    }

    /// Input truncated mid code point (two bytes missing) is not an error;
    /// everything before the incomplete code point is decoded.
    #[test]
    fn decode_05() {
        let data = [
            0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81,
            0xAF, 0xEF,
        ]; // "こんにちは!" with last 2 bytes chopped off.
        let mut buf = [0u8; 64];
        let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap();
        assert_eq!(consumed_count, 15);
        assert_eq!(decoded, "こんにちは");
    }

    #[test]
    fn decode_error_01() {
        let data = [
            0b10000000, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3,
            0x81, 0xAF, 0xEF, 0xBC, 0x81,
        ]; // "こんにちは!" with an error on the first char (continuing code unit).
        let mut buf = [0u8; 2];
        let error = decode_to_str(&data, &mut buf);
        assert_eq!(
            error,
            Err(DecodeError {
                error_range: (0, 3),
                output_bytes_written: 0,
            })
        );
    }

    #[test]
    fn decode_error_02() {
        let data = [
            0xE3, 0x81, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF,
            0xEF, 0xBC, 0x81,
        ]; // "こんにちは!" with an error on the first code point (too few continuing code units).
        let mut buf = [0u8; 2];
        let error = decode_to_str(&data, &mut buf);
        assert_eq!(
            error,
            Err(DecodeError {
                error_range: (0, 2),
                output_bytes_written: 0,
            })
        );
    }

    #[test]
    fn decode_error_03() {
        let data = [
            0xE3, 0x81, 0x93, 0b10000000, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3,
            0x81, 0xAF, 0xEF, 0xBC, 0x81,
        ]; // "こんにちは!" with an error on the second code point (continuing code unit).
        let mut buf = [0u8; 64];
        let error = decode_to_str(&data, &mut buf);
        assert_eq!(
            error,
            Err(DecodeError {
                error_range: (3, 6),
                output_bytes_written: 3,
            })
        );
    }

    #[test]
    fn decode_error_04() {
        let data = [
            0xE3, 0x81, 0x93, 0b10000000, 0x82, 0x93, 0b10000000, 0x81, 0xAB, 0b10000000, 0x81,
            0xA1, 0xE3, 0x81, 0xAF, 0xEF, 0xBC, 0x81,
        ]; // "こんにちは!" with an error on the second code point (lots of continuing code units).
        let mut buf = [0u8; 64];
        let error = decode_to_str(&data, &mut buf);
        assert_eq!(
            error,
            Err(DecodeError {
                error_range: (3, 12),
                output_bytes_written: 3,
            })
        );
    }

    #[test]
    fn decode_error_05() {
        let data = [
            0xE3, 0x81, 0x93, 0b11111000, 0x82, 0x93, 0x93, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81,
            0xA1, 0xE3, 0x81, 0xAF, 0xEF, 0xBC, 0x81,
        ]; // "こんにちは!" with an error on the second code point (invalid bit pattern).
        let mut buf = [0u8; 64];
        let error = decode_to_str(&data, &mut buf);
        assert_eq!(
            error,
            Err(DecodeError {
                error_range: (3, 8),
                output_bytes_written: 3,
            })
        );
    }

    #[test]
    fn decode_error_06() {
        let data = [
            0xE3, 0x81, 0x93, 0xED, 0xA0, 0x80, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81,
            0xAF, 0xEF, 0xBC, 0x81,
        ]; // "こんにちは!" with an error on the second code point (beginning of surrogate range).
        let mut buf = [0u8; 64];
        let error = decode_to_str(&data, &mut buf);
        assert_eq!(
            error,
            Err(DecodeError {
                error_range: (3, 6),
                output_bytes_written: 3,
            })
        );
    }

    #[test]
    fn decode_error_07() {
        let data = [
            0xE3, 0x81, 0x93, 0xED, 0xBF, 0xBF, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81,
            0xAF, 0xEF, 0xBC, 0x81,
        ]; // "こんにちは!" with an error on the second code point (end of surrogate range).
        let mut buf = [0u8; 64];
        let error = decode_to_str(&data, &mut buf);
        assert_eq!(
            error,
            Err(DecodeError {
                error_range: (3, 6),
                output_bytes_written: 3,
            })
        );
    }

    #[test]
    fn decode_error_08() {
        let data = [
            0xE3, 0x81, 0x93, 0xF4, 0x90, 0x80, 0x80, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3,
            0x81, 0xAF, 0xEF, 0xBC, 0x81,
        ]; // "こんにちは!" with an error on the second code point (out of unicode range).
        let mut buf = [0u8; 64];
        let error = decode_to_str(&data, &mut buf);
        assert_eq!(
            error,
            Err(DecodeError {
                error_range: (3, 7),
                output_bytes_written: 3,
            })
        );
    }

    #[test]
    fn decode_error_09() {
        let data = [
            0xE3, 0x81, 0x93, 0xC0, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF,
            0xEF, 0xBC, 0x81,
        ]; // "こんにちは!" with an error on the second code point (byte == 0xC0).
        let mut buf = [0u8; 64];
        let error = decode_to_str(&data, &mut buf);
        assert_eq!(
            error,
            Err(DecodeError {
                error_range: (3, 5),
                output_bytes_written: 3,
            })
        );
    }

    #[test]
    fn decode_error_10() {
        let data = [
            0xE3, 0x81, 0x93, 0xC1, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF,
            0xEF, 0xBC, 0x81,
        ]; // "こんにちは!" with an error on the second code point (byte == 0xC1).
        let mut buf = [0u8; 64];
        let error = decode_to_str(&data, &mut buf);
        assert_eq!(
            error,
            Err(DecodeError {
                error_range: (3, 5),
                output_bytes_written: 3,
            })
        );
    }
}